-
Notifications
You must be signed in to change notification settings - Fork 1
/
example.R
140 lines (115 loc) · 4.52 KB
/
example.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# funs --------------------------------------------------------------------
library("data.table")
pe <- new.env()
source("x_pattern_extraction_funs.R", local = pe, encoding = "UTF-8")
ut <- new.env()
source("x_util_funs.R", local = ut, encoding = "UTF-8")
ge <- new.env()
source("x_gleason_extraction_funs.R", local = ge, encoding = "UTF-8")
cf <- new.env()
source("x_confusion_funs.R", local = cf, encoding = "UTF-8")
# manually extracted Gleason scores ---------------------------------------
man_dt <- ge$example_typed_format_dataset
stopifnot(
c("text_id", "text", "a", "b", "t", "c") %in% names(man_dt)
)
# extraction --------------------------------------------------------------
fcr_dt <- man_dt[
i = !duplicated(text_id),
j = ge$extract_gleason_scores(texts = ge$prepare_text(text),
text_ids = text_id,
format = "standard",
pattern_dt = ge$fcr_pattern_dt)
]
fcr_add_dt <- man_dt[
i = !duplicated(text_id),
j = ge$extract_gleason_scores(texts = ge$prepare_text(text),
text_ids = text_id,
format = "standard",
pattern_dt = ge$fcr_add_pattern_dt)
]
# confusion ---------------------------------------------------------------
all_text_ids <- sort(unique(man_dt[["text_id"]]))
man_dt[, "obs_id" := 1:.N]
man_dt[, "match_type" := ut$infer_match_type(a, b, t, c)]
data.table::setkeyv(man_dt, c("text_id", "obs_id"))
man_dt <- ut$typed_format_dt_to_standard_format_dt(man_dt)
man_dt[, "implied_c" := data.table::fifelse(!is.na(a) & !is.na(b), a + b, c)]
man_dt <- man_dt[
(a %in% c(3:5, NA)) & (b %in% c(3:5, NA)) & (c %in% c(6:10, NA)) & (implied_c %in% c(6:10, NA))
]
man_dt[, "value" := ge$minimum_sufficient_value_strings(a, b, c)]
man_dt <- man_dt[!is.na(implied_c), ]
fcr_dt[, "implied_c" := data.table::fifelse(!is.na(a) & !is.na(b), a + b, c)]
fcr_dt <- fcr_dt[
(a %in% c(3:5, NA)) & (b %in% c(3:5, NA)) & (c %in% c(6:10, NA)) & (implied_c %in% c(6:10, NA))
]
fcr_dt[, "value" := ge$minimum_sufficient_value_strings(a, b, c)]
fcr_dt <- fcr_dt[!is.na(implied_c), ]
fcr_add_dt[, "implied_c" := data.table::fifelse(!is.na(a) & !is.na(b), a + b, c)]
fcr_add_dt <- fcr_add_dt[
(a %in% c(3:5, NA)) & (b %in% c(3:5, NA)) & (c %in% c(6:10, NA)) & (implied_c %in% c(6:10, NA))
]
fcr_add_dt[, "value" := ge$minimum_sufficient_value_strings(a, b, c)]
fcr_add_dt <- fcr_add_dt[!is.na(implied_c), ]
manual <- ge$value_string_vectors_by_text_id(
value_strings = man_dt[["value"]],
text_ids = man_dt[["text_id"]],
all_text_ids = all_text_ids
)
fcr <- ge$value_string_vectors_by_text_id(
value_strings = fcr_dt[["value"]],
text_ids = fcr_dt[["text_id"]],
all_text_ids = all_text_ids
)
fcr_add <- ge$value_string_vectors_by_text_id(
value_strings = fcr_add_dt[["value"]],
text_ids = fcr_add_dt[["text_id"]],
all_text_ids = all_text_ids
)
match_number_dt <- ge$match_number_dt(
match_list_list = mget(c("manual", "fcr", "fcr_add"))
)
print(match_number_dt)
match_type_number_dt <- ge$match_type_number_dt(
match_list_list = mget(c("manual", "fcr", "fcr_add"))
)
print(match_type_number_dt)
diff_dt <- ge$value_string_groupdiff_dt(
estimated = fcr, known = manual, text_ids = all_text_ids
)
message("found by fcr not found manually")
print(diff_dt[vapply(estimated_not_known, length, integer(1L)) > 0L, .(text_id, known, estimated, estimated_not_known)])
message("found manually not found by fcr")
print(diff_dt[vapply(known_not_estimated, length, integer(1L)) > 0L, .(text_id, known, estimated, known_not_estimated)])
confusion <- ut$bootstrap(
x = data.table::as.data.table(mget(c("manual", "fcr", "fcr_add"))),
statistics_fun = function(x, i) {
stat_nms <- c("sensitivity", "specificity", "accuracy", "F1")
fcr_add_conf <- cf$hit_confusion(
estimated = x[["fcr_add"]][i],
known = x[["manual"]][i]
)[stat_nms]
fcr_conf <- cf$hit_confusion(
estimated = x[["fcr"]][i],
known = x[["manual"]][i]
)[stat_nms]
100L * c(fcr_conf, fcr_add_conf, fcr_conf - fcr_add_conf)
},
statistics_id_dt = data.table::CJ(
set = c("fcr", "fcr_add", "fcr - fcr_add"),
statistic = c("sensitivity", "specificity", "accuracy", "F1"),
sorted = FALSE, unique = FALSE
),
n_bootstrap_samples = 1e2L,
n_threads = 4L
)
confusion_r2 <- data.table::copy(confusion)
confusion_r2[, names(confusion_r2) := lapply(.SD, function(col) {
if (is.numeric(col)) {
round(col)
} else {
col
}
})]
print(confusion_r2)