forked from broadinstitute/imaging_metric_comparison
-
Notifications
You must be signed in to change notification settings - Fork 0
/
hit_selection_jaccard_function.R
165 lines (128 loc) · 5.02 KB
/
hit_selection_jaccard_function.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# all usefull libraries
library(magrittr)
library(dplyr)
library(foreach)
library(doMC)
library(stringr)
library(tidyverse)
#' Function that perform Hit Selection.
#' Hit selection is performed based on the distance defined following the Jaccard distance, which calculate the dissimilarires between sample sets.
#' The distance is defined as the mean of the distance between 4 sets.
#' A) n.feat top features of component x
#' B) n.feat top features of component y
#' C) n.feat bottom features of component x
#' D) n.feat bottom features of component y
#' $dist(x,y) = \frac{dist_J(A,B) + dist_J(C,D)}{2}$
#' $dist_J(i,j) = \frac{|A \cup B| - |A \cap B|}{|A \cup B|}$
#'
#' @param df dataframe in '.rds'
#' @param filename name of the data file
#' @param n.feat number of features in the sets
#' @param n.replicate number of replicate for the basline (random comparison)
#' @param N number of data to make the non replicate distance distribution
#' @param seed seed number for reproductibility
#' @param nCPU number of CPU cores for parallelization
#' @return hit ratio
hit_selection_jaccard <- function(pf,
filename = "Jaccard",
n.feat = 50,
n.replicate = 4,
N = 5000,
seed = 42,
nCPU = 7,
dir.save = "BBBC022_2013/selected_single_cells",
dir.save.plus = "/hit_selected/Jaccard/"){
message(paste('Running Jaccard Hit selection...', filename))
start.time <- Sys.time()
# number of CPU cores for parallelization
registerDoMC(nCPU)
# seed for the reproducibility
set.seed(seed)
# Variables
variables <-
names(pf) %>% str_subset("^Cells_|^Cytoplasm_|^Nuclei_")
## Separation of data
# find the different compounds
IDs <- unique(pf$Metadata_broad_sample)
## Distance of the data
# load the c++ function
Rcpp::sourceCpp('jaccard_distance_function.cpp',rebuild = FALSE)
# loop over all IDs and save the median of the distance
comp.dist.median <- foreach(i = 1:length(IDs), .combine=cbind) %dopar% {
#filtering to choose only for one compound
comp <-
filter(pf, Metadata_broad_sample %in% IDs[i])
comp %<>%
select(one_of(variables)) %>%
as.matrix()
# distance of the features
comp.dist <- vecJaccardDistance(comp, n.feat)
# save median of the distance
comp.dist.median <-
median(comp.dist, na.rm=TRUE)
}
# hist(comp.dist.median,
# main="Histogram for Median Replicate Distance",
# xlab="Median Replicate Distance",
# xlim = range(0, 1))
## Thresholding of poor replicate distance
# random sequence for reproducibility
a <- sample(1:10000, N, replace=F)
# loop over N times to get a distribution
random.replicate.dist.median <- foreach(i = 1:N, .combine=cbind) %dopar% {
# set seed according to random sequence
set.seed(a[i])
# group by IDs
# sample fixed number per group -> choose 4 replicates randomly from different group
random.replicate <-
pf %>%
group_by(Metadata_broad_sample) %>%
sample_n(1, replace = FALSE) %>%
ungroup(random.replicate)
random.replicate <- sample_n(random.replicate, n.replicate, replace = FALSE)
comp <- random.replicate[,variables] %>%
as.matrix()
# distance of the features
comp.dist <- vecJaccardDistance(comp, n.feat)
# median of the non replicate distance
random.replicate.dist.median <- median(comp.dist,na.rm=TRUE)
}
# histogram plot
#hist(random.replicate.dist.median,
# main="Histogram for Non Replicate Median Distance",
# xlab="Non Replicate Median Distance",
# xlim = range(0, 1))
# threshold to determine if can reject H0
thres <- quantile(random.replicate.dist.median, .05)
## Hit Selection
# find indices of replicate median distance < threshold
inds <- which(comp.dist.median < thres)
# find values of the median that are hit selected
hit.select <- comp.dist.median[inds]
# find component that are hit selected
hit.select.IDs <- IDs[inds]
# ratio of strong median replicate correlation
hit.ratio <- length(hit.select)/length(comp.dist.median)
## Saving data
# select high median correlation replicate
pf %<>%
filter(Metadata_broad_sample %in% hit.select.IDs)
# save new dataset
filename.save <- paste("../../input/",
dir.save,
dir.save.plus,
strsplit(filename, ".rds"),
"_",
toString(n.feat),
"n_seed_",
seed,
".rds",
sep = "")
#### uncomment if want to save file
pf %>%
saveRDS(filename.save)
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
return(hit.ratio)
}