-
Notifications
You must be signed in to change notification settings - Fork 1
/
BBBC022_fs_sc_SR.R
69 lines (51 loc) · 2.25 KB
/
BBBC022_fs_sc_SR.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
library(magrittr)
library(dplyr)
library(tidyverse)
library(stringr)
# import data
pf.sc <-
readRDS(file.path("../../input/BBBC022_2013/single_cells/BBBC022_sampled_10000.rds")) %>%
# select(one_of(variables.fs.sc.fc)) %>%
scale() %>%
as.data.frame() #20,491x1777 -> no: 20,419x815
pf <- readRDS(file.path("../../input/BBBC022_2013/old/Pf_Gustafsdottir.rds"))
# difference of features name between old and new version:
# Hoechst -> DNA, Ph_golgi -> AGP, Syto -> RNA
pf$feat_cols <- str_replace(pf$feat_cols, "Hoechst", "DNA")
pf$feat_cols <- str_replace(pf$feat_cols, "Ph_golgi", "AGP")
pf$feat_cols <- str_replace(pf$feat_cols, "Syto", "RNA")
cl <- colnames(pf$data)
cl <- str_replace(cl, "Hoechst", "DNA")
cl <- str_replace(cl, "Ph_golgi", "AGP")
cl <- str_replace(cl, "Syto", "RNA")
colnames(pf$data) <- cl
profiles <- pf.sc[ , colSums(is.na(pf.sc)) == 0] # remove features (columns) where there are NA
feat.delete <- setdiff(colnames(profiles), pf$feat_cols) # features that are in profiles but not in pf
profiles <- profiles[, !names(profiles) %in% feat.delete] # remove features
start.time <- Sys.time()
# transpose the dataset to have featxobs (mxn)
X <- profiles %>% as.matrix() %>% t(.)
A <- tcrossprod(X, X)
print("cross product ok")
# load the c++ function
Rcpp::sourceCpp('ranking_SVD_entropy.cpp')
print("cpp function ok")
# SR return the entropy for each feature.
feat.idx <- CE_entropy_SR(A)
end.time <- Sys.time()
time.taken <- end.time - start.time
time.taken
save(feat.idx, file = "../../input/BBBC022_2013/single_cells/SR/entropy_feat_SR_repurposing_sc.RData")
print("entropy saved")
# SR is giving the entropy -> should link entropy to features name and sort them!
# then with this features vector, can select the best top n and see whether it improves or not
##############################
x <- data.frame(feat.name = colnames(profiles), entropy = feat.idx)
x %<>% arrange(entropy)
## can add condition to select only entropy that is positive -> then no need to do the loop.
n.feats <- seq(from = 100, to = 800, by = 100)
for(n in n.feats){
# names of the features to keep
feat.names.CEi <- x$feat[1:n]
save(feat.names.CEi, file=paste("../../input/BBBC022_2013/single_cells/SR/feat_selected/SVD_SR_", as.character(n), "feat.Rda", sep=""))
}