-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSCENIC_S2_Regulons.R
228 lines (187 loc) · 11 KB
/
SCENIC_S2_Regulons.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
source("/BIGDATA1/pku_hkdeng_1/R/R-3.4.2/profile.R")
library(data.table)
library(Biobase)
library(AUCell)
library(SCENIC)
suppressWarnings(library(NMF, verbose=FALSE, warn.conflicts=FALSE, quietly=TRUE))
# To build a personalized report, update this working directory:
setwd("SCENIC_Mouse") # Or in the first chunk if running a notebook
## ----loadTFmodules, eval=TRUE--------------------------------------------
load("int/1.7_tfModules_withCorr.RData")
## ----chooseOrg-----------------------------------------------------------
library(RcisTarget.mm9.motifDatabases.20k)
# Get genes in databases:
data(mm9_500bpUpstream_motifRanking) # or 10kbp, they should have the same genes
genesInDatabase <- mm9_500bpUpstream_motifRanking@rankings$rn
# Get TFS in databases:
data(mm9_direct_motifAnnotation)
allTFs <- mm9_direct_motifAnnotation$allTFs
# Motif rankings (genes x motifs)
data(mm9_10kbpAroundTss_motifRanking)
motifRankings <- list()
motifRankings[["500bp"]] <- mm9_500bpUpstream_motifRanking
motifRankings[["10kbp"]] <- mm9_10kbpAroundTss_motifRanking
# Motif annotation (TFs)
direct_motifAnnotation <- mm9_direct_motifAnnotation
data(mm9_inferred_motifAnnotation) # optional
inferred_motifAnnotation <- mm9_inferred_motifAnnotation
# Remove genes missing from RcisTarget databases
# (In case the input matrix wasn't already filtered)
tfModules_withCorr <- tfModules_withCorr[which(as.character(tfModules_withCorr$TF) %in% allTFs),]
geneInDb <- tfModules_withCorr$Target %in% motifRankings[["500bp"]]@rankings$rn
# Genes in co-expression modules not available in RcisTargetDatabases:
missingGenes <- sort(unique(tfModules_withCorr[which(!geneInDb),"Target"]))
missingGenes
tfModules_withCorr <- tfModules_withCorr[which(geneInDb),]
# Targets with positive correlation
tfModules_Selected <- tfModules_withCorr[which(tfModules_withCorr$corr==1),]
# Add a column with the geneSet name (TF_method)
tfModules_Selected <- cbind(tfModules_Selected, geneSetName=paste(tfModules_Selected$TF, tfModules_Selected$method, sep="_"))
head(tfModules_Selected)
# Split into tfModules (TF-modules, with several methods)
tfModules <- split(tfModules_Selected$Target, tfModules_Selected$geneSetName)
# Keep gene sets with at least 20 genes
tfModules <- tfModules[which(lengths(tfModules)>=20)]
# Add TF to the gene set (used in the following steps, careful if editing)
tfModules <- setNames(lapply(names(tfModules), function(gsn) {
tf <- strsplit(gsn, "_")[[1]][1]
unique(c(tf, tfModules[[gsn]]))
}), names(tfModules))
save(tfModules, file="int/2.1_tfModules_forMotifEnrichmet.RData")
## ----statsTFmodules------------------------------------------------------
load("int/2.1_tfModules_forMotifEnrichmet.RData")
tfModulesSummary <- t(sapply(strsplit(names(tfModules), "_"), function(x) x[1:2]))
sort(table(tfModulesSummary[,2]))
## ----RcisTarget, eval=TRUE-----------------------------------------------
library(RcisTarget)
################################################################
# 1. Calculate motif enrichment for each TF-module
### 1.1 Calculate enrichment
motifs_AUC <- lapply(motifRankings, function(ranking) calcAUC(tfModules, ranking, aucMaxRank=0.01*nrow(ranking@rankings), nCores=8 * parallel::detectCores(logical = FALSE) - 2, verbose=FALSE))
save(motifs_AUC, file="int/2.2_motifs_AUC.RData") # renamed from: 2.2_motifs_AUC_500bp_10kbp.RData
# load(file="int/2.2_motifs_AUC.RData")
### 1.2 Conver to table, filter by NES & add the TFs to which the motif is annotated
# (For each database...)
motifEnrichment <- lapply(motifs_AUC, function(aucOutput)
{
# Extract the TF of the gene-set name (i.e. MITF_w001):
tf <- sapply(setNames(strsplit(rownames(aucOutput), "_"), rownames(aucOutput)), function(x) x[[1]])
# Calculate NES and add motif annotation (provide tf in 'highlightTFs'):
addMotifAnnotation(aucOutput, highlightTFs=tf, nesThreshold=3.0, digits=3,
motifAnnot_direct=direct_motifAnnotation,
motifAnnot_inferred=inferred_motifAnnotation)
})
# Merge both tables, adding a column that contains the 'motifDb'
motifEnrichment <- do.call(rbind, lapply(names(motifEnrichment), function(dbName){
cbind(motifDb=dbName, motifEnrichment[[dbName]])
}))
save(motifEnrichment, file="int/2.3_motifEnrichment.RData")
cat("Number of motifs in the initial enrichment: ", nrow(motifEnrichment))
### 1.3 Keep only the motifs annotated to the initial TF
motifEnrichment_selfMotifs <- motifEnrichment[which(motifEnrichment$TFinDB != ""),, drop=FALSE]
save(motifEnrichment_selfMotifs, file="int/2.4_motifEnrichment_selfMotifs.RData")
cat("Number of motifs annotated to the initial TF: ", nrow(motifEnrichment_selfMotifs))
rm(motifEnrichment)
################################################################
# 2. Prune targets
library(data.table)
motifEnrichment_selfMotifs_wGenes <- lapply(names(motifRankings), function(motifDbName){
addSignificantGenes(resultsTable=motifEnrichment_selfMotifs[motifDb==motifDbName],
geneSets=tfModules,
rankings=motifRankings[[motifDbName]],
maxRank=5000, method="aprox", nCores=8 * parallel::detectCores(logical = FALSE) - 2)
})
motifEnrichment_selfMotifs_wGenes <- rbindlist(motifEnrichment_selfMotifs_wGenes)
save(motifEnrichment_selfMotifs_wGenes, file="int/2.5_motifEnrichment_selfMotifs_wGenes.RData")
# Save as text:
write.table(motifEnrichment_selfMotifs_wGenes, file="output/Step2_MotifEnrichment.tsv",
sep="\t", quote=FALSE, row.names=FALSE)
## ----showSelfMotifs------------------------------------------------------
load("int/2.5_motifEnrichment_selfMotifs_wGenes.RData")
dim(motifEnrichment_selfMotifs_wGenes)
motifEnrichment_selfMotifs_wGenes[order(NES,decreasing=TRUE)][1:5,-"enrichedGenes", with=F]
## ----regulonTargetsInfo, eval=TRUE---------------------------------------
motifEnrichment.asIncidList <- apply(motifEnrichment_selfMotifs_wGenes, 1, function(oneMotifRow) {
genes <- strsplit(oneMotifRow["enrichedGenes"], ";")[[1]]
oneMotifRow <- data.frame(rbind(oneMotifRow), stringsAsFactors=FALSE)
data.frame(oneMotifRow[rep(1, length(genes)),c("NES", "motif", "highlightedTFs", "TFinDB")], genes, stringsAsFactors = FALSE)
})
motifEnrichment.asIncidList <- rbindlist(motifEnrichment.asIncidList)
colnames(motifEnrichment.asIncidList) <- c("NES", "motif", "TF", "annot", "gene")
motifEnrichment.asIncidList <- data.frame(motifEnrichment.asIncidList, stringsAsFactors = FALSE)
# Get targets for each TF, but keep info about best motif/enrichment
# (directly annotated motifs are considered better)
regulonTargetsInfo <- lapply(split(motifEnrichment.asIncidList, motifEnrichment.asIncidList$TF), function(tfTargets){
# print(unique(tfTargets$TF))
tfTable <- as.data.frame(do.call(rbind, lapply(split(tfTargets, tfTargets$gene), function(enrOneGene){
directAnnot <- "**" %in% enrOneGene$annot
enrOneGeneByAnnot <- enrOneGene
if(directAnnot) enrOneGeneByAnnot <- enrOneGeneByAnnot[which(enrOneGene$annot == "**"),]
bestMotif <- which.max(enrOneGeneByAnnot$NES)
cbind(TF=unique(enrOneGene$TF), gene=unique(enrOneGene$gene), nMotifs=nrow(enrOneGene),
bestMotif=as.character(enrOneGeneByAnnot[bestMotif,"motif"]), NES=as.numeric(enrOneGeneByAnnot[bestMotif,"NES"]),
directAnnot=directAnnot)
})), stringsAsFactors=FALSE)
tfTable[order(tfTable$NES, decreasing = TRUE),]
})
regulonTargetsInfo <- rbindlist(regulonTargetsInfo)
colnames(regulonTargetsInfo) <- c("TF", "gene", "nMotifs", "bestMotif", "NES", "directAnnot")
# Optional: Add Genie3 score
load("int/1.5_GENIE3_linkList.RData")
linkList <- linkList[which(linkList$weight>=0.001),]
rownames(linkList) <- paste(linkList$TF, linkList$Target,sep="__")
regulonTargetsInfo <- cbind(regulonTargetsInfo, Genie3Weight=linkList[paste(regulonTargetsInfo$TF, regulonTargetsInfo$gene,sep="__"),"weight"])
save(regulonTargetsInfo, file="int/2.6_regulonTargetsInfo.RData")
write.table(regulonTargetsInfo, file="output/Step2_regulonTargetsInfo.txt", sep="\t", col.names=TRUE, row.names=FALSE, quote=FALSE)
## ----tfRegulons, eval=TRUE-----------------------------------------------
regulonTargetsInfo_splitByAnnot <- split(regulonTargetsInfo, regulonTargetsInfo$directAnnot)
regulons <- sapply(split(regulonTargetsInfo_splitByAnnot[["TRUE"]], regulonTargetsInfo_splitByAnnot[["TRUE"]][,"TF"]), function(x) sort(as.character(unlist(x[,"gene"]))))
regulons_extended <- sapply(split(regulonTargetsInfo_splitByAnnot[["FALSE"]],regulonTargetsInfo_splitByAnnot[["FALSE"]][,"TF"]), function(x) unname(x[,"gene"]))
regulons_extended <- sapply(names(regulons_extended), function(tf) sort(unique(c(regulons[[tf]], regulons_extended[[tf]]))))
names(regulons_extended) <- paste(names(regulons_extended), "_extended", sep="")
regulons <- c(regulons, regulons_extended)
save(regulons, file="int/2.6_regulons_asGeneSet.RData")
## ----tfRegulonInfo-------------------------------------------------------
load("int/2.6_regulons_asGeneSet.RData")
# Number of regulons and summary of sizes:
length(regulons)
summary(lengths(regulons))
## ----tfRegulons_faster, eval=FALSE---------------------------------------
## selfMotifs_byTF <- split(motifEnrichment_selfMotifs_wGenes, motifEnrichment_selfMotifs_wGenes$highlightedTFs)
## regulons <- lapply(selfMotifs_byTF,
## function(x) unique(unlist(strsplit(x$enrichedGenes, ";"))))
## save(regulons, file="int/2.6_regulons_B_asGeneSet.RData")
## ----incidMats-----------------------------------------------------------
incidList <- melt(regulons)
incidMat <- table(incidList[,2], incidList[,1])
save(incidMat, file="int/2.6_regulons_asIncidMat.RData")
dim(incidMat)
## ----selfRegTfs----------------------------------------------------------
table(sapply(names(regulons), function(x) x %in% regulons[[x]]))
# ## ----exampleTfMotifs-----------------------------------------------------
# selTF <- "Sox2"
# subsetTable <- motifEnrichment_selfMotifs_wGenes[highlightedTFs %in% selTF][order(NES,decreasing=TRUE)][,-"enrichedGenes", with=F]
#
# subsetTable <- addLogo(subsetTable)
# library(DT)
# datatable(subsetTable, escape=FALSE, filter="top", options=list(pageLength=5))
#
# ## ----showOneTfEnrichment-------------------------------------------------
# geneSetName <- "Sox2_top50"
# motifDbName <- "10kbp"
# selectedMotifs <- subsetTable[geneSet==geneSetName & motifDb==motifDbName, motif]
# selectedMotifs <- selectedMotifs[1:3]
## ----signifGenesPlot-----------------------------------------------------
# pdf("int/2.8_RCC_selectedMotifs.pdf")
# par(mfrow=c(2,2))
# signifGenes_SelectedMotifs <- getSignificantGenes(tfModules[[geneSetName]],
# motifRankings[[motifDbName]],
# signifRankingNames=selectedMotifs,
# plotCurve=TRUE, maxRank=5000, nCores=8 * parallel::detectCores(logical = FALSE) - 2,
# genesFormat="geneList", method="aprox")
# dev.off()
# # Motif & number of genes:
# cbind(lengths(signifGenes_SelectedMotifs$enrichedGenes))
## ----sessionInfo---------------------------------------------------------
date()
sessionInfo()