Reconcile Gene Aggregation Feature with Memory Footprint Overhaul #99
@@ -1,8 +1,9 @@
Package: sleuth
Title: Tools for investigating RNA-Seq
Version: 0.28.0
Version: 0.28.1
Authors@R: c(person("Harold", "Pimentel", , "[email protected]", role = c("aut", "cre")))
Description: Investigate transcript abundance from "kallisto" and differential expression analysis from RNA-Seq data.
Description: Investigate transcript abundance from "kallisto" and differential
    expression analysis from RNA-Seq data.
License: GPL-3
LazyData: true
URL: https://github.com/pachterlab/sleuth

@@ -17,11 +18,14 @@ Imports:
    tidyr,
    reshape2,
    rhdf5,
    parallel,
    lazyeval,
    matrixStats,
    shiny
Suggests:
    MASS,
    lintr,
    testthat,
    knitr
VignetteBuilder: knitr
RoxygenNote: 5.0.1
@@ -145,7 +145,7 @@ aggregate_bootstrap <- function(kal, mapping, split_by = "gene_id",

  if ( any(!complete.cases(mapping)) ) {
    warning("Found some NAs in mapping. Removing them.")
    mapping <- mapping[complete.cases(mapping),]
    mapping <- mapping[complete.cases(mapping), ]
  }

  m_bs <- melt_bootstrap(kal, column)

@@ -316,8 +316,8 @@ sample_bootstrap <- function(obj, n_samples = 100L) {
  # matrix sample
  for (s in 1:n_samples) {
    for (idx in 1:nrow(which_samp)) {
      b <- which_samp[idx,s]
      sample_mat[[s]][,idx] <- obj$kal[[idx]]$bootstrap[[b]]$est_counts
      b <- which_samp[idx, s]
      sample_mat[[s]][, idx] <- obj$kal[[idx]]$bootstrap[[b]]$est_counts
    }
  }

@@ -359,9 +359,143 @@ dcast_bootstrap.kallisto <- function(obj, units, nsamples = NULL) {
  mat <- matrix(NA_real_, nrow = n_features, ncol = length(which_bs))

  for (j in seq_along(which_bs)) {
    mat[ ,j] <- obj[[ "bootstrap" ]][[which_bs[j]]][[ units ]]
    mat[, j] <- obj[[ "bootstrap" ]][[which_bs[j]]][[ units ]]
  }
  rownames(mat) <- obj[["bootstrap"]][[1]][["target_id"]]

  mat
}

# Function to process bootstraps for parallelization
process_bootstrap <- function(i, samp_name, kal_path,
                              num_transcripts, est_count_sf,
                              read_bootstrap_tpm, gene_mode,
                              extra_bootstrap_summary,
                              target_id, mappings, which_ids,
                              aggregation_column, transform_fxn)
{
  dot(i)
  bs_quants <- list()

  num_bootstrap <- as.integer(rhdf5::h5read(kal_path$path,
                                            "aux/num_bootstrap"))
  if (num_bootstrap == 0) {
    stop(paste0("File ", kal_path, " has no bootstraps.",
                "Please generate bootstraps using \"kallisto quant -b\"."))
  }

  # TODO: only perform operations on filtered transcripts
  eff_len <- rhdf5::h5read(kal_path$path, "aux/eff_lengths")
  bs_mat <- read_bootstrap_mat(fname = kal_path$path,
                               num_bootstraps = num_bootstrap,
                               num_transcripts = num_transcripts,
                               est_count_sf = est_count_sf)

  if (read_bootstrap_tpm) {
    bs_quant_tpm <- aperm(apply(bs_mat, 1, counts_to_tpm,
                                eff_len))

    # gene level code is analogous here to below code
    if (gene_mode) {
      colnames(bs_quant_tpm) <- target_id
      # Make bootstrap_num an explicit column; each is treated as a "sample"
      bs_tpm_df <- data.frame(bootstrap_num = c(1:num_bootstrap),
                              bs_quant_tpm, check.names = F)
      rm(bs_quant_tpm)
      # Make long tidy table; this step is much faster
      # using data.table melt rather than tidyr gather
      tidy_tpm <- data.table::melt(bs_tpm_df, id.vars = "bootstrap_num",
                                   variable.name = "target_id",
                                   value.name = "tpm")
      tidy_tpm <- data.table::as.data.table(tidy_tpm)
      rm(bs_tpm_df)
      tidy_tpm$target_id <- as.character(tidy_tpm$target_id)
      tidy_tpm <- merge(tidy_tpm, mappings, by = "target_id",
                        all.x = T)
      # Data.table dcast uses non-standard evaluation
      # So quote the full casting formula to make sure
      # "aggregation_column" is interpreted as a variable
      # see: http://stackoverflow.com/a/31295592
      quant_tpm_formula <- paste("bootstrap_num ~",
                                 aggregation_column)
      bs_quant_tpm <- data.table::dcast(tidy_tpm,
                                        quant_tpm_formula, value.var = "tpm",
                                        fun.aggregate = sum)
      bs_quant_tpm <- as.matrix(bs_quant_tpm[, -1])
      rm(tidy_tpm) # these tables are very large
    }
    bs_quant_tpm <- aperm(apply(bs_quant_tpm, 2,
                                quantile))
    colnames(bs_quant_tpm) <- c("min", "lower", "mid",
                                "upper", "max")
    ret$bs_quants[[samp_name]]$tpm <- bs_quant_tpm
  }

  if (gene_mode) {
    # I can combine target_id and eff_len
    # I assume the order is the same, since it's read from the same kallisto
    # file and each kallisto file has the same order
    eff_len_df <- data.frame(target_id, eff_len,
                             stringsAsFactors = F)
    # make bootstrap number an explicit column to facilitate melting
    bs_df <- data.frame(bootstrap_num = c(1:num_bootstrap),
                        bs_mat, check.names = F)
    rm(bs_mat)
    # data.table melt function is much faster than tidyr's gather function
    # output is a long table with each bootstrap's value for each target_id
    tidy_bs <- data.table::melt(bs_df, id.vars = "bootstrap_num",
                                variable.name = "target_id",
                                value.name = "est_counts")
    rm(bs_df)
    # not sure why, but the melt function always returns a factor,
    # even when setting variable.factor = F, so I coerce target_id
    tidy_bs$target_id <- as.character(tidy_bs$target_id)
    # combine the long tidy table with eff_len and aggregation mappings
    # note that bootstrap number is treated as "sample" here
    # for backwards compatibility
    tidy_bs <- dplyr::select(tidy_bs, target_id,
                             est_counts, sample = bootstrap_num)
    tidy_bs <- merge(data.table::as.data.table(tidy_bs),
                     data.table::as.data.table(eff_len_df), by = "target_id",
                     all.x = T)
    tidy_bs <- merge(tidy_bs, mappings, by = "target_id",
                     all.x = T)
    # create the median effective length scaling factor for each gene
    scale_factor <- tidy_bs[, scale_factor := median(eff_len),
                            by = eval(parse(text=aggregation_column))]
    # use the old reads_per_base_transform method to get gene scaled counts
    scaled_bs <- reads_per_base_transform(tidy_bs,
                                          scale_factor$scale_factor,
                                          aggregation_column,
                                          mappings)
    # this step undoes the tidying to get back a matrix format
    # target_ids here are now the aggregation column ids
    bs_mat <- data.table::dcast(scaled_bs, sample ~ target_id,
                                value.var = "scaled_reads_per_base")
    # this now has the same format as the transcript matrix
    # but it uses gene ids
    bs_mat <- as.matrix(bs_mat[, -1])
    rm(tidy_bs, scaled_bs)
  }

  if (extra_bootstrap_summary) {
    bs_quant_est_counts <- aperm(apply(bs_mat, 2,
                                       quantile))

[Review comment] Look into whether or not there is a faster matrixStats function. I think I tested the one that exists there and this is actually faster; will check soon. (See the matrixStats sketch after this diff.)

    colnames(bs_quant_est_counts) <- c("min", "lower",
                                       "mid", "upper", "max")
    ret$bs_quants[[samp_name]]$est_counts <- bs_quant_est_counts
  }

  bs_mat <- transform_fxn(bs_mat)

[Review comment] change to
[Reply] done

  # If bs_mat was made at gene-level, already has column names
  # If at transcript-level, need to add target_ids
  if(!gene_mode) {
    colnames(bs_mat) <- target_id
  }
  # all_sample_bootstrap[, i] bootstrap point estimate of the inferential
  # variability in sample i
  # NOTE: we are only keeping the ones that pass the filter
  bootstrap_result <- matrixStats::colVars(bs_mat[, which_ids])

  list(index = i, bs_quants = bs_quants, bootstrap_result = bootstrap_result)
}
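
On the review comment above about a faster matrixStats routine: the snippet below is only a sketch of what a matrixStats-based version of the `apply()`/`quantile()` summary might look like; it is not part of this PR. The `summarize_bootstraps` helper and the toy matrix are invented for illustration, and whether `matrixStats::colQuantiles` is actually faster on these matrices would still need benchmarking.

```r
# Sketch only -- not part of this PR. matrixStats::colQuantiles() defaults to
# probs = c(0, 0.25, 0.5, 0.75, 1) with quantile type 7, the same defaults as
# stats::quantile(), so the values should match aperm(apply(bs_mat, 2, quantile)).

summarize_bootstraps <- function(bs_mat) {
  # one row per feature (column of bs_mat), one column per quantile
  bs_quant <- matrixStats::colQuantiles(bs_mat)
  colnames(bs_quant) <- c("min", "lower", "mid", "upper", "max")
  bs_quant
}

# Toy check: 30 bootstraps x 4 features (invented data)
set.seed(42)
toy <- matrix(rpois(120, lambda = 50), nrow = 30, ncol = 4,
              dimnames = list(NULL, paste0("t", 1:4)))
all.equal(unname(summarize_bootstraps(toy)),
          unname(aperm(apply(toy, 2, quantile))))  # should be TRUE
```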
@@ -0,0 +1,14 @@
propagate_transcript_filter <- function(filter_df, target_mapping,
                                        grouping_column) {

  filtered_target_mapping <- dplyr::inner_join(as.data.table(filter_df), # nolint
    as.data.table(target_mapping), by = 'target_id') # nolint

  filtered_target_mapping <- dplyr::select_(filtered_target_mapping,
                                            grouping_column)

  data.table::setnames(filtered_target_mapping, grouping_column, 'target_id')
  filtered_target_mapping <- dplyr::distinct(filtered_target_mapping)

  filtered_target_mapping
}
[Review comment] Make all explicit by using the scope operator.
[Reply] done
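
For context on the helper added in this file (the comment about the scope operator refers to qualifying the dplyr and data.table calls with `::`), here is a small usage sketch. It is not part of the PR: the input data frames and the `ens_gene` grouping column are invented, and the call assumes `propagate_transcript_filter()` as defined above is available in the session.

```r
# Usage sketch only -- not part of this PR. Assumes propagate_transcript_filter()
# from the new file above is defined in the session (it is an internal helper,
# not exported), and that data.table is installed so as.data.table() resolves.
library(data.table)

# Invented example inputs: transcripts that pass the sleuth filter, and a
# transcript-to-gene mapping with a hypothetical 'ens_gene' grouping column.
filter_df <- data.frame(target_id = c("tx1", "tx3"),
                        stringsAsFactors = FALSE)
target_mapping <- data.frame(target_id = c("tx1", "tx2", "tx3", "tx4"),
                             ens_gene = c("geneA", "geneA", "geneB", "geneC"),
                             stringsAsFactors = FALSE)

# Lifts the transcript-level filter to the gene level: keeps every gene with at
# least one passing transcript and renames the grouping column to 'target_id'.
# Expected to return a one-column table containing "geneA" and "geneB".
propagate_transcript_filter(filter_df, target_mapping, "ens_gene")
```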