Reconcile Gene Aggregation Feature with Memory Footprint Overhaul #99
@@ -1,8 +1,9 @@
Package: sleuth
Title: Tools for investigating RNA-Seq
Version: 0.28.0
Version: 0.28.1
Authors@R: c(person("Harold", "Pimentel", , "[email protected]", role = c("aut", "cre")))
Description: Investigate transcript abundance from "kallisto" and differential expression analysis from RNA-Seq data.
Description: Investigate transcript abundance from "kallisto" and differential
    expression analysis from RNA-Seq data.
License: GPL-3
LazyData: true
URL: https://github.com/pachterlab/sleuth

@@ -17,11 +18,14 @@ Imports:
    tidyr,
    reshape2,
    rhdf5,
    parallel,
    lazyeval,
    matrixStats,
    shiny
Suggests:
    MASS,
    lintr,
    testthat,
    knitr
VignetteBuilder: knitr
RoxygenNote: 5.0.1
@@ -145,7 +145,7 @@ aggregate_bootstrap <- function(kal, mapping, split_by = "gene_id",

  if ( any(!complete.cases(mapping)) ) {
    warning("Found some NAs in mapping. Removing them.")
    mapping <- mapping[complete.cases(mapping),]
    mapping <- mapping[complete.cases(mapping), ]
  }

  m_bs <- melt_bootstrap(kal, column)

@@ -316,8 +316,8 @@ sample_bootstrap <- function(obj, n_samples = 100L) {
  # matrix sample
  for (s in 1:n_samples) {
    for (idx in 1:nrow(which_samp)) {
      b <- which_samp[idx,s]
      sample_mat[[s]][,idx] <- obj$kal[[idx]]$bootstrap[[b]]$est_counts
      b <- which_samp[idx, s]
      sample_mat[[s]][, idx] <- obj$kal[[idx]]$bootstrap[[b]]$est_counts
    }
  }

@@ -359,9 +359,143 @@ dcast_bootstrap.kallisto <- function(obj, units, nsamples = NULL) {
  mat <- matrix(NA_real_, nrow = n_features, ncol = length(which_bs))

  for (j in seq_along(which_bs)) {
    mat[ ,j] <- obj[[ "bootstrap" ]][[which_bs[j]]][[ units ]]
    mat[, j] <- obj[[ "bootstrap" ]][[which_bs[j]]][[ units ]]
  }
  rownames(mat) <- obj[["bootstrap"]][[1]][["target_id"]]

  mat
}

# Function to process bootstraps for parallelization
process_bootstrap <- function(i, samp_name, kal_path,
                              num_transcripts, est_count_sf,
                              read_bootstrap_tpm, gene_mode,
                              extra_bootstrap_summary,
                              target_id, mappings, which_ids,
                              aggregation_column, transform_fxn)
{
  dot(i)
  bs_quants <- list()

  num_bootstrap <- as.integer(rhdf5::h5read(kal_path$path,
                                            "aux/num_bootstrap"))
  if (num_bootstrap == 0) {
    stop(paste0("File ", kal_path, " has no bootstraps.",
                "Please generate bootstraps using \"kallisto quant -b\"."))
  }

  # TODO: only perform operations on filtered transcripts
  eff_len <- rhdf5::h5read(kal_path$path, "aux/eff_lengths")
  bs_mat <- read_bootstrap_mat(fname = kal_path$path,
                               num_bootstraps = num_bootstrap,
                               num_transcripts = num_transcripts,
                               est_count_sf = est_count_sf)

  if (read_bootstrap_tpm) {
    bs_quant_tpm <- aperm(apply(bs_mat, 1, counts_to_tpm,
                                eff_len))

    # gene level code is analogous here to below code
    if (gene_mode) {
      colnames(bs_quant_tpm) <- target_id
      # Make bootstrap_num an explicit column; each is treated as a "sample"
      bs_tpm_df <- data.frame(bootstrap_num = c(1:num_bootstrap),
                              bs_quant_tpm, check.names = F)
      rm(bs_quant_tpm)
      # Make long tidy table; this step is much faster
      # using data.table melt rather than tidyr gather
      tidy_tpm <- data.table::melt(bs_tpm_df, id.vars = "bootstrap_num",
                                   variable.name = "target_id",
                                   value.name = "tpm")
      tidy_tpm <- data.table::as.data.table(tidy_tpm)
      rm(bs_tpm_df)
      tidy_tpm$target_id <- as.character(tidy_tpm$target_id)
      tidy_tpm <- merge(tidy_tpm, mappings, by = "target_id",
                        all.x = T)
      # Data.table dcast uses non-standard evaluation
      # So quote the full casting formula to make sure
      # "aggregation_column" is interpreted as a variable
      # see: http://stackoverflow.com/a/31295592
      quant_tpm_formula <- paste("bootstrap_num ~",
                                 aggregation_column)
      bs_quant_tpm <- data.table::dcast(tidy_tpm,
                                        quant_tpm_formula, value.var = "tpm",
                                        fun.aggregate = sum)
      bs_quant_tpm <- as.matrix(bs_quant_tpm[, -1])
      rm(tidy_tpm) # these tables are very large
    }
    bs_quant_tpm <- aperm(apply(bs_quant_tpm, 2,
                                quantile))
    colnames(bs_quant_tpm) <- c("min", "lower", "mid",
                                "upper", "max")
    ret$bs_quants[[samp_name]]$tpm <- bs_quant_tpm
  }

  if (gene_mode) {
    # I can combine target_id and eff_len
    # I assume the order is the same, since it's read from the same kallisto
    # file and each kallisto file has the same order
    eff_len_df <- data.frame(target_id, eff_len,
                             stringsAsFactors = F)
    # make bootstrap number an explicit column to facilitate melting
    bs_df <- data.frame(bootstrap_num = c(1:num_bootstrap),
                        bs_mat, check.names = F)
    rm(bs_mat)
    # data.table melt function is much faster than tidyr's gather function
    # output is a long table with each bootstrap's value for each target_id
    tidy_bs <- data.table::melt(bs_df, id.vars = "bootstrap_num",
                                variable.name = "target_id",
                                value.name = "est_counts")
    rm(bs_df)
    # not sure why, but the melt function always returns a factor,
    # even when setting variable.factor = F, so I coerce target_id
    tidy_bs$target_id <- as.character(tidy_bs$target_id)
    # combine the long tidy table with eff_len and aggregation mappings
    # note that bootstrap number is treated as "sample" here
    # for backwards compatibility
    tidy_bs <- dplyr::select(tidy_bs, target_id,
                             est_counts, sample = bootstrap_num)
    tidy_bs <- merge(data.table::as.data.table(tidy_bs),
                     data.table::as.data.table(eff_len_df), by = "target_id",
                     all.x = T)
    tidy_bs <- merge(tidy_bs, mappings, by = "target_id",
                     all.x = T)
    # create the median effective length scaling factor for each gene
    scale_factor <- tidy_bs[, scale_factor := median(eff_len),
                            by = eval(parse(text=aggregation_column))]
    # use the old reads_per_base_transform method to get gene scaled counts
    scaled_bs <- reads_per_base_transform(tidy_bs,
                                          scale_factor$scale_factor,
                                          aggregation_column,
                                          mappings)
    # this step undoes the tidying to get back a matrix format
    # target_ids here are now the aggregation column ids
    bs_mat <- data.table::dcast(scaled_bs, sample ~ target_id,
                                value.var = "scaled_reads_per_base")
    # this now has the same format as the transcript matrix
    # but it uses gene ids
    bs_mat <- as.matrix(bs_mat[, -1])
    rm(tidy_bs, scaled_bs)
  }

  if (extra_bootstrap_summary) {
    bs_quant_est_counts <- aperm(apply(bs_mat, 2,
                                       quantile))

[Review comment] Look into whether or not there is a faster matrixStats function. I think I tested the one that exists there and this is actually faster; will check soon. (See the matrixStats sketch after this diff.)

    colnames(bs_quant_est_counts) <- c("min", "lower",
                                       "mid", "upper", "max")
    ret$bs_quants[[samp_name]]$est_counts <- bs_quant_est_counts
  }

  bs_mat <- transform_fxn(bs_mat)

[Review comment] change to
[Reply] done

  # If bs_mat was made at gene-level, already has column names
  # If at transcript-level, need to add target_ids
  if(!gene_mode) {
    colnames(bs_mat) <- target_id
  }
  # all_sample_bootstrap[, i] bootstrap point estimate of the inferential
  # variability in sample i
  # NOTE: we are only keeping the ones that pass the filter
  bootstrap_result <- matrixStats::colVars(bs_mat[, which_ids])

  list(index = i, bs_quants = bs_quants, bootstrap_result = bootstrap_result)
}
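
On the review comment above about a faster matrixStats routine: the snippet below is only a sketch of what a matrixStats-based version of the `apply()`/`quantile()` summary might look like; it is not part of this PR. The `summarize_bootstraps` helper and the toy matrix are invented for illustration, and whether `matrixStats::colQuantiles` is actually faster on these matrices would still need benchmarking.

```r
# Sketch only -- not part of this PR. matrixStats::colQuantiles() defaults to
# probs = c(0, 0.25, 0.5, 0.75, 1) with quantile type 7, the same defaults as
# stats::quantile(), so the values should match aperm(apply(bs_mat, 2, quantile)).

summarize_bootstraps <- function(bs_mat) {
  # one row per feature (column of bs_mat), one column per quantile
  bs_quant <- matrixStats::colQuantiles(bs_mat)
  colnames(bs_quant) <- c("min", "lower", "mid", "upper", "max")
  bs_quant
}

# Toy check: 30 bootstraps x 4 features (invented data)
set.seed(42)
toy <- matrix(rpois(120, lambda = 50), nrow = 30, ncol = 4,
              dimnames = list(NULL, paste0("t", 1:4)))
all.equal(unname(summarize_bootstraps(toy)),
          unname(aperm(apply(toy, 2, quantile))))  # should be TRUE
```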
@@ -0,0 +1,14 @@
propagate_transcript_filter <- function(filter_df, target_mapping,
                                        grouping_column) {

  filtered_target_mapping <- dplyr::inner_join(as.data.table(filter_df), # nolint
    as.data.table(target_mapping), by = 'target_id') # nolint

  filtered_target_mapping <- dplyr::select_(filtered_target_mapping,
                                            grouping_column)

  data.table::setnames(filtered_target_mapping, grouping_column, 'target_id')
  filtered_target_mapping <- dplyr::distinct(filtered_target_mapping)

  filtered_target_mapping
}
[Review comment] Make all explicit by using the scope operator.
[Reply] done
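
For context on the helper added in this file (the comment about the scope operator refers to qualifying the dplyr and data.table calls with `::`), here is a small usage sketch. It is not part of the PR: the input data frames and the `ens_gene` grouping column are invented, and the call assumes `propagate_transcript_filter()` as defined above is available in the session.

```r
# Usage sketch only -- not part of this PR. Assumes propagate_transcript_filter()
# from the new file above is defined in the session (it is an internal helper,
# not exported), and that data.table is installed so as.data.table() resolves.
library(data.table)

# Invented example inputs: transcripts that pass the sleuth filter, and a
# transcript-to-gene mapping with a hypothetical 'ens_gene' grouping column.
filter_df <- data.frame(target_id = c("tx1", "tx3"),
                        stringsAsFactors = FALSE)
target_mapping <- data.frame(target_id = c("tx1", "tx2", "tx3", "tx4"),
                             ens_gene = c("geneA", "geneA", "geneB", "geneC"),
                             stringsAsFactors = FALSE)

# Lifts the transcript-level filter to the gene level: keeps every gene with at
# least one passing transcript and renames the grouping column to 'target_id'.
# Expected to return a one-column table containing "geneA" and "geneB".
propagate_transcript_filter(filter_df, target_mapping, "ens_gene")
```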