Merge branch 'devel' into alr

pachterlab · Nov 17, 2017 · 1e84a3a · 1e84a3a
2 parents 5d09823 + 725197c
commit 1e84a3a
Show file tree

Hide file tree

Showing 9 changed files with 903 additions and 259 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,9 @@
 Package: sleuth
 Title: Tools for investigating RNA-Seq
 Version: 0.29.0
-Authors@R: c(person("Harold", "Pimentel", , "[email protected]", role = c("aut", "cre")))
+Authors@R: c(
+  person("Harold", "Pimentel", , "[email protected]", role = c("aut", "cre")),
+  person("Warren", "McGee", , "[email protected]", role = "aut"))
 Description: Investigate transcript abundance from "kallisto" and differential
     expression analysis from RNA-Seq data.
 License: GPL-3
@@ -21,6 +23,7 @@ Imports:
     parallel,
     lazyeval,
     matrixStats,
+    pheatmap,
     shiny
 Suggests:
     MASS,

diff --git a/R/bootstrap.R b/R/bootstrap.R
@@ -256,11 +256,12 @@ get_bootstrap_summary <- function(obj, target_id, units = 'est_counts') {
     stop(paste0("'", units, "' is invalid for 'units'. please see documentation"))
   }
 
-  if (is.null(obj$bs_quants)) {
-    if (units == 'est_counts') {
-      stop("bootstrap summary missing. rerun sleuth_prep() with argument 'extra_bootstrap_summary = TRUE'")
+  if (is.null(obj$bs_quants) | is.null(obj$bs_quants[[1]][[units]])) {
+    if (units %in% c('est_counts', 'scaled_reads_per_base')) {
+      stop("bootstrap summary appears to be missing. rerun sleuth_prep() with argument 'extra_bootstrap_summary = TRUE'")
     } else {
-      stop("bootstrap summary missing. rerun sleuth_prep() with argument 'extra_bootstrap_summary = TRUE' and 'read_bootstrap_tpm = TRUE'")
+      stop("bootstrap summary appears to be missing. rerun sleuth_prep() with argument 'extra_bootstrap_summary = TRUE' ",
+           "and 'read_bootstrap_tpm = TRUE'")
     }
   }
 

diff --git a/R/matrix.R b/R/matrix.R
@@ -35,10 +35,18 @@ sleuth_to_matrix <- function(obj, which_df, which_units) {
   if ( !(which_df %in% c("obs_norm", "obs_raw")) ) {
     stop("Invalid object")
   }
-  if ( !(which_units %in% c("tpm", "est_counts")) ) {
+  if ( !(which_units %in% c("tpm", "est_counts", "scaled_reads_per_base")) ) {
     stop("Invalid units")
   }
 
+  which_units <- check_quant_mode(obj, which_units)
+
+  if (obj$gene_mode && which_df == "obs_raw") {
+    warning("This object is in gene mode, and the raw values are ",
+            "transcripts. Using 'obs_norm' instead.")
+    which_df <- "obs_norm"
+  }
+
   data <- as.data.frame(obj[[which_df]])
 
   res <- list()

diff --git a/R/misc.R b/R/misc.R
@@ -64,8 +64,48 @@ sliding_window_grouping <- function(data, x_col, y_col,
 shrink_df <- function(data, shrink_formula, filter_var) {
   data <- as.data.frame(data)
   s_formula <- substitute(shrink_formula)
-  fit <- eval(loess(s_formula, data[data[, filter_var], ]))
-  data.frame(data, shrink = predict(fit, data))
+  fit <- eval(loess(s_formula, data[data[, filter_var], ], model = T))
+  shrink_data <- data.frame(data, shrink = predict(fit, data))
+
+  # include a column to report any target IDs that failed shrinkage estimation
+  # 'failed_ise' is short for 'failed initial shrinkage estimation'
+  shrink_data$failed_ise <- is.na(shrink_data$shrink)
+  if (any(shrink_data$failed_ise)) {
+    na_rows <- which(shrink_data$failed_ise)
+    num_na <- length(na_rows)
+    which_na <- shrink_data$target_id[na_rows]
+    max_mean_obs <- summary(fit$model$mean_obs)["Max."]
+    min_mean_obs <- summary(fit$model$mean_obs)["Min."]
+    # check for mean_obs values that fall outside of the window that
+    # was used to fit the LOESS curve. If any values do fall outside,
+    # repeat the LOESS fit using "surface = direct" to get an exact surface fit,
+    # which can then be used for extrapolation of these values.
+    if (any(shrink_data$mean_obs > max_mean_obs |
+            shrink_data$mean_obs < min_mean_obs)) {
+      message(num_na, " NA values were found during variance shrinkage estimation",
+              " due to mean observation values outside of the range used for the LOESS fit.\n",
+              "The LOESS fit will be repeated using exact computation of the fitted ",
+              "surface to extrapolate the missing values.\n",
+              "These are the target ids with NA values: ", paste(which_na, collapse = ", "))
+      direct_fit <- eval(loess(s_formula, data[data[, filter_var], ], model = T,
+                  control = loess.control(surface = "direct")))
+      na_data <- data[na_rows, ]
+      na_shrink_data <- data.frame(na_data, shrink = predict(direct_fit, na_data))
+      shrink_data[na_rows, "shrink"] <- na_shrink_data$shrink
+    }
+    # repeat the check for NA values; if there were no values outside of the mean_obs window used
+    # (first check above) or if there were new NA values from the direct fit, then
+    # the cause of the NA values is unknown. The user will be warned that this will cause
+    # all downstream testing to fail for these target IDs.
+    if (any(is.na(shrink_data$shrink))) {
+      stop(num_na, " NA values were found during variance shrinkage estimation",
+              " due to an unknown cause. These values will result in NAs ",
+              "with any downstream testing for these target ids.\n",
+              "Please submit a bug report at the Sleuth Github website.\n",
+              "These are the target ids with NA values: ", paste(which_na, collapse = ", "))
+    }
+  }
+  shrink_data
 }
 
 msg <- function(..., nl = TRUE) {

diff --git a/R/model.R b/R/model.R
@@ -271,30 +271,37 @@ tests.sleuth <- function(obj, lrt = TRUE, wt = TRUE) {
 
 }
 
-#' Extract Wald test results from a sleuth object
+#' Extract Wald or Likelihood Ratio test results from a sleuth object
 #'
-#' This function extracts Wald test results from a sleuth object.
+#' This function extracts Wald or Likelihood Ratio test results from a sleuth object.
 #'
 #' @param obj a \code{sleuth} object
 #' @param test a character string denoting the test to extract. Possible tests can be found by using \code{models(obj)}.
-#' @param which_model a character string denoting the model. If extracting a wald test, use the model name. If extracting a likelihood ratio test, use 'lrt'.
+#' @param test_type 'wt' for Wald test or 'lrt' for Likelihood Ratio test.
+#' @param which_model a character string denoting the model. If extracting a wald test, use the model name.
+#'   Not used if extracting a likelihood ratio test.
 #' @param rename_cols if \code{TRUE} will rename some columns to be shorter and
-#' consistent with vignette
+#'   consistent with vignette
 #' @param show_all if \code{TRUE} will show all transcripts (not only the ones
 #' passing filters). The transcripts that do not pass filters will have
 #' \code{NA} values in most columns.
 #' @return a \code{data.frame} with the following columns:
 #' @return target_id: transcript name, e.g. "ENSXX#####" (dependent on the transcriptome used in kallisto)
+#' @return ...: if there is a target mapping data frame, all of the annotations columns are added from \code{obj$target_mapping} before the other columns.
 #' @return pval: p-value of the chosen model
 #' @return qval: false discovery rate adjusted p-value, using Benjamini-Hochberg (see \code{\link{p.adjust}})
-#' @return b: 'beta' value (effect size). Technically a biased estimator of the fold change
-#' @return se_b: standard error of the beta
+#' @return test_stat (LRT only): Chi-squared test statistic (likelihood ratio test). Only seen with Likelihood Ratio test results.
+#' @return rss (LRT only): the residual sum of squares under the "null model". Only seen with Likelihood Ratio test results.
+#' @return degrees_free (LRT only): the degrees of freedom (equal to difference between the two models). Only seen with Likelihood Ratio test results.
+#' @return b (Wald only): 'beta' value (effect size). Technically a biased estimator of the fold change. Only seen with Wald test results.
+#' @return se_b (Wald only): standard error of the beta. Only seen with Wald test results.
 #' @return mean_obs: mean of natural log counts of observations
 #' @return var_obs: variance of observation
-#' @return tech_var: technical variance of observation from the bootstraps
+#' @return tech_var: technical variance of observation from the bootstraps (named 'sigma_q_sq' if rename_cols is \code{FALSE})
 #' @return sigma_sq: raw estimator of the variance once the technical variance has been removed
 #' @return smooth_sigma_sq: smooth regression fit for the shrinkage estimation
 #' @return final_simga_sq: max(sigma_sq, smooth_sigma_sq); used for covariance estimation of beta
+#'   (named 'smooth_sigma_sq_pmax' if rename_cols is \code{FALSE})
 #' @seealso \code{\link{sleuth_wt}} and \code{\link{sleuth_lrt}} to compute tests, \code{\link{models}} to
 #' view which models, \code{\link{tests}} to view which tests were performed (and can be extracted)
 #' @examples
@@ -327,6 +334,20 @@ sleuth_results <- function(obj, test, test_type = 'wt',
   res <- NULL
   if (test_type == 'lrt') {
     res <- get_test(obj, test, type = 'lrt')
+    res <- dplyr::select(res,
+      target_id,
+      pval,
+      qval,
+      test_stat,
+      rss,
+      degrees_free,
+      mean_obs,
+      var_obs,
+      sigma_q_sq,
+      sigma_sq,
+      smooth_sigma_sq,
+      smooth_sigma_sq_pmax
+      )
   } else {
     res <- get_test(obj, test, 'wt', which_model)
     res <- dplyr::select(res,
@@ -362,8 +383,8 @@ sleuth_results <- function(obj, test, test_type = 'wt',
 
   if ( !is.null(obj$target_mapping) && !obj$gene_mode) {
     res <- dplyr::left_join(
-      data.table::as.data.table(res),
       data.table::as.data.table(obj$target_mapping),
+      data.table::as.data.table(res),
       by = 'target_id')
   }
 
@@ -377,9 +398,12 @@ sleuth_results <- function(obj, test, test_type = 'wt',
     # this line uses dplyr's "left_join" syntax for "by"
     # to match "target_id" from the "res" table,
     # and the gene_column from the target_mapping table.
-    res <- dplyr::left_join(data.table::as.data.table(res),
-                            data.table::as.data.table(target_mapping),
-                            by = c("target_id" = obj$gene_column))
+    by_col <- "target_id"
+    names(by_col) <- obj$gene_column
+    res <- dplyr::left_join(data.table::as.data.table(target_mapping),
+                            data.table::as.data.table(res),
+                            by = by_col)
+    names(res)[1] <- "target_id"
   }
 
   res <- as_df(res)