update documentations

cz-ye · Dec 7, 2017 · 633ea39 · 633ea39
1 parent 1e8763e
commit 633ea39
Show file tree

Hide file tree

Showing 15 changed files with 114 additions and 87 deletions.
diff --git a/R/EM_DE.R b/R/EM_DE.R
@@ -2,19 +2,20 @@
 #'
 #' Fit the DECENT model with DE assumption
 #'
-#' @param data.obs Observed count matrix for endogeneous genes, rows represent genes, columns represent cells
-#' @param spike Observed count matrix for spike-ins, rows represent spike-in molecules, columns represent cells
+#' @param data.obs Observed count matrix for endogeneous genes, rows represent genes, columns represent cells.
+#' @param spike Observed count matrix for spike-ins, rows represent spike-ins, columns represent cells
 #' (ONLY if spikes = \code{TRUE}).
-#' @param spike.conc spike.conc A vector of theoretical count for each spike-in in one cell (ONLY if spikes = \code{TRUE}).
-#' @param CE.range A two-element vector of the lower limit and upper limit for the estimated
-#' capture efficiencies (ONLY if spikes = \code{FALSE}, default [0.02, 0.10]).
+#' @param spike.conc A vector of theoretical count for each spike-in in one cell (ONLY needed if spikes = \code{TRUE}).
+#' @param CE.range A two-element vector of the lower limit and upper limit for the estimated range of
+#' capture efficiencies (ONLY needed if spikes = \code{FALSE}, default [0.02, 0.10]).
 #' @param cell.type A factor or a integer/numeric vector starting from 1 providing cell-type labels.
 #' @param use.spikes If \code{TRUE}, use spike-ins to estimate capture efficiencies.
-#' @param normalize Normalization method, either 'ML' (maximum likelihood) or 'TMM' (Robinson et al., 2010).
-#' @param GQ.approx If \code{TRUE}, use GQ approximation to speed up E-step.
-#' @param parallel If \code{TRUE}, run in parallel.
+#' @param normalize Method for estimating size factors, either 'ML' (maximum likelihood, Ye et al., 2017) or 'TMM' (Robinson et al., 2010).
+#' @param GQ.approx If \code{TRUE}, use Gaussian-Quadrature approximation to speed up E-step.
+#' @param maxit maximum number of iterations for EM algorithm.
+#' @param parallel If \code{TRUE}, run DECENT in parallel.
 #'
-#' @return A list of DE model estimates
+#' @return A list containing estimates of DE model
 #' @examples
 #'
 #' @import MASS

diff --git a/R/EM_noDE.R b/R/EM_noDE.R
@@ -4,9 +4,10 @@
 #'
 #' @param data.obs Observed count matrix for endogeneous genes, rows represent genes, columns represent cells.
 #' @param CE A  vector of capture efficiencies.
-#' @param normalize Normalization method, either 'ML' (maximum likelihood) or 'TMM' (Robinson et al., 2010).
-#' @param GQ.approx If \code{TRUE}, use GQ approximation to speed up E-step.
-#' @param parallel If \code{TRUE}, run in parallel.
+#' @param normalize Method for estimating size factors, either 'ML' (maximum likelihood, Ye et al., 2017) or 'TMM' (Robinson et al., 2010).
+#' @param GQ.approx If \code{TRUE}, use Gaussian-Quadrature approximation to speed up E-step.
+#' @param maxit maximum number of iterations for EM algorithm.
+#' @param parallel If \code{TRUE}, run DECENT in parallel.
 #'
 #' @return A list of no DE model estimates
 #' @examples

diff --git a/R/E_step.R b/R/E_step.R
@@ -2,22 +2,20 @@
 #'
 #' Perform E-step for one gene, without GQ approximation
 #'
-#' @param par A vector of dropout model
-#' @param z A matrix, 1st col = obs count, 2nd,3rd row = DO par
-#' @param z.ind A binary indicator of (z == 0) in the original obs count (before imputation)
+#' @param par A matrix with two columns containing coefficient for the dropout model. The second column is currently unused
+#' and set to zero but may be used in the future to model droput rates dependence on gene-specific factors.
+#' @param z vector of observed count
+#' @param z.ind A binary indicator of (z>0) in the original obs count (before imputation)
 #' @param sf Size factors for different cells
-#' @param mu Mean parameter for gene i
-#' @param disp Reciprocal of size parameter for gene i
+#' @param pi0 Gene-specific zero-inflated parameter
+#' @param mu Gene-specific mean parameter
+#' @param disp Reciprocal of gene-specific size parameter for
 #'
 EstepByGene <- function(par, z, z.ind, sf, pi0, mu, disp, y) {
 
   y <- 1:max(2, qnbinom(0.999, mu = max(mu*sf), size = 1/disp)) #changed to 350 on 01/12/16
- # DO.probs is 200 x ncell matrix
   DO.prob <- apply(as.matrix(cbind(z, par)), 1, calcDOProb, y = y)
 
-  #take into account DE
-  #mu <- mu*dmu
-  #pi0 <- pi0 + dpi0
   NB.prob <- t(apply(as.matrix(y), 1, calcNBProb, mu = mu*sf, size = 1/disp))
   NB.prob <- NB.prob %*% diag(1-pi0)
 
@@ -35,7 +33,6 @@ EstepByGene <- function(par, z, z.ind, sf, pi0, mu, disp, y) {
   EY.wt <- (DO.prob*NB.prob) %*% diag(1/(PZ0E1))
   # impute for all obs
   EYZ0E1 <- colSums(EY.wt*y)
-  #EYZ0E1[z.ind] <- z[z.ind]/par
 
   out <- list()
   out[['PZ0E1']] <- PZ0E1
@@ -76,6 +73,3 @@ calcDOProb <- function(x, y, rho = 0.2) {
 calcNBProb <- function(y, mu, size) {
   return(dnbinom(y, mu = mu, size = size))
 }
-
-
-
diff --git a/R/E_step_approx.R b/R/E_step_approx.R
@@ -2,12 +2,14 @@
 #'
 #' Perform E-step for one gene, with GQ approximation
 #'
-#' @param par A vector of dropout model
-#' @param z A matrix, 1st col = obs count, 2nd,3rd row = DO par
-#' @param z.ind A binary indicator of (z == 0) in the original obs count (before imputation)
+#' @param par A matrix with two columns containing coefficient for the dropout model. The second column is currently unused
+#' and set to zero but may be used in the future to model droput rates dependence on gene-specific factors.
+#' @param z vector of observed count
+#' @param z.ind A binary indicator of (z>0) in the original obs count (before imputation)
 #' @param sf Size factors for different cells
-#' @param mu Mean parameter for gene i
-#' @param disp Reciprocal of size parameter for gene i
+#' @param pi0 gene-specific zero-inflated parameter
+#' @param mu gene-specific mean parameter
+#' @param disp Reciprocal of gene-specific size parameter for
 #' @param GQ.object Gauss-Legendre quadrature points and weights
 #'
 Estep2ByGene <- function(par, z, z.ind, sf, pi0, mu,disp, GQ.object) {

diff --git a/R/LRT.R b/R/LRT.R
@@ -1,14 +1,14 @@
 #' Likelihood-ratio test
 #'
-#' Likelihood-ratio test
+#' Likelihood-ratio test for DE analysis of scRNA-seq data
 #'
 #' @param data.obs Observed count matrix for endogeneous genes, rows represent genes, columns represent cells
-#' @param out Output object from EM for DE model
-#' @param out2 Output object from EM for no-DE model
+#' @param out Output of fitDE, it contains EM algorithm estimates for models with DE between cell-types.
+#' @param out2 Output of fitNoDE, it contains EM algorithm estimates for models without DE between cell-types
 #' @param cell.type A factor or a integer/numeric vector starting from 1 giving cell-type labels
 #' @param parallel If \code{TRUE}, run in parallel
 #'
-#' @return A list including statistics, p-values and parameters for likelihood-ratio test.
+#' @return A list containing statistics, p-values and parameter estimates for models with and without DE.
 #'
 #' @useDynLib DECENT
 #' @importFrom Rcpp sourceCpp
@@ -110,6 +110,3 @@ lrTest <- function(data.obs, out, out2, cell.type, parallel) {
 
   return(output)
 }
-
-
-
diff --git a/R/M_step.R b/R/M_step.R
@@ -1,7 +1,14 @@
 #' M-step
 #'
-#' Perform the last part of M-step gene by gene, update pi_0, mu and phi
+#' Calculate negative complete data log-likelihood of DECENT model
+#' @param p vector of the following parameters: log odds-ratio of pi0, log mean of the first cell type,
+#' log fold-change for the mean parameter and log of dispersion (1/size) parameter.
+#' @param y vector containing the expected number of molecules as output from the E-step.
+#' @param sf vector of size factor estimates
+#' @param status the estimated probability of non-zero pre-dropout count, output of the E-step.
+#' @param ct A factor or a integer/numeric vector starting from 1 giving cell-type labels
 #'
+#' @return negative complete data log-likelihood evaluated at the parameter values.
 #'
 MstepNB <- function(p, y, sf, status, ct) {
   #n.ct=max(ct)
@@ -15,15 +22,15 @@ MstepNB <- function(p, y, sf, status, ct) {
   return(-sum(logl))
 }
 
-#' Gradient of log-likelihood function for ZINB model
+#' Gradient of complete data log-likelihood function for DECENT model
 #'
 #' Not used currently
 #'
 #' @param p A vector of parameter
 #' p[1] = logistic regression intercept for pi0
-#' p[2] = log mu1
-#' p[3] = log DE par for mu
-#' p[4] = log disp parameter
+#' p[2] = log mu for the first cell type
+#' p[3] = log fold-change (FC) for mean parameter
+#' p[4] = log dispersion (1/size) parameter
 #'
 zinbGrad <- function(p,y,status,sf,ct) {
 

diff --git a/R/decent.R b/R/decent.R
@@ -5,19 +5,19 @@
 #' @param data.obs Observed count matrix for endogeneous genes, rows represent genes, columns represent cells.
 #' @param spike Observed count matrix for spike-ins, rows represent spike-ins, columns represent cells
 #' (ONLY if spikes = \code{TRUE}).
-#' @param spike.conc A vector of theoretical count for each spike-in in one cell (ONLY if spikes = \code{TRUE}).
-#' @param CE.range A two-element vector of the lower limit and upper limit for the estimated
-#' capture efficiencies (ONLY if spikes = \code{FALSE}, default [0.02, 0.10]).
+#' @param spike.conc A vector of theoretical count for each spike-in in one cell (ONLY needed if spikes = \code{TRUE}).
+#' @param CE.range A two-element vector of the lower limit and upper limit for the estimated range of
+#' capture efficiencies (ONLY needed if spikes = \code{FALSE}, default [0.02, 0.10]).
 #' @param cell.type A factor or a integer/numeric vector starting from 1 providing cell-type labels.
 #' @param use.spikes If \code{TRUE}, use spike-ins to estimate capture efficiencies.
-#' @param normalize Normalization method, either 'ML' (maximum likelihood) or 'TMM' (Robinson et al., 2010).
-#' @param GQ.approx If \code{TRUE}, use GQ approximation to speed up E-step.
+#' @param normalize Method for estimating size factors, either 'ML' (maximum likelihood, Ye et al., 2017) or 'TMM' (Robinson et al., 2010).
+#' @param GQ.approx If \code{TRUE}, use Gaussian-Quadrature approximation to speed up E-step.
 #' @param parallel If \code{TRUE}, run DECENT in parallel.
-#' @param n.cores Number of CPU cores to use, default is all (ONLY if \code{TRUE}).
+#' @param n.cores Number of CPU cores to use, default is all (ONLY if parallel=\code{TRUE}).
 #' @param imputed If \code{TRUE}, include imputed data matrix in the output (Not available now).
 #' @param dir Output directory for the DE model estimates, no-DE model estimates as well as the LRT output.
 #'
-#' @return A data frame containing the result of differential expression analysis.
+#' @return A list containing the result of differential expression analysis, with the following components: stat,pval,par.DE and par.noDE.
 #'
 #' @import parallel
 #' @import foreach

diff --git a/man/Estep2ByGene.Rd b/man/Estep2ByGene.Rd
diff --git a/man/EstepByGene.Rd b/man/EstepByGene.Rd
diff --git a/man/MstepNB.Rd b/man/MstepNB.Rd
diff --git a/man/decent.Rd b/man/decent.Rd
diff --git a/man/fitDE.Rd b/man/fitDE.Rd
diff --git a/man/fitNoDE.Rd b/man/fitNoDE.Rd
diff --git a/man/lrTest.Rd b/man/lrTest.Rd