AlexsLemonade · sjspielman · Dec 18, 2023 · Dec 18, 2023 · Dec 18, 2023 · Dec 18, 2023
diff --git a/R/merge_sce_list.R b/R/merge_sce_list.R
@@ -62,7 +62,7 @@ merge_sce_list <- function(
     cell_id_column = "cell_id",
     include_altexp = TRUE) {
 
-  # Check `sce_list`----------------------
+  ## Checks --------------------------
   if (is.null(names(sce_list))) {
     warning(
       glue::glue(
@@ -79,15 +79,13 @@ merge_sce_list <- function(
     return(sce_list)
   }
 
-  # Check `retain_coldata_cols` ----------------
+  # Check `retain_coldata_cols`
   if (length(retain_coldata_cols) == 0) {
     warning("All pre-existing colData will be removed from the the merged SCE.
      Please check that `retain_coldata_cols` was correctly specified.")
   }
 
-  # Subset SCEs to shared features and ensure appropriate naming ------------------
-
-  # First, obtain intersection among all SCE objects
+  # Check for shared features
   shared_features <- sce_list |>
     purrr::map(rownames) |>
     purrr::reduce(intersect)
@@ -97,6 +95,64 @@ merge_sce_list <- function(
          They cannot be merged.")
   }
 
+  # Check that library id and sample id are present in main SCE metadata
+  id_checks <- sce_list |>
+    purrr::map(\(sce){
+      all(c("library_id", "sample_id") %in% names(metadata(sce)))
+    }) |>
+    unlist()
-    purrr::map(\(sce){
-      all(c("library_id", "sample_id") %in% names(metadata(sce)))
-    }) |>
-    unlist()
+    purrr::map_lgl(\(sce){
+      all(c("library_id", "sample_id") %in% names(metadata(sce)))
+    })
-    purrr::map(\(sce){
-      all(c("library_id", "sample_id") %in% names(metadata(sce)))
-    }) |>
-    unlist()
+    purrr::map_lgl(\(sce){
+      all(c("library_id", "sample_id") %in% names(metadata(sce)))
+    })
+
+  if (!all(id_checks)) {
+    stop("The metadata for each SCE object must contain `library_id` and `sample_id`.")
+  }
+
+  # Check altExp compatibility, if we are including them
+  if (include_altexp) {
+
+    # Find all altExp names present in the SCE objects.
+    altexp_names <- sce_list |>
+      purrr::map(
+        \(sce) altExpNames(sce)
+      ) |>
+      purrr::reduce(union)
+
+    # For each altExp name, check that every SCE containing that altExp has the
+    # same set of features. If not, error out.
+    for (altexp_name in altexp_names) {
+
+      # all altExps for this name
+      altexp_list <- sce_list |>
+        purrr::keep(\(sce) altexp_name %in% altExpNames(sce)) |>
+        purrr::map(altExp, altexp_name)
+
+      # find their union of features
+      altexp_name_features <- altexp_list |>
+        purrr::map(rownames) |>
+        purrr::reduce(union) |>
+        sort()
+
+      # create logical vector for presence of all features
+      features_present <- altexp_list |>
+        purrr::map_lgl(
+          \(alt_sce) identical(altexp_name_features, sort(rownames(alt_sce)))
+        )
+
+      if (!all(features_present)) {
+        stop(
+          glue::glue("The {altexp_name} alternative experiments do not share the same set of features.")
+        )
+      }
+    }
+  }
+
+  ## Subset SCEs to shared features and ensure appropriate naming ------------------
+
+  # First, obtain the union of features for all (main) SCE objects
+  # this will also be the final order of features/rows in the final SCE
+  sce_full_features <- sce_list |>
+    purrr::map(rownames) |>
+    purrr::reduce(union)
-  # First, obtain the union of features for all (main) SCE objects
-  # this will also be the final order of features/rows in the final SCE
-  sce_full_features <- sce_list |>
-    purrr::map(rownames) |>
-    purrr::reduce(union)
+  # First, obtain the intersection of features for all (main) SCE objects.
+  # This will also be the final order of features/rows in the final SCE.
+  sce_shared_features <- sce_list |>
+    purrr::map(rownames) |>
+    purrr::reduce(intersect)
-  # First, obtain the union of features for all (main) SCE objects
-  # this will also be the final order of features/rows in the final SCE
-  sce_full_features <- sce_list |>
-    purrr::map(rownames) |>
-    purrr::reduce(union)
+  # First, obtain the intersection of features for all (main) SCE objects.
+  # This will also be the final order of features/rows in the final SCE.
+  sce_shared_features <- sce_list |>
+    purrr::map(rownames) |>
+    purrr::reduce(intersect)
+
   # Second, determine all the column names that are present in any SCE so it can
   #  be created in any missing SCEs with `NA` values
   all_colnames <- sce_list |>
@@ -106,35 +162,22 @@ merge_sce_list <- function(
     unlist() |>
     unique()
 
-  # Check that the `retain_coldata_cols` are present in at least one SCE, and
-  #  error if the column exists nowhere.
+  # Check that the `retain_coldata_cols` are present in at least one SCE
   if (!(any(retain_coldata_cols %in% all_colnames))) {
-    warning("The provided `retain_coldata_cols` are not present in any SCEs.")
+    warning("The provided `retain_coldata_cols` are not present in any SCE colData.")
   }
 
-  # check that library id and sample id are present in metadata
-  id_checks <- sce_list |>
-    purrr::map(\(sce){
-      all(c("library_id", "sample_id") %in% names(metadata(sce)))
-    }) |>
-    unlist()
-
-  if (!all(id_checks)) {
-    stop("The metadata for each SCE object must contain `library_id` and `sample_id`.")
-  }
-
-  # Prepare main experiment of SCEs for merging --------------------
+  ## Prepare main experiment of SCEs for merging --------------------
   sce_list <- sce_list |>
     purrr::imap(
       prepare_sce_for_merge,
       batch_column = batch_column,
       cell_id_column = cell_id_column,
-      shared_features = shared_features,
+      all_features = sce_full_features,
-      all_features = sce_full_features,
+      merge_features = sce_shared_features,
-      all_features = sce_full_features,
+      merge_features = sce_shared_features,
       retain_coldata_cols = retain_coldata_cols,
       preserve_rowdata_cols = preserve_rowdata_cols
     )
 
-
   ## Handle metadata ---------------------------------------------
   # get a list of metadata from the list of sce objects
   # each library becomes an element within the metadata components
@@ -240,8 +283,8 @@ merge_sce_list <- function(
 #'   colData slot
 #' @param cell_id_column The name of the cell_id column which will be added to the
 #'   colData slot
-#' @param shared_features A vector of features (genes) that all SCEs to be merged
-#'   have in common
+#' @param all_features A vector of features that all SCEs are expected to have.
+#'   If any are missing, they will be added and filled with `NA` values.
-#' @param all_features A vector of features that all SCEs are expected to have.
-#'   If any are missing, they will be added and filled with `NA` values.
+#' @param merge_features A vector of features that will be included in the output SCE for merging.
+#'   If any are missing, they will be added and filled with `NA` values.
-#' @param all_features A vector of features that all SCEs are expected to have.
-#'   If any are missing, they will be added and filled with `NA` values.
+#' @param merge_features A vector of features that will be included in the output SCE for merging.
+#'   If any are missing, they will be added and filled with `NA` values.
 #' @param retain_coldata_cols A vector of columns to retain in the colData slot.
 #'   If columns are missing from the data, they will be filled with `NA` values.
 #'   The exceptions to this are `barcode_column` and `batch_column` which will be
@@ -255,12 +298,20 @@ prepare_sce_for_merge <- function(
   sce_name,
   batch_column,
   cell_id_column,
-  shared_features,
+  all_features,
-  all_features,
+  merge_features,
-  all_features,
+  merge_features,
   retain_coldata_cols,
   preserve_rowdata_cols) {
 
-  # Subset to shared features
-  sce <- sce[shared_features, ]
+
+  #### assays #####
+  # If all features are present, order them to match `all_features` order
+  # If any features are missing, create a new SCE with those features and pop in
+  #  existing assays and rowData slot
+  sce <- create_sce_with_all_features(
-  sce <- create_sce_with_all_features(
+  sce <- create_sce_with_features(
-  sce <- create_sce_with_all_features(
+  sce <- create_sce_with_features(
+    sce,
+    all_features
-    all_features
+    merge_features
-    all_features
+    merge_features
+  )
+
 
   ##### rowData #####
   # Add `sce_name` ID to rowData column names except for those
@@ -333,6 +384,102 @@ prepare_sce_for_merge <- function(
 }
 
 
+#' Create a new SCE that contains all provided features, or reorder a provided
+#'  SCE into the given feature order
+#'
+#' @param sce SCE from which a new SCE will be created, or which will be reordered
+#' @param all_features A vector of all features that need to be in the final SCE,
+#'   in this order
-#' @param all_features A vector of all features that need to be in the final SCE,
-#'   in this order
+#' @param features A vector of the feature names for the final SCE, in output order
-#' @param all_features A vector of all features that need to be in the final SCE,
-#'   in this order
+#' @param features A vector of the feature names for the final SCE, in output order
+#'
+#' @return SCE object with all features in the given order
+create_sce_with_all_features <- function(
+    sce,
+    all_features) {
-    all_features) {
+    features) {
-    all_features) {
+    features) {
+
+  present_features <- rownames(sce)
+  feature_diff <- setdiff(all_features, present_features)
-  feature_diff <- setdiff(all_features, present_features)
+  missing_features <- setdiff(features, present_features)
-  feature_diff <- setdiff(all_features, present_features)
+  missing_features <- setdiff(features, present_features)
+
+  if (length(feature_diff) == 0) {
+
+    # Simply reorder and return
+    return(sce[all_features,])
-  if (length(feature_diff) == 0) {
-
-    # Simply reorder and return
-    return(sce[all_features,])
+  if (length(missing_features) == 0) {
+
+    # Simply reorder and/or filter and return
+    return(sce[features,])
-  if (length(feature_diff) == 0) {
-
-    # Simply reorder and return
-    return(sce[all_features,])
+  if (length(missing_features) == 0) {
+
+    # Simply reorder and/or filter and return
+    return(sce[features,])
+
+  } else {
+
+    # new matrix with all NAs
+    new_matrix <- matrix(
+      data = NA,
+      nrow = length(all_features),
+      ncol = ncol(sce),
+      dimnames = list(
+        all_features,
+        colnames(sce)
+      )
+    )
-    new_matrix <- matrix(
-      data = NA,
-      nrow = length(all_features),
-      ncol = ncol(sce),
-      dimnames = list(
-        all_features,
-        colnames(sce)
-      )
-    )
+    new_matrix <- Matrix::Matrix(
+      data = NA_real_,
+      nrow = length(all_features),
+      ncol = ncol(sce),
+      dimnames = list(
+        all_features,
+        colnames(sce)
+      )
+      sparse = TRUE
+    )
-    new_matrix <- matrix(
-      data = NA,
-      nrow = length(all_features),
-      ncol = ncol(sce),
-      dimnames = list(
-        all_features,
-        colnames(sce)
-      )
-    )
+    new_matrix <- Matrix::Matrix(
+      data = NA_real_,
+      nrow = length(all_features),
+      ncol = ncol(sce),
+      dimnames = list(
+        all_features,
+        colnames(sce)
+      )
+      sparse = TRUE
+    )
+
+    # Create new matrix for each present assay
+    sce_assays <- assayNames(sce)
+    new_assays <- sce_assays |>
+      purrr::map(
+        \(assay_name) {
+
+
+          # fill in existing matrix values
+          new_matrix[present_features, colnames(sce)] <- as.matrix(
+            assay(sce, assay_name)[present_features, colnames(sce)]
+          )
-          new_matrix[present_features, colnames(sce)] <- as.matrix(
-            assay(sce, assay_name)[present_features, colnames(sce)]
-          )
+          new_matrix[present_features, colnames(sce)] <- (
+            assay(sce, assay_name)[present_features, colnames(sce)]
+          )
-          new_matrix[present_features, colnames(sce)] <- as.matrix(
-            assay(sce, assay_name)[present_features, colnames(sce)]
-          )
+          new_matrix[present_features, colnames(sce)] <- (
+            assay(sce, assay_name)[present_features, colnames(sce)]
+          )
+
+          return(new_matrix)
+        }
+      ) |>
+      purrr::set_names(sce_assays)
+
+    # Create a new SCE
+    new_sce <- SingleCellExperiment(assays = new_assays)
+
+    # Establish new rowData, filling in NAs for missing features
+    rowdata_colnames <- colnames(rowData(sce))
+    new_rowData <- matrix(
+      data = NA,
+      nrow = length(all_features),
+      ncol = ncol(rowData(sce)),
+      dimnames = list(
+        all_features,
+        rowdata_colnames
+      )
+    )
+
+    # Slot in existing rowData, ensuring correct order (as.matrix is needed)
+    new_rowData[present_features, rowdata_colnames] <- as.matrix(
+      rowData(sce)[present_features, rowdata_colnames]
+    )
-    new_rowData[present_features, rowdata_colnames] <- as.matrix(
-      rowData(sce)[present_features, rowdata_colnames]
-    )
+    new_rowData[present_features, rowdata_colnames] <- (
+      rowData(sce)[present_features, rowdata_colnames]
+    )
-    new_rowData[present_features, rowdata_colnames] <- as.matrix(
-      rowData(sce)[present_features, rowdata_colnames]
-    )
+    new_rowData[present_features, rowdata_colnames] <- (
+      rowData(sce)[present_features, rowdata_colnames]
+    )
+
+    # Add new rowData into new_sce
+    # TODO: We still need to handle columns like `gene_ids` which are now NAs
+    rowData(new_sce) <- new_rowData
+
+    # Add existing colData & metadata to the new SCE
+    colData(new_sce) <- colData(sce)
+    metadata(new_sce) <- metadata(sce)
+
+    # Return this new_sce
+    return(new_sce)
+  }
+}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
 
 #' Prepare altExps for merge and create a list of merged altExps for each altExp name
 #'

diff --git a/man/create_sce_with_all_features.Rd b/man/create_sce_with_all_features.Rd
diff --git a/man/prepare_sce_for_merge.Rd b/man/prepare_sce_for_merge.Rd