microbiome · TuomasBorman · Dec 11, 2024 · Jan 22, 2025 · Jan 23, 2025 · Jan 23, 2025
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: miaTime
 Type: Package
 Title: Microbiome Time Series Analysis
-Version: 0.99.2
+Version: 0.99.3
 Authors@R:
      c(person(given = "Leo", family = "Lahti", role = c("aut"),
               email = "[email protected]",

diff --git a/NAMESPACE b/NAMESPACE
@@ -27,3 +27,4 @@ importFrom(dplyr,mutate)
 importFrom(dplyr,summarize)
 importFrom(dplyr,ungroup)
 importFrom(tidyr,pivot_wider)
+importFrom(tidyr,unnest)
diff --git a/NEWS b/NEWS
@@ -1,3 +1,6 @@
+Changes in version 0.99.3
++ Handle repeated samples in divergence functions by calculating average
+
 Changes in version 0.99.1
 + Added bimodality functions
 

diff --git a/R/getBaselineDivergence.R b/R/getBaselineDivergence.R
@@ -135,8 +135,7 @@ setMethod("getBaselineDivergence", signature = c(x = "SummarizedExperiment"),
         ########################### INPUT CHECK END ############################
         # Add baseline samples to colData
         args <- .add_reference_samples_to_coldata(
-            x, time.col, group, reference, assay.type, method,
-            reference.method = "baseline", ...)
+            x, time.col, group, reference, reference.method = "baseline", ...)
         # Create an argument list. Do not include altexp as it is already taken
         # into account.
         args <- c(
@@ -146,12 +145,12 @@ setMethod("getBaselineDivergence", signature = c(x = "SummarizedExperiment"),
         )
         # Calculate divergences
         res <- do.call(getDivergence, args)
-        # Add time difference
-        x <- args[["x"]]
-        reference <- args[["reference"]]
-        time_res <- .get_time_difference(x, time.col, reference)
+        # Get time difference
+        args[["time.col"]] <- time.col
+        time_res <- do.call(.get_time_difference, args)
         # Create a DF to return
-        res <- .convert_divergence_to_df(x, res, time_res, reference, ...)
+        args <- c(args, list(x_orig = x, res = res, time_res = time_res))
+        res <- do.call(.convert_divergence_to_df, args)
         return(res)
     }
 )
@@ -231,7 +230,16 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),
     if( is.null(reference) ){
         ref <- .get_reference_samples(
             cd, time.col, time.interval, group, reference.method)
-        cd[[ref.name]] <- ref
+        # If the data includes repeated timepoints, the data has multiple
+        # reference samples for each sample. That is why we make sure that
+        # the data is expanded accordingly.
+        if( length(ref) != nrow(cd) ){
+            x <- x[, names(ref)]
+            cd <- cd[names(ref), ]
+        } else{
+            ref <- ref[ rownames(cd) ]
+        }
+        cd[[ref.name]] <- unname(ref)
         reference <- ref.name
     }
     # If reference was specified, check that it is specifying samples
@@ -270,16 +278,23 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),
 
     # Add modified colData back to TreeSE
     colData(x) <- cd
+    # Add original sample names (if there are duplicated timepoints, certain
+    # samples are duplicated). And make column names unique.
+    orig_sample_col <- "original_sample_names"
+    colData(x)[[orig_sample_col]] <- colnames(x)
+    colnames(x) <- make.unique(colnames(x))
     # The returned value includes the TreeSE along with reference
     # column name because it might be that we have modified it.
-    res <- list(x = x, reference = reference)
+    res <- list(
+        x = x, reference = reference, orig.sample.names = orig_sample_col)
     return(res)
 }
 
 # This function returns the first sample for each group by default.
 # Alternatively, it returns the previous ith sample for each sample in each
 # group.
 #' @importFrom dplyr group_by mutate arrange ungroup lag
+#' @importFrom tidyr unnest
 .get_reference_samples <- function(
         df, time.col, time.interval, group, reference.method){
     # This following line is to suppress "no visible binding for" messages
@@ -288,8 +303,8 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),
 
     rowname_col <- "temporary_rownames_column"
     reference_col <- "temporary_reference_column"
-    # Store rownames and add rownames as a column
-    df[[rowname_col]] <- original_order <- rownames(df)
+    # Add rownames as a column
+    df[[rowname_col]] <- rownames(df)
     # Convert to data.frame and group data based on group
     df <- df |>
         as.data.frame() |>
@@ -299,30 +314,70 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),
     if( reference.method == "baseline" ){
         # Find first timepoint within a group
         df <- df |>
-            mutate(!!reference_col :=
-                .data[[rowname_col]][which.min(.data[[time.col]])])
+            mutate(!!reference_col := .get_baseline_samples(
+                .data, time.col, rowname_col))
     } else if( reference.method == "stepwise" ){
         # For each sample, get the previous ith sample.
         # For each subject, get previous sample based on time.
         df <- df |>
-            mutate(!!reference_col := lag(
-                .data[[rowname_col]], n = time.interval,
-                order_by = .data[[time.col]]))
+            mutate(!!reference_col := .get_previous_samples(
+                .data, time.col, rowname_col, time.interval))
+    }
+    # Give warning if the data includes replicated timepoints.
+    if( any(lengths(df[[reference_col]]) > 1L) ){
+        warning("Some samples are associated with multiple reference samples. ",
+                "In these cases, the reference time point includes multiple ",
+                "samples, and their average is used.", call. = FALSE)
     }
-    # Ungroup to revert to the original structure and convert to DataFrame
+    # Ungroup to revert to the original structure, make sure that
+    # each cell includes single value (takes action when there are repeated
+    # timepoints) and convert to DataFrame
     df <- df |>
         ungroup() |>
+        unnest(cols = .data[[reference_col]]) |>
         DataFrame()
-    # Put the data into original order
-    rownames(df) <- df[[rowname_col]]
-    df <- df[original_order, ]
     # Get only reference samples
     res <- df[[reference_col]]
+    names(res) <- df[[rowname_col]]
     return(res)
 }
 
+# This function gets df as input. The data must be already grouped if grouping
+# exists. For each sample, this function finds baseline timepoint. If there
+# are multiple samples from baseline timepoint, all are returned.
+.get_baseline_samples <- function(.data, time.col, rowname_col){
+    # Get timepoints and corresponding baseline timepoints
+    time_points <- unique(sort(.data[[time.col]]))
+    baseline_time <- min(time_points, na.rm = TRUE)
+    baseline_time <- rep(baseline_time, length(.data[[time.col]]))
+    # Split sample names so that they are grouped into timepoints
+    timepoint_samples <- split(.data[[rowname_col]], .data[[time.col]])
+    # For each sample, assign baseline samples
+    baseline_samples <- timepoint_samples[
+        match(baseline_time, names(timepoint_samples)) ]
+    return(baseline_samples)
+}
+
+# This function gets df as input. The data must be already grouped if grouping
+# exists. For each sample, this function finds previous timepoint. If there
+# are multiple samples from previous timepoint, all are returned.
+#' @importFrom dplyr lag
+.get_previous_samples <- function(.data, time.col, rowname_col, time.interval){
+    # Get timepoints and corresponding previous timepoints
+    current_time <- unique(sort(.data[[time.col]]))
+    prev_time <- lag(current_time, n = time.interval)
+    prev_time <- prev_time[match(.data[[time.col]], current_time)]
+    # Split sample names so that they are grouped into timepoints
+    timepoint_samples <- split(.data[[rowname_col]], .data[[time.col]])
+    # For each sample, assign previous samples
+    prev_samples <- timepoint_samples[
+        match(prev_time, names(timepoint_samples)) ]
+    prev_samples[ lengths(prev_samples) == 0L ] <- NA_character_
+    return(prev_samples)
+}
+
 # This function get time difference between a sample and its reference sample
-.get_time_difference <- function(x, time.col, reference){
+.get_time_difference <- function(x, time.col, reference, ...){
     # Get timepoints
     time_point <- x[[time.col]]
     # Get reference time points
@@ -333,13 +388,31 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),
 }
 
 # This function converts time divergence results to DF object
+#' @importFrom dplyr summarize
 .convert_divergence_to_df <- function(
-        x, res, time_res, reference,
+        x_orig, x, res, time_res, orig.sample.names, reference,
         name = c("divergence", "time_diff", "ref_samples"), ...){
     # Validate 'name' param
     temp <- .check_input(name, list("character vector"), length = 3L)
     #
-    df <- DataFrame(res, time_res, x[[reference]], row.names = colnames(x))
+    df <- data.frame(res, time_res, sample = x[[orig.sample.names]])
+    # If data includes replicated samples, each time point might have multiple
+    # samples. We take average of these repeated time points.
+    df <- df |>
+        group_by(sample) |>
+        summarize(res = mean(res, ...), time_res = mean(time_res, ...)) |>
+        DataFrame()
+    # Add reference samples
+    ref_samples <- split(x[[reference]], x[[orig.sample.names]]) |> unname()
+    if( all(lengths(ref_samples) == 1L) ){
+        ref_samples <- unlist(ref_samples)
+    }
+    df[["reference_samples"]] <- ref_samples
+    # Wrangle names
+    rownames(df) <- df[["sample"]]
+    df[["sample"]] <- NULL
     colnames(df) <- name
+    # Ensure that the order is correct, i.e., matches with the original input
+    df <- df[colnames(x_orig), ]
     return(df)
 }
diff --git a/R/getStepwiseDivergence.R b/R/getStepwiseDivergence.R
@@ -97,12 +97,12 @@ setMethod("getStepwiseDivergence", signature = c(x = "ANY"),
         )
         # Calculate divergences
         res <- do.call(getDivergence, args)
-        # Add time difference
-        x <- args[["x"]]
-        reference <- args[["reference"]]
-        time_res <- .get_time_difference(x, time.col, reference)
+        # Get time difference
+        args[["time.col"]] <- time.col
+        time_res <- do.call(.get_time_difference, args)
         # Create a DF to return
-        res <- .convert_divergence_to_df(x, res, time_res, reference, ...)
+        args <- c(args, list(x_orig = x, res = res, time_res = time_res))
+        res <- do.call(.convert_divergence_to_df, args)
         return(res)
     }
 )

diff --git a/tests/testthat/test-getBaselineDivergence.R b/tests/testthat/test-getBaselineDivergence.R
@@ -2,6 +2,7 @@
 test_that("addBaselineDivergence output", {
     data(hitchip1006)
     tse <- hitchip1006
+    tse <- tse[, !duplicated(colData(tse)[, c("time", "subject")])]
     tse2 <- addBaselineDivergence(
         tse, group = "subject", time.col = "time",
         name = c("divergence_from_baseline", "time_from_baseline",
@@ -135,7 +136,7 @@ test_that(".get_reference_samples with different time intervals", {
 
 # Basic SummarizedExperiment for testing
 col_data <- DataFrame(
-    time = c(0, 1, 2, 1, 2, 0),
+    time = c(0, 1, 2, 4, 10, 3),
     group = c("A", "A", "A", "B", "B", "B"),
     row.names = c("Sample1", "Sample2",
         "Sample3", "Sample4", "Sample5", "Sample6"))
@@ -199,7 +200,8 @@ test_that(".get_reference_samples baseline", {
         colData(se), time.col = "time", group = "group",
         reference.method = "stepwise", time.interval = 1)
     expect_equal(stepwise, c(
-        NA, "Sample1", "Sample2", "Sample6", "Sample4", NA))
+        NA, "Sample1", "Sample2", "Sample6", "Sample4", NA),
+        check.attributes = FALSE)
 })
 
 # Time difference calculation
@@ -209,7 +211,7 @@ test_that(".get_time_difference calculates correct time diff", {
     colData(se2)[["ref"]] <- reference
     time_diffs <- .get_time_difference(
         se2, time.col = "time", reference = "ref")
-    expect_equal(time_diffs, c(-1, 1, 2, -1, NA, -1))
+    expect_equal(time_diffs, c(-1, 1, 2, 2, NA, -1))
 })
 
 # Convert divergence to DataFrame
@@ -225,9 +227,11 @@ test_that(".convert_divergence_to_df formats correctly", {
         assays = list(),
         colData = col_data
     )
+    colnames(se) <- se[["sam"]] <- paste0("sample", seq(1, 6))
     reference <- "reference"
     df <- .convert_divergence_to_df(
-        se, divergence, time_diff, reference,
+        x_orig = se, x = se, res = divergence, time_res = time_diff,
+        reference = reference, orig.sample.names = "sam",
         name = c("test_div", "test_time_diff", "test_reference_samples"))
     expect_s4_class(df, "DataFrame")
     expect_equal(colnames(df),
@@ -272,9 +276,46 @@ test_that(".convert_divergence_to_df with NA divergence values", {
         colData = col_data
     )
     reference <- "reference"
+    colnames(se) <- se[["sam"]] <- paste0("sample", seq(1, 6))
     df <- .convert_divergence_to_df(
-        se, divergence, time_diff, reference,
+        se, se, divergence, time_diff, reference, "sam",
         name = c("test_div", "test_time_diff", "test_reference_samples"))
     expect_s4_class(df, "DataFrame")
     expect_true(all(is.na(df$test_div[is.na(divergence)])))
 })
+
+# Test that the function works correctly with replicated samples
+test_that("getBaselineDivergence with replicated samples", {
+    tse <- makeTSE(nrow = 1000, ncol = 20)
+    assayNames(tse) <- "counts"
+    colData(tse)[["time"]] <- sample(c(1, 3, 6, 100), 20, replace = TRUE)
+    res <- getBaselineDivergence(
+        tse, time.col = "time", group = "group", method = "euclidean") |>
+        expect_warning()
+    res <- res[, c(1, 2)]
+    # For all samples, calculate divergence
+    sams <- colnames(tse)
+    ref <- lapply(sams, function(sam){
+        # Get data on sample
+        sam_dat <- colData(tse[, sam])
+        # Get its reference samples
+        group_dat <- colData(tse)[tse[["group"]] == sam_dat[["group"]], ]
+        ref_time <- sort(group_dat[["time"]])[[1]]
+        # Loop through each reference sample, calculate its distance to
+        # the sample, and take mean
+        ref_sams <- rownames(group_dat[ group_dat[["time"]] == ref_time, ])
+        ref_vals <- vapply(ref_sams, function(ref_sam){
+            dist(t( assay(tse, "counts")[, c(sam, ref_sam)] ),
+                 method = "euclidean")
+        }, numeric(1))
+        ref_vals <- mean(ref_vals)
+        # Return divergence and time difference
+        temp_res <- c(ref_vals, sam_dat[["time"]] - ref_time)
+        return(temp_res)
+    })
+    ref <- do.call(rbind, ref)
+    ref <- DataFrame(ref)
+    colnames(ref) <- colnames(res)
+    rownames(ref) <- rownames(res)
+    expect_equal(res, ref)
+})