Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replicated samples in divergence functions #102

Open
wants to merge 5 commits into
base: devel
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: miaTime
Type: Package
Title: Microbiome Time Series Analysis
Version: 0.99.2
Version: 0.99.3
Authors@R:
c(person(given = "Leo", family = "Lahti", role = c("aut"),
email = "[email protected]",
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ importFrom(dplyr,mutate)
importFrom(dplyr,summarize)
importFrom(dplyr,ungroup)
importFrom(tidyr,pivot_wider)
importFrom(tidyr,unnest)
3 changes: 3 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
Changes in version 0.99.3
+ Handle repeated samples in divergence functions by calculating average

Changes in version 0.99.1
+ Added bimodality functions

Expand Down
119 changes: 96 additions & 23 deletions R/getBaselineDivergence.R
Original file line number Diff line number Diff line change
Expand Up @@ -135,8 +135,7 @@ setMethod("getBaselineDivergence", signature = c(x = "SummarizedExperiment"),
########################### INPUT CHECK END ############################
# Add baseline samples to colData
args <- .add_reference_samples_to_coldata(
x, time.col, group, reference, assay.type, method,
reference.method = "baseline", ...)
x, time.col, group, reference, reference.method = "baseline", ...)
# Create an argument list. Do not include altexp as it is already taken
# into account.
args <- c(
Expand All @@ -146,12 +145,12 @@ setMethod("getBaselineDivergence", signature = c(x = "SummarizedExperiment"),
)
# Calculate divergences
res <- do.call(getDivergence, args)
# Add time difference
x <- args[["x"]]
reference <- args[["reference"]]
time_res <- .get_time_difference(x, time.col, reference)
# Get time difference
args[["time.col"]] <- time.col
time_res <- do.call(.get_time_difference, args)
# Create a DF to return
res <- .convert_divergence_to_df(x, res, time_res, reference, ...)
args <- c(args, list(x_orig = x, res = res, time_res = time_res))
res <- do.call(.convert_divergence_to_df, args)
return(res)
}
)
Expand Down Expand Up @@ -231,7 +230,16 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),
if( is.null(reference) ){
ref <- .get_reference_samples(
cd, time.col, time.interval, group, reference.method)
cd[[ref.name]] <- ref
# If the data includes repeated timepoints, the data has multiple
# reference samples for each sample. That is why we make sure that
# the data is expanded accordingly.
if( length(ref) != nrow(cd) ){
x <- x[, names(ref)]
cd <- cd[names(ref), ]
} else{
ref <- ref[ rownames(cd) ]
}
cd[[ref.name]] <- unname(ref)
reference <- ref.name
}
# If reference was specified, check that it is specifying samples
Expand Down Expand Up @@ -270,16 +278,23 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),

# Add modified colData back to TreeSE
colData(x) <- cd
# Add original sample names (if there are duplicated timepoints, certain
# samples are duplicated). And make column names unique.
orig_sample_col <- "original_sample_names"
colData(x)[[orig_sample_col]] <- colnames(x)
colnames(x) <- make.unique(colnames(x))
# The returned value includes the TreeSE along with reference
# column name because it might be that we have modified it.
res <- list(x = x, reference = reference)
res <- list(
x = x, reference = reference, orig.sample.names = orig_sample_col)
return(res)
}

# This function returns the first sample for each group by default.
# Alternatively, it returns the previous ith sample for each sample in each
# group.
#' @importFrom dplyr group_by mutate arrange ungroup lag
#' @importFrom tidyr unnest
.get_reference_samples <- function(
df, time.col, time.interval, group, reference.method){
# This following line is to suppress "no visible binding for" messages
Expand All @@ -288,8 +303,8 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),

rowname_col <- "temporary_rownames_column"
reference_col <- "temporary_reference_column"
# Store rownames and add rownames as a column
df[[rowname_col]] <- original_order <- rownames(df)
# Add rownames as a column
df[[rowname_col]] <- rownames(df)
# Convert to data.frame and group data based on group
df <- df |>
as.data.frame() |>
Expand All @@ -299,30 +314,70 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),
if( reference.method == "baseline" ){
# Find first timepoint within a group
df <- df |>
mutate(!!reference_col :=
.data[[rowname_col]][which.min(.data[[time.col]])])
mutate(!!reference_col := .get_baseline_samples(
.data, time.col, rowname_col))
} else if( reference.method == "stepwise" ){
# For each sample, get the previous ith sample.
# For each subject, get previous sample based on time.
df <- df |>
mutate(!!reference_col := lag(
.data[[rowname_col]], n = time.interval,
order_by = .data[[time.col]]))
mutate(!!reference_col := .get_previous_samples(
.data, time.col, rowname_col, time.interval))
}
# Give warning if the data includes replicated timepoints.
if( any(lengths(df[[reference_col]]) > 1L) ){
warning("Some samples are associated with multiple reference samples. ",
"In these cases, the reference time point includes multiple ",
"samples, and their average is used.", call. = FALSE)
}
# Ungroup to revert to the original structure and convert to DataFrame
# Ungroup to revert to the original structure, make sure that
# each cell includes single value (takes action when there are repeated
# timepoints) and convert to DataFrame
df <- df |>
ungroup() |>
unnest(cols = .data[[reference_col]]) |>
DataFrame()
# Put the data into original order
rownames(df) <- df[[rowname_col]]
df <- df[original_order, ]
# Get only reference samples
res <- df[[reference_col]]
names(res) <- df[[rowname_col]]
return(res)
}

# This function gets df as input. The data must be already grouped if grouping
# exists. For each sample, this function finds baseline timepoint. If there
# are multiple samples from baseline timepoint, all are returned.
.get_baseline_samples <- function(.data, time.col, rowname_col){
# Get timepoints and corresponding baseline timepoints
time_points <- unique(sort(.data[[time.col]]))
baseline_time <- min(time_points, na.rm = TRUE)
baseline_time <- rep(baseline_time, length(.data[[time.col]]))
# Split sample names so that they are grouped into timepoints
timepoint_samples <- split(.data[[rowname_col]], .data[[time.col]])
# For each sample, assign baseline samples
baseline_samples <- timepoint_samples[
match(baseline_time, names(timepoint_samples)) ]
return(baseline_samples)
}

# This function gets df as input. The data must be already grouped if grouping
# exists. For each sample, this function finds previous timepoint. If there
# are multiple samples from previous timepoint, all are returned.
#' @importFrom dplyr lag
.get_previous_samples <- function(.data, time.col, rowname_col, time.interval){
# Get timepoints and corresponding previous timepoints
current_time <- unique(sort(.data[[time.col]]))
prev_time <- lag(current_time, n = time.interval)
prev_time <- prev_time[match(.data[[time.col]], current_time)]
# Split sample names so that they are grouped into timepoints
timepoint_samples <- split(.data[[rowname_col]], .data[[time.col]])
# For each sample, assign previous samples
prev_samples <- timepoint_samples[
match(prev_time, names(timepoint_samples)) ]
prev_samples[ lengths(prev_samples) == 0L ] <- NA_character_
return(prev_samples)
}

# This function get time difference between a sample and its reference sample
.get_time_difference <- function(x, time.col, reference){
.get_time_difference <- function(x, time.col, reference, ...){
# Get timepoints
time_point <- x[[time.col]]
# Get reference time points
Expand All @@ -333,13 +388,31 @@ setMethod("addBaselineDivergence", signature = c(x = "SummarizedExperiment"),
}

# This function converts time divergence results to DF object
#' @importFrom dplyr summarize
.convert_divergence_to_df <- function(
x, res, time_res, reference,
x_orig, x, res, time_res, orig.sample.names, reference,
name = c("divergence", "time_diff", "ref_samples"), ...){
# Validate 'name' param
temp <- .check_input(name, list("character vector"), length = 3L)
#
df <- DataFrame(res, time_res, x[[reference]], row.names = colnames(x))
df <- data.frame(res, time_res, sample = x[[orig.sample.names]])
# If data includes replicated samples, each time point might have multiple
# samples. We take average of these repeated time points.
df <- df |>
group_by(sample) |>
summarize(res = mean(res, ...), time_res = mean(time_res, ...)) |>
DataFrame()
# Add reference samples
ref_samples <- split(x[[reference]], x[[orig.sample.names]]) |> unname()
if( all(lengths(ref_samples) == 1L) ){
ref_samples <- unlist(ref_samples)
}
df[["reference_samples"]] <- ref_samples
# Wrangle names
rownames(df) <- df[["sample"]]
df[["sample"]] <- NULL
colnames(df) <- name
# Ensure that the order is correct, i.e., matches with the original input
df <- df[colnames(x_orig), ]
return(df)
}
10 changes: 5 additions & 5 deletions R/getStepwiseDivergence.R
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,12 @@ setMethod("getStepwiseDivergence", signature = c(x = "ANY"),
)
# Calculate divergences
res <- do.call(getDivergence, args)
# Add time difference
x <- args[["x"]]
reference <- args[["reference"]]
time_res <- .get_time_difference(x, time.col, reference)
# Get time difference
args[["time.col"]] <- time.col
time_res <- do.call(.get_time_difference, args)
# Create a DF to return
res <- .convert_divergence_to_df(x, res, time_res, reference, ...)
args <- c(args, list(x_orig = x, res = res, time_res = time_res))
res <- do.call(.convert_divergence_to_df, args)
return(res)
}
)
Expand Down
51 changes: 46 additions & 5 deletions tests/testthat/test-getBaselineDivergence.R
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
test_that("addBaselineDivergence output", {
data(hitchip1006)
tse <- hitchip1006
tse <- tse[, !duplicated(colData(tse)[, c("time", "subject")])]
tse2 <- addBaselineDivergence(
tse, group = "subject", time.col = "time",
name = c("divergence_from_baseline", "time_from_baseline",
Expand Down Expand Up @@ -135,7 +136,7 @@ test_that(".get_reference_samples with different time intervals", {

# Basic SummarizedExperiment for testing
col_data <- DataFrame(
time = c(0, 1, 2, 1, 2, 0),
time = c(0, 1, 2, 4, 10, 3),
group = c("A", "A", "A", "B", "B", "B"),
row.names = c("Sample1", "Sample2",
"Sample3", "Sample4", "Sample5", "Sample6"))
Expand Down Expand Up @@ -199,7 +200,8 @@ test_that(".get_reference_samples baseline", {
colData(se), time.col = "time", group = "group",
reference.method = "stepwise", time.interval = 1)
expect_equal(stepwise, c(
NA, "Sample1", "Sample2", "Sample6", "Sample4", NA))
NA, "Sample1", "Sample2", "Sample6", "Sample4", NA),
check.attributes = FALSE)
})

# Time difference calculation
Expand All @@ -209,7 +211,7 @@ test_that(".get_time_difference calculates correct time diff", {
colData(se2)[["ref"]] <- reference
time_diffs <- .get_time_difference(
se2, time.col = "time", reference = "ref")
expect_equal(time_diffs, c(-1, 1, 2, -1, NA, -1))
expect_equal(time_diffs, c(-1, 1, 2, 2, NA, -1))
})

# Convert divergence to DataFrame
Expand All @@ -225,9 +227,11 @@ test_that(".convert_divergence_to_df formats correctly", {
assays = list(),
colData = col_data
)
colnames(se) <- se[["sam"]] <- paste0("sample", seq(1, 6))
reference <- "reference"
df <- .convert_divergence_to_df(
se, divergence, time_diff, reference,
x_orig = se, x = se, res = divergence, time_res = time_diff,
reference = reference, orig.sample.names = "sam",
name = c("test_div", "test_time_diff", "test_reference_samples"))
expect_s4_class(df, "DataFrame")
expect_equal(colnames(df),
Expand Down Expand Up @@ -272,9 +276,46 @@ test_that(".convert_divergence_to_df with NA divergence values", {
colData = col_data
)
reference <- "reference"
colnames(se) <- se[["sam"]] <- paste0("sample", seq(1, 6))
df <- .convert_divergence_to_df(
se, divergence, time_diff, reference,
se, se, divergence, time_diff, reference, "sam",
name = c("test_div", "test_time_diff", "test_reference_samples"))
expect_s4_class(df, "DataFrame")
expect_true(all(is.na(df$test_div[is.na(divergence)])))
})

# Test that the function works correctly with replicated samples
test_that("getBaselineDivergence with replicated samples", {
tse <- makeTSE(nrow = 1000, ncol = 20)
assayNames(tse) <- "counts"
colData(tse)[["time"]] <- sample(c(1, 3, 6, 100), 20, replace = TRUE)
res <- getBaselineDivergence(
tse, time.col = "time", group = "group", method = "euclidean") |>
expect_warning()
res <- res[, c(1, 2)]
# For all samples, calculate divergence
sams <- colnames(tse)
ref <- lapply(sams, function(sam){
# Get data on sample
sam_dat <- colData(tse[, sam])
# Get its reference samples
group_dat <- colData(tse)[tse[["group"]] == sam_dat[["group"]], ]
ref_time <- sort(group_dat[["time"]])[[1]]
# Loop through each reference sample, calculate its distance to
# the sample, and take mean
ref_sams <- rownames(group_dat[ group_dat[["time"]] == ref_time, ])
ref_vals <- vapply(ref_sams, function(ref_sam){
dist(t( assay(tse, "counts")[, c(sam, ref_sam)] ),
method = "euclidean")
}, numeric(1))
ref_vals <- mean(ref_vals)
# Return divergence and time difference
temp_res <- c(ref_vals, sam_dat[["time"]] - ref_time)
return(temp_res)
})
ref <- do.call(rbind, ref)
ref <- DataFrame(ref)
colnames(ref) <- colnames(res)
rownames(ref) <- rownames(res)
expect_equal(res, ref)
})
Loading
Loading