diff --git a/.github/.dockstore.yml b/.github/.dockstore.yml index 6c105e73c..f5257f23b 100644 --- a/.github/.dockstore.yml +++ b/.github/.dockstore.yml @@ -24,6 +24,7 @@ workflows: filters: branches: - main + - kj/44_outlier_sample_removal tags: - /.*/ @@ -51,6 +52,7 @@ workflows: filters: branches: - main + - kj/44_outlier_sample_removal tags: - /.*/ diff --git a/src/RdTest/RdTest.R b/src/RdTest/RdTest.R index 5557e03de..75b0e9859 100755 --- a/src/RdTest/RdTest.R +++ b/src/RdTest/RdTest.R @@ -109,7 +109,9 @@ option_list <- list( make_option(c("-l", "--SampleExcludeList"), type="character", default=NULL, help="Optional:Single column file with list of samples to exclude", metavar="character"), make_option(c("-w", "--SampleIncludeList"), type="character", default=NULL, - help="Optional:Single column file with list of samples to include", metavar="character") + help="Optional:Single column file with list of samples to include", metavar="character"), + make_option(c("--outlierSampleIds"), type="character", default=NULL, + help="Optional:Path to file containing outlier sample IDs", metavar="character") ) opt_parser <- OptionParser(option_list = option_list) @@ -507,13 +509,14 @@ specified_cnv <- function(cnv_matrix, sampleIDs, cnvID, chr, start, end, cnvtype ##make sure first four columns are not modified## columnswithsamp <- c(columnswithsamp, 1, 2, 3, 4) genotype_matrix[1,-columnswithsamp] = 2 - } + } + return(genotype_matrix) } ###Kmeans multi-CNV Test## ##interval is measured by predicted copy state## -kMeans <-function(cnv_matrix,chr,start,end,cnvID,Kinterval,Kintervalstart,Kintervalend,outFolder,outputname) +kMeans <- function(cnv_matrix,chr,start,end,cnvID,Kinterval,Kintervalstart,Kintervalend,outFolder,outputname) { samplenames <- rownames(cnv_matrix) #create Eucledian matrix### @@ -931,7 +934,7 @@ plotJPG <- function(genotype_matrix,cnv_matrix,chr,start,end,cnvID,sampleIDs,out } ##Provide genotype for VCF format## -genotype<- function(cnv_matrix,genotype_matrix,refgeno,chr,start,end,cnvID,sampleIDs,cnvtype,outFolder,outputname,plot_cnvmatrix) +genotype <- function(cnv_matrix,genotype_matrix,refgeno,chr,start,end,cnvID,sampleIDs,cnvtype,outFolder,outputname,plot_cnvmatrix) { ##get depth intensities## cnv_median <-c(create_groups(genotype_matrix, cnv_matrix)$Control,create_groups(genotype_matrix, cnv_matrix)$Treat) @@ -1107,12 +1110,11 @@ runRdTest<-function(bed) end = round(center + (sizefilter/2)) } + ##Make sure region is in tabix## if (end - start <= 0 ) { end=start+1 } - ##Make sure region is in tabix## - ##Get Intesity Data## if (exists("poorbincov")) { @@ -1177,7 +1179,18 @@ runRdTest<-function(bed) idsforsearch<-rownames(cnv_matrix) samplestokeep<-match(unlist(strsplit(sampleIDs,",")),idsforsearch) sampleIDs<-idsforsearch[na.omit(samplestokeep)] + + #Exclude outlier samples only if non-outlier samples exist + if (!is.null(opt$outlierSampleIds)) { + outlier_ids <- readLines(opt$outlierSampleIds) + non_outlier_samples <- setdiff(sampleIDs, outlier_ids) + if (length(non_outlier_samples) > 0) { + sampleIDs <- non_outlier_samples + cnv_matrix <- cnv_matrix[!(rownames(cnv_matrix) %in% outlier_ids), , drop = FALSE] + } + } samplesPrior <-unlist(strsplit(as.character(sampleIDs),split=",")) + ##Run K Test if Specified## if (opt$runKmeans == TRUE) { k_matrix<-kMeans(cnv_matrix,chr,start,end,cnvID,Kinterval,Kintervalstart,Kintervalend,outFolder,outputname) diff --git a/src/sv-pipeline/02_evidence_assessment/02e_metric_aggregation/scripts/aggregate.py b/src/sv-pipeline/02_evidence_assessment/02e_metric_aggregation/scripts/aggregate.py index 59d9f62db..a3ed803ae 100755 --- a/src/sv-pipeline/02_evidence_assessment/02e_metric_aggregation/scripts/aggregate.py +++ b/src/sv-pipeline/02_evidence_assessment/02e_metric_aggregation/scripts/aggregate.py @@ -124,12 +124,17 @@ def fam_info_readin(fam_file): return [fam, samp, fa, mo] -def process_metadata(variants, bed=False, batch_list=None): +def process_metadata(variants, bed=False, batch_list=None, outlier_sample_ids=None): if bed: samples = [s.strip() for s in batch_list.readlines()] else: samples = list(variants.header.samples) + outlier_set = set() + if outlier_sample_ids: + with open(outlier_sample_ids, 'r') as f: + outlier_set = set(line.strip() for line in f) + # parents = [s for s in samples if _is_parent(s)] # children = [s for s in samples if _is_child(s)] # n_parents = len(parents) @@ -192,23 +197,11 @@ def process_metadata(variants, bed=False, batch_list=None): # Flag variants specific to outlier samples metadata['is_outlier_specific'] = False - for svtype in 'DEL DUP INV BND INS'.split(): - counts = pd.DataFrame.from_dict(called_counts[svtype], orient='index')\ - .reset_index()\ - .rename(columns={'index': 'sample', 0: 'var_count'}) - if counts.shape[0] == 0: - continue - - q1 = counts.var_count.quantile(0.25) - q3 = counts.var_count.quantile(0.75) - thresh = q3 + 1.5 * (q3 - q1) - outliers = counts.loc[counts.var_count >= thresh, 'sample'].values - - flagged = [] - for var_name, samples in called_samples[svtype].items(): - if samples.issubset(outliers): - flagged.append(var_name) - metadata.loc[metadata.name.isin(flagged), 'is_outlier_specific'] = True + if len(outlier_set) > 0: + for variants in called_samples.values(): + for name, called in variants.items(): + if called and called.issubset(outlier_set): + metadata.loc[metadata.name == name, 'is_outlier_specific'] = True for col in 'start end svsize'.split(): metadata[col] = metadata[col].astype(int) @@ -273,6 +266,7 @@ def main(): parser.add_argument('-b', '--BAFtest') parser.add_argument('-s', '--SRtest') parser.add_argument('-p', '--PEtest') + parser.add_argument('-o', '--outlier-sample-ids') parser.add_argument('--batch-list', type=argparse.FileType('r')) parser.add_argument('--segdups', required=True) parser.add_argument('--rmsk', required=True) @@ -290,7 +284,11 @@ def main(): variants = pysam.VariantFile(args.variants) dtypes = 'PE SR RD BAF'.split() - metadata = process_metadata(variants, args.bed, args.batch_list) + outlier_sample_ids = None + if args.outlier_sample_ids: + outlier_sample_ids = args.outlier_sample_ids + + metadata = process_metadata(variants, args.bed, args.batch_list, outlier_sample_ids) # Calculate segdup coverage bt = pbt.BedTool.from_dataframe(metadata['chrom start end'.split()]) diff --git a/src/svtk/svtk/cli/baf_test.py b/src/svtk/svtk/cli/baf_test.py index a8e3edda7..87514e28a 100644 --- a/src/svtk/svtk/cli/baf_test.py +++ b/src/svtk/svtk/cli/baf_test.py @@ -9,7 +9,7 @@ ########## -def preprocess(chrom, start, end, tbx, samples, window=None): +def preprocess(chrom, start, end, tbx, samples, window=None, called_samples=None, outlier_sample_ids=None): """ Report normalized BAFs in a set of samples across a desired region. @@ -33,55 +33,58 @@ def preprocess(chrom, start, end, tbx, samples, window=None): called_bafs : pd.DataFrame BAF of each called SNP within CNV """ - - # Load and filter SNP sites if window is None: window = end - start - # records = tbx.fetch(chrom, start - window, end + window,parser=pysam.asTuple()) - # sites = deque() + bafs = deque() if window < 1000000: for record in tbx.fetch(chrom, max(1, start - window), end + window, parser=pysam.asTuple()): bafs.append(np.array(record)) else: - for record in tbx.fetch(chrom, max(1, start - 1000000), start, parser=pysam.asTuple()): bafs.append(np.array(record)) for record in tbx.fetch(chrom, (start + end) // 2 - 500000, (start + end) // 2 + 500000, parser=pysam.asTuple()): bafs.append(np.array(record)) for record in tbx.fetch(chrom, end, end + 1000000, parser=pysam.asTuple()): bafs.append(np.array(record)) - bafs = np.array(bafs) - # if bafs.shape[0] == 0: - # return 0,0 - bafs = pd.DataFrame(bafs) + + bafs = pd.DataFrame(np.array(bafs)) if bafs.empty: - return bafs, bafs + return bafs, bafs, called_samples bafs.columns = ['chr', 'pos', 'baf', 'sample'] - bafs = bafs[bafs['sample'].isin(samples)] - # print(bafs) + + # Exclude outlier samples from all samples only if non-outlier samples exist + bafs_no_outliers = bafs[bafs['sample'].isin(samples) & ~bafs['sample'].isin(outlier_sample_ids)] + if not bafs_no_outliers.empty: + bafs = bafs_no_outliers + + # Exclude outlier samples from called samples only if non-outlier samples exist + non_outlier_called_samples = [s for s in called_samples if s not in outlier_sample_ids] + if non_outlier_called_samples: + retained_samples = non_outlier_called_samples + else: + retained_samples = called_samples + if bafs.empty: - return bafs, bafs + return bafs, bafs, retained_samples + bafs['pos'] = bafs.pos.astype(int) bafs['baf'] = bafs.baf.astype(float) - # print(bafqs) bafs.loc[bafs.pos <= start, 'region'] = 'before' bafs.loc[bafs.pos >= end, 'region'] = 'after' bafs.loc[(bafs.pos > start) & (bafs.pos < end), 'region'] = 'inside' - het_counts = bafs.groupby('sample region'.split()).size( - ).reset_index().rename(columns={0: 'count'}) - het_counts = het_counts.pivot_table( - values='count', index='sample', columns='region', fill_value=0) + het_counts = bafs.groupby('sample region'.split()).size().reset_index().rename(columns={0: 'count'}) + het_counts = het_counts.pivot_table(values='count', index='sample', columns='region', fill_value=0) het_counts = het_counts.reindex(samples).fillna(0).astype(int) cols = 'before inside after sample'.split() het_counts = het_counts.reset_index() for colname in cols: if colname not in het_counts.columns: het_counts[colname] = 0 - # het_counts = het_counts.reset_index()[cols] + # Report BAF for variants inside CNV called_bafs = bafs.loc[bafs.region == 'inside'].copy() - return het_counts, called_bafs + return het_counts, called_bafs, retained_samples def main(argv): @@ -93,7 +96,7 @@ def main(argv): parser.add_argument('file', help='Compiled snp file') parser.add_argument('-b', '--batch',) parser.add_argument('--index', help='Tabix index for remote bed') - # help='Samples') + parser.add_argument('--outlier-sample-ids', help='Path to file containing outlier sample IDs') # Print help if no arguments specified if len(argv) == 0: @@ -125,6 +128,11 @@ def main(argv): raise Exception('Must provide tabix index with remote URL') tbx = pysam.TabixFile(args.file) + outlier_samples = set() + if args.outlier_sample_ids: + with open(args.outlier_sample_ids, 'r') as f: + outlier_samples = set(line.strip() for line in f) + # this is necessary to avoid stochasticity in calculation of KS statistic np.random.seed(0) random_state = np.random.RandomState(0) @@ -141,8 +149,11 @@ def main(argv): samplelist = samples.split(',') type = dat[5] try: - het_counts, called_bafs = preprocess( - chrom, start, end, tbx, samples=splist) + het_counts, called_bafs, samplelist = preprocess( + chrom, start, end, tbx, samples=splist, + outlier_sample_ids=outlier_samples, + called_samples=samplelist + ) except ValueError: het_counts = pd.DataFrame() called_bafs = pd.DataFrame() @@ -152,7 +163,12 @@ def main(argv): min(end - start, 1000000), random_state=random_state) KS = KS2sample(called_bafs, samplelist) ks, ksp = KS.test(samplelist) - mean, delp = Del.Ttest(samplelist) + try: + mean, delp = Del.Ttest(samplelist) + except Exception: + print(samplelist) + print(het_counts) + raise Exception statis = Del.stats(samplelist) line = chrom + '\t' + str(start) + '\t' + str(end) + '\t' + id + '\t' + samples + '\t' + type + '\t' + str( mean) + ',' + str(delp) + "\t" + str(ks) + ',' + str(ksp) + '\t' + statis diff --git a/src/svtk/svtk/cli/pesr_test.py b/src/svtk/svtk/cli/pesr_test.py index ec61221bf..10ccb2969 100644 --- a/src/svtk/svtk/cli/pesr_test.py +++ b/src/svtk/svtk/cli/pesr_test.py @@ -51,6 +51,8 @@ def sr_test(argv): 'Same format as RdTest, one column per sample.') parser.add_argument('--log', action='store_true', default=False, help='Print progress log to stderr.') + parser.add_argument('--outlier-sample-ids', default=None, + help='Path to file containing outlier sample IDs.') # Print help if no arguments specified if len(argv) == 0: @@ -87,9 +89,14 @@ def sr_test(argv): else: medians = None + outlier_sample_ids = None + if args.outlier_sample_ids: + outlier_sample_ids = args.outlier_sample_ids + runner = SRTestRunner(vcf, countfile, fout, args.background, common=args.common, window=args.window, ins_window=args.insertion_window, - whitelist=whitelist, medians=medians, log=args.log) + whitelist=whitelist, medians=medians, log=args.log, + outlier_sample_ids=outlier_sample_ids) runner.run() @@ -126,6 +133,8 @@ def pe_test(argv): 'Same format as RdTest, one column per sample.') parser.add_argument('--log', action='store_true', default=False, help='Print progress log to stderr.') + parser.add_argument('--outlier-sample-ids', default=None, + help='Path to file containing outlier sample IDs.') if len(argv) == 0: parser.print_help() @@ -163,8 +172,12 @@ def pe_test(argv): else: medians = None - runner = PETestRunner(vcf, discfile, fout, args.background, args.common, - args.window_in, args.window_out, whitelist, medians=medians, log=args.log) + outlier_sample_ids = None + if args.outlier_sample_ids: + outlier_sample_ids = args.outlier_sample_ids + + runner = PETestRunner(vcf, discfile, fout, args.background, args.common, args.window_in, args.window_out, + whitelist, medians=medians, log=args.log, outlier_sample_ids=outlier_sample_ids) runner.run() diff --git a/src/svtk/svtk/pesr/pe_test.py b/src/svtk/svtk/pesr/pe_test.py index bde88cb59..a9a8e61f9 100644 --- a/src/svtk/svtk/pesr/pe_test.py +++ b/src/svtk/svtk/pesr/pe_test.py @@ -128,8 +128,8 @@ def _get_coords(pos, strand): class PETestRunner(PESRTestRunner): def __init__(self, vcf, discfile, fout, n_background=160, common=False, - window_in=50, window_out=500, - whitelist=None, blacklist=None, medians=None, log=False): + window_in=50, window_out=500, whitelist=None, blacklist=None, + medians=None, log=False, outlier_sample_ids=None): """ vcf : pysam.VariantFile discfile : pysam.TabixFile @@ -138,7 +138,7 @@ def __init__(self, vcf, discfile, fout, n_background=160, common=False, window_out, medians=medians) self.fout = fout - super().__init__(vcf, common, n_background, whitelist, blacklist, log) + super().__init__(vcf, common, n_background, whitelist, blacklist, log, outlier_sample_ids) def test_record(self, record): if not self._strand_check(record): diff --git a/src/svtk/svtk/pesr/pesr_test.py b/src/svtk/svtk/pesr/pesr_test.py index 5668f9147..824b7c1ae 100644 --- a/src/svtk/svtk/pesr/pesr_test.py +++ b/src/svtk/svtk/pesr/pesr_test.py @@ -93,7 +93,7 @@ def null_score(self, null_val=0.0): class PESRTestRunner: def __init__(self, vcf, common=False, n_background=160, whitelist=None, blacklist=None, - log=False): + log=False, outlier_sample_ids=None): self.vcf = vcf self.common = common @@ -105,6 +105,12 @@ def __init__(self, vcf, common=False, n_background=160, whitelist=None, blacklis self.log = log + outlier_samples = set() + if outlier_sample_ids: + with open(outlier_sample_ids, 'r') as f: + outlier_samples = set([line.strip() for line in f]) + self.outlier_sample_ids = outlier_samples + def run(self): if self.log: start = datetime.datetime.now() @@ -134,7 +140,12 @@ def test_record(self, record): def choose_background(self, record, whitelist=None, blacklist=None): # Select called and background samples called = svu.get_called_samples(record) - background = [s for s in self.samples if s not in called] + background = [s for s in self.samples if s not in called and s not in self.outlier_sample_ids] + + # Exclude outlier samples only if non-outlier samples exist + non_outlier_called = [s for s in called if s not in self.outlier_sample_ids] + if len(non_outlier_called) > 0: + called = non_outlier_called # Permit override of specified white/blacklists whitelist = whitelist if whitelist is not None else self.whitelist diff --git a/src/svtk/svtk/pesr/sr_test.py b/src/svtk/svtk/pesr/sr_test.py index 612127de3..91fd3f682 100644 --- a/src/svtk/svtk/pesr/sr_test.py +++ b/src/svtk/svtk/pesr/sr_test.py @@ -183,7 +183,7 @@ def _test_coord(self, coord, strand, chrom, samples, background, left_boundary, class SRTestRunner(PESRTestRunner): def __init__(self, vcf, countfile, fout, n_background=160, common=False, window=100, ins_window=50, - whitelist=None, blacklist=None, medians=None, log=False): + whitelist=None, blacklist=None, medians=None, log=False, outlier_sample_ids=None): """ vcf : pysam.VariantFile countfile : pysam.TabixFile @@ -197,7 +197,7 @@ def __init__(self, vcf, countfile, fout, n_background=160, common=False, window= self.srtest = SRTest(countfile, common=common, window=window, ins_window=ins_window, medians=medians) self.fout = fout - super().__init__(vcf, common, n_background, whitelist, blacklist, log) + super().__init__(vcf, common, n_background, whitelist, blacklist, log, outlier_sample_ids) def test_record(self, record): called, background = self.choose_background(record) diff --git a/wdl/BAFTest.wdl b/wdl/BAFTest.wdl index 864b65155..98d0b7aff 100644 --- a/wdl/BAFTest.wdl +++ b/wdl/BAFTest.wdl @@ -14,6 +14,7 @@ workflow BAFTest { Int split_size File autosome_contigs File ref_dict + File? outlier_sample_ids String linux_docker String sv_pipeline_docker @@ -36,6 +37,7 @@ workflow BAFTest { split_size = split_size, chrom = autosome[0], ref_dict = ref_dict, + outlier_sample_ids = outlier_sample_ids, linux_docker = linux_docker, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_baftest = runtime_attr_baftest, diff --git a/wdl/BAFTestChromosome.wdl b/wdl/BAFTestChromosome.wdl index c4f42da77..635d5bf2d 100644 --- a/wdl/BAFTestChromosome.wdl +++ b/wdl/BAFTestChromosome.wdl @@ -13,6 +13,7 @@ workflow BAFTestChromosome { Int split_size Int? suffix_len File ref_dict + File? outlier_sample_ids String linux_docker String sv_pipeline_docker @@ -45,6 +46,7 @@ workflow BAFTestChromosome { samples = samples, prefix = basename(split), batch = batch, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_baftest } @@ -72,6 +74,7 @@ task BAFTest { Array[String] samples String prefix String batch + File? outlier_sample_ids String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -122,8 +125,8 @@ task BAFTest { bgzip local.BAF.txt tabix -0 -s1 -b2 -e2 local.BAF.txt.gz fi - - svtk baf-test ~{bed} local.BAF.txt.gz --batch batch.key > ~{prefix}.metrics + + svtk baf-test ~{bed} local.BAF.txt.gz --batch batch.key ~{if defined(outlier_sample_ids) then "--outlier-sample-ids ~{outlier_sample_ids}" else ""} > ~{prefix}.metrics >>> runtime { diff --git a/wdl/GenerateBatchMetrics.wdl b/wdl/GenerateBatchMetrics.wdl index ab8cafb10..db015f885 100644 --- a/wdl/GenerateBatchMetrics.wdl +++ b/wdl/GenerateBatchMetrics.wdl @@ -42,6 +42,7 @@ workflow GenerateBatchMetrics { # Run module metrics workflow at the end - on by default Boolean? run_module_metrics File? primary_contigs_list # required if run_module_metrics = true + File? outlier_sample_ids # sample IDs to exclude from training String sv_pipeline_docker String sv_base_mini_docker @@ -129,6 +130,7 @@ workflow GenerateBatchMetrics { male_samples = GetSampleLists.male_samples, female_samples = GetSampleLists.female_samples, male_only_variant_ids = GetMaleOnlyVariantIDs.male_only_variant_ids, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, linux_docker = linux_docker, runtime_attr_rdtest = runtime_attr_rdtest, @@ -147,6 +149,7 @@ workflow GenerateBatchMetrics { algorithm = algorithm, batch = batch, samples = GetSampleIdsFromVcf.out_array, + outlier_sample_ids = outlier_sample_ids, linux_docker = linux_docker, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_baftest = runtime_attr_baftest, @@ -175,6 +178,7 @@ workflow GenerateBatchMetrics { male_only_variant_ids = GetMaleOnlyVariantIDs.male_only_variant_ids, run_common = true, common_cnv_size_cutoff = common_cnv_size_cutoff, + outlier_sample_ids = outlier_sample_ids, sv_base_mini_docker = sv_base_mini_docker, linux_docker = linux_docker, sv_pipeline_docker = sv_pipeline_docker, @@ -203,6 +207,7 @@ workflow GenerateBatchMetrics { female_samples = GetSampleLists.female_samples, male_only_variant_ids = GetMaleOnlyVariantIDs.male_only_variant_ids, common_cnv_size_cutoff = common_cnv_size_cutoff, + outlier_sample_ids = outlier_sample_ids, sv_base_mini_docker = sv_base_mini_docker, linux_docker = linux_docker, sv_pipeline_docker = sv_pipeline_docker, @@ -222,6 +227,7 @@ workflow GenerateBatchMetrics { baftest = BAFTest.baftest, segdups = segdups, rmsk = rmsk, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_aggregate_tests } @@ -241,6 +247,7 @@ workflow GenerateBatchMetrics { srtest = SRTest.srtest_common, segdups = segdups, rmsk = rmsk, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_aggregate_tests } @@ -393,6 +400,7 @@ task AggregateTests { File? srtest File segdups File rmsk + File? outlier_sample_ids String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -418,6 +426,7 @@ task AggregateTests { ~{if defined(baftest) then "-b ~{baftest}" else "" } \ ~{if defined(petest) then "-p ~{petest}" else "" } \ ~{if defined(srtest) then "-s ~{srtest}" else "" } \ + ~{if defined(outlier_sample_ids) then "-o ~{outlier_sample_ids}" else ""} \ --segdups ~{segdups} \ --rmsk ~{rmsk} \ aggregated.metrics diff --git a/wdl/PETest.wdl b/wdl/PETest.wdl index 38bb00913..af8daeb45 100644 --- a/wdl/PETest.wdl +++ b/wdl/PETest.wdl @@ -20,6 +20,7 @@ workflow PETest { File male_only_variant_ids File samples Int common_cnv_size_cutoff + File? outlier_sample_ids String sv_base_mini_docker String linux_docker @@ -52,6 +53,7 @@ workflow PETest { allosome = false, ref_dict = ref_dict, common_cnv_size_cutoff = common_cnv_size_cutoff, + outlier_sample_ids = outlier_sample_ids, sv_base_mini_docker = sv_base_mini_docker, linux_docker = linux_docker, sv_pipeline_docker = sv_pipeline_docker, @@ -81,6 +83,7 @@ workflow PETest { allosome = true, ref_dict = ref_dict, common_cnv_size_cutoff = common_cnv_size_cutoff, + outlier_sample_ids = outlier_sample_ids, sv_base_mini_docker = sv_base_mini_docker, linux_docker = linux_docker, sv_pipeline_docker = sv_pipeline_docker, diff --git a/wdl/PETestChromosome.wdl b/wdl/PETestChromosome.wdl index 360db4bf0..848d9ff58 100644 --- a/wdl/PETestChromosome.wdl +++ b/wdl/PETestChromosome.wdl @@ -20,6 +20,7 @@ workflow PETestChromosome { File samples Boolean allosome Int common_cnv_size_cutoff + File? outlier_sample_ids String sv_base_mini_docker String linux_docker @@ -56,6 +57,7 @@ workflow PETestChromosome { prefix = basename(split), ref_dict = ref_dict, common_model = false, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_petest } @@ -70,6 +72,7 @@ workflow PETestChromosome { prefix = basename(split), ref_dict = ref_dict, common_model = false, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_petest } @@ -96,6 +99,7 @@ workflow PETestChromosome { prefix = basename(split), ref_dict = ref_dict, common_model = false, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_petest } @@ -134,6 +138,7 @@ workflow PETestChromosome { ref_dict = ref_dict, prefix = basename(split), common_model = true, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_petest } @@ -177,6 +182,7 @@ task PETest { Boolean common_model File ref_dict String prefix + File? outlier_sample_ids String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -227,7 +233,7 @@ task PETest { tabix -0 -s1 -b2 -e2 local.PE.txt.gz fi - svtk pe-test -o ~{window} ~{common_arg} --medianfile ~{medianfile} --samples ~{include_list} ~{vcf} local.PE.txt.gz ~{prefix}.stats + svtk pe-test -o ~{window} ~{common_arg} --medianfile ~{medianfile} --samples ~{include_list} ~{vcf} local.PE.txt.gz ~{prefix}.stats ~{if defined(outlier_sample_ids) then "--outlier-sample-ids ~{outlier_sample_ids}" else ""} >>> runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) diff --git a/wdl/RDTest.wdl b/wdl/RDTest.wdl index 3bfb8d017..76e5f4a68 100644 --- a/wdl/RDTest.wdl +++ b/wdl/RDTest.wdl @@ -20,6 +20,7 @@ workflow RDTest { File male_only_variant_ids File samples File ref_dict + File? outlier_sample_ids String sv_pipeline_docker String linux_docker @@ -50,6 +51,7 @@ workflow RDTest { male_only_variant_ids = male_only_variant_ids, allosome = false, ref_dict = ref_dict, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, linux_docker = linux_docker, runtime_attr_rdtest = runtime_attr_rdtest, @@ -77,6 +79,7 @@ workflow RDTest { male_only_variant_ids = male_only_variant_ids, allosome = true, ref_dict = ref_dict, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, linux_docker = linux_docker, runtime_attr_rdtest = runtime_attr_rdtest, diff --git a/wdl/RDTestChromosome.wdl b/wdl/RDTestChromosome.wdl index 0668fe5d5..84d9a3c92 100644 --- a/wdl/RDTestChromosome.wdl +++ b/wdl/RDTestChromosome.wdl @@ -20,6 +20,7 @@ workflow RDTestChromosome { File samples Boolean allosome File ref_dict + File? outlier_sample_ids String sv_pipeline_docker String linux_docker @@ -56,6 +57,7 @@ workflow RDTestChromosome { prefix = basename(split), flags = flags, ref_dict = ref_dict, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_rdtest } @@ -71,6 +73,7 @@ workflow RDTestChromosome { prefix = basename(split), flags = flags, ref_dict = ref_dict, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_rdtest } @@ -99,6 +102,7 @@ workflow RDTestChromosome { prefix = basename(split), flags = flags, ref_dict = ref_dict, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_rdtest } @@ -131,6 +135,7 @@ task RDTest { File ref_dict String prefix String flags + File? outlier_sample_ids String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -178,7 +183,7 @@ task RDTest { bgzip local.RD.txt tabix -p bed local.RD.txt.gz fi - + Rscript /opt/RdTest/RdTest.R \ -b ~{bed} \ -n ~{prefix} \ @@ -186,7 +191,7 @@ task RDTest { -m ~{medianfile} \ -f ~{ped_file} \ -w ~{include_list} \ - ~{flags} + ~{if defined(outlier_sample_ids) then "--outlierSampleIds ~{outlier_sample_ids}" else ""} ~{flags} >>> runtime { diff --git a/wdl/SRTest.wdl b/wdl/SRTest.wdl index b16e30935..25e6ca29d 100644 --- a/wdl/SRTest.wdl +++ b/wdl/SRTest.wdl @@ -21,6 +21,7 @@ workflow SRTest { Boolean run_common Int? common_cnv_size_cutoff # Required if run_common is true File ref_dict + File? outlier_sample_ids String sv_pipeline_docker String linux_docker @@ -55,6 +56,7 @@ workflow SRTest { run_common = run_common, common_cnv_size_cutoff = common_cnv_size_cutoff, ref_dict = ref_dict, + outlier_sample_ids = outlier_sample_ids, sv_base_mini_docker = sv_base_mini_docker, linux_docker = linux_docker, sv_pipeline_docker = sv_pipeline_docker, @@ -85,6 +87,7 @@ workflow SRTest { run_common = run_common, common_cnv_size_cutoff = common_cnv_size_cutoff, ref_dict = ref_dict, + outlier_sample_ids = outlier_sample_ids, sv_base_mini_docker = sv_base_mini_docker, linux_docker = linux_docker, sv_pipeline_docker = sv_pipeline_docker, diff --git a/wdl/SRTestChromosome.wdl b/wdl/SRTestChromosome.wdl index 83987975d..d1bc75a4f 100644 --- a/wdl/SRTestChromosome.wdl +++ b/wdl/SRTestChromosome.wdl @@ -21,6 +21,7 @@ workflow SRTestChromosome { Boolean allosome Boolean run_common Int? common_cnv_size_cutoff + File? outlier_sample_ids String sv_pipeline_docker String linux_docker @@ -57,6 +58,7 @@ workflow SRTestChromosome { common_model = false, prefix = basename(split), ref_dict = ref_dict, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_srtest } @@ -71,6 +73,7 @@ workflow SRTestChromosome { common_model = false, prefix = basename(split), ref_dict = ref_dict, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_srtest } @@ -98,6 +101,7 @@ workflow SRTestChromosome { common_model = false, prefix = basename(split), ref_dict = ref_dict, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_srtest } @@ -136,6 +140,7 @@ workflow SRTestChromosome { common_model = true, prefix = basename(split), ref_dict = ref_dict, + outlier_sample_ids = outlier_sample_ids, sv_pipeline_docker = sv_pipeline_docker, runtime_attr_override = runtime_attr_srtest } @@ -179,6 +184,7 @@ task SRTest { Boolean common_model String prefix File ref_dict + File? outlier_sample_ids String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -227,8 +233,8 @@ task SRTest { bgzip local.SR.txt tabix -0 -s1 -b2 -e2 local.SR.txt.gz fi - - svtk sr-test -w 50 --log ~{common_arg} --medianfile ~{medianfile} --samples ~{include_list} ~{vcf} local.SR.txt.gz ~{prefix}.stats + + svtk sr-test -w 50 --log ~{common_arg} --medianfile ~{medianfile} --samples ~{include_list} ~{vcf} local.SR.txt.gz ~{prefix}.stats ~{if defined(outlier_sample_ids) then "--outlier-sample-ids ~{outlier_sample_ids}" else ""} >>> runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) diff --git a/wdl/TrainGCNV.wdl b/wdl/TrainGCNV.wdl index 43c41e8fa..34c6762bf 100644 --- a/wdl/TrainGCNV.wdl +++ b/wdl/TrainGCNV.wdl @@ -20,9 +20,8 @@ workflow TrainGCNV { # Options for subsetting samples for training. # Assumes all other inputs correspond to the full sample list. Intended for Terra Int? n_samples_subsample # Number of samples to subsample from provided sample list for trainGCNV (rec: ~100) + File? outlier_sample_ids # Identifies samples to be excluded from provided sample list for trainGCNV Int subsample_seed = 42 - # Subset of full sample list on which to train the gCNV model. Overrides n_samples_subsample if both provided - Array[String]? sample_ids_training_subset # Condense read counts Int? min_interval_size @@ -88,7 +87,7 @@ workflow TrainGCNV { String linux_docker String gatk_docker String condense_counts_docker - String? sv_pipeline_docker # required if using n_samples_subsample or sample_ids_training_subset to subset samples + String? sv_pipeline_docker # required if using n_samples_subsample or outlier_sample_ids to subset samples # Runtime configuration overrides RuntimeAttr? condense_counts_runtime_attr @@ -102,28 +101,19 @@ workflow TrainGCNV { RuntimeAttr? runtime_attr_explode } - if (defined(sample_ids_training_subset)) { - call util.GetSubsampledIndices { + if ((defined(n_samples_subsample) && select_first([n_samples_subsample]) < length(samples)) || (defined(outlier_sample_ids))) { + call util.GetFilteredSubsampledIndices { input: all_strings = write_lines(samples), - subset_strings = write_lines(select_first([sample_ids_training_subset])), - prefix = cohort, - sv_pipeline_docker = select_first([sv_pipeline_docker]) - } - } - - if (defined(n_samples_subsample) && (select_first([n_samples_subsample]) < length(samples)) && !defined(sample_ids_training_subset)) { - call util.RandomSubsampleStringArray { - input: - strings = write_lines(samples), - seed = subsample_seed, + exclude_strings = select_first([outlier_sample_ids]), subset_size = select_first([n_samples_subsample]), + seed = subsample_seed, prefix = cohort, sv_pipeline_docker = select_first([sv_pipeline_docker]) } } - Array[Int] sample_indices = select_first([GetSubsampledIndices.subsample_indices_array, RandomSubsampleStringArray.subsample_indices_array, range(length(samples))]) + Array[Int] sample_indices = select_first([GetFilteredSubsampledIndices.include_indices_array, range(length(samples))]) scatter (i in sample_indices) { String sample_ids_ = samples[i] diff --git a/wdl/Utils.wdl b/wdl/Utils.wdl index 70a39436a..20a90e8ea 100644 --- a/wdl/Utils.wdl +++ b/wdl/Utils.wdl @@ -239,7 +239,73 @@ task RunQC { preemptible: preemptible_attempts maxRetries: 1 } +} +task GetFilteredSubsampledIndices { + input { + File all_strings + File? exclude_strings + Int? subset_size + Int seed + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + String include_indices_filename = "~{prefix}.filtered_subsampled_indices.list" + String include_strings_filename = "~{prefix}.filtered_subsampled_strings.list" + + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1 + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + command <<< + + set -euo pipefail + python3 <>> + + output { + File include_indices_file = include_indices_filename + Array[Int] include_indices_array = read_lines(include_indices_filename) + File include_strings_file = include_strings_filename + Array[String] include_strings_array = read_lines(include_strings_filename) + } + + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + docker: sv_pipeline_docker + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + } } task RandomSubsampleStringArray {