Skip to content

Commit

Permalink
EvidenceQC.MedianCov memory & make batched workflow output filenames …
Browse files Browse the repository at this point in the history
…unique (#230)
  • Loading branch information
epiercehoffman authored Sep 14, 2021
1 parent 74693ff commit a1a13e3
Show file tree
Hide file tree
Showing 7 changed files with 41 additions and 23 deletions.
2 changes: 1 addition & 1 deletion input_values/dockers.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
"sv_base_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-base:mw-gnomad-0506-pr-087d4df",
"sv_base_mini_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-base-mini:mw-gnomad-0506-pr-087d4df",
"sv_pipeline_base_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-base:mw-gnomad-0506-pr-087d4df",
"sv_pipeline_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-gnomad-0506-pr-087d4df",
"sv_pipeline_docker" : "us.gcr.io/broad-dsde-methods/eph/sv-pipeline:eph_mediancov_mem_and_names-3f0c76d",
"sv_pipeline_qc_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-qc:mw-gnomad-0506-pr-087d4df",
"sv_pipeline_rdtest_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-rdtest:mw-gnomad-0506-pr-087d4df",
"wham_docker" : "us.gcr.io/broad-dsde-methods/wham:8645aa",
Expand Down
17 changes: 9 additions & 8 deletions src/sv-pipeline/04_variant_resolution/scripts/PE_genotype.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ RD_genotypes=$3
RD_melted_genotypes=$4
RF_cutoffs=$5
blacklist=$6
batch=$7

egrep "DEL|DUP" ${bed} > cnv.bed;

Expand Down Expand Up @@ -136,7 +137,7 @@ awk '{if ($1!~"X" && $1!~"Y") print $4}' cnv.bed \
| fgrep -wf copystate.pass.txt \
| fgrep -wf size.pass.txt \
| fgrep -wvf <(cat cnv.exclude.all.bed depthonly.fail.txt repeat.breakpoint.fail.ids.txt) \
> pe.train.include.txt;
> "$batch.pe.train.include.txt";

# select training
pe_pval=$( awk -F'\t' '{if ( $5=="PE_log_pval") print $2}' $RF_cutoffs)
Expand All @@ -150,10 +151,10 @@ zcat ${PE_counts} \
# Join RD and PE genotypes
join -j 1 -a 1 -e "2" -o 1.2 1.3 1.4 2.2 \
<(zcat pe.geno.all.txt.gz \
| fgrep -wf pe.train.include.txt \
| fgrep -wf "$batch.pe.train.include.txt" \
| awk '{print $1"_"$2 "\t" $0}' \
| sort -k1,1 ) \
<(fgrep -wf pe.train.include.txt <(zcat ${RD_melted_genotypes}) \
<(fgrep -wf "$batch.pe.train.include.txt" <(zcat ${RD_melted_genotypes}) \
| awk '{print $4"_"$5 "\t" $6}' \
| sort -k1,1) \
| tr ' ' '\t' \
Expand Down Expand Up @@ -194,9 +195,9 @@ sd_het=$(awk '{if ($NF==1 || $NF==3) print $3}' PE.RD.hetfilter.merged.txt \
| awk '{print $NF*1.645}')

##generate PE metric file##
echo -e "pe_count"'\t'$pe_count>pe_metric_file.txt
echo -e "median_hom"'\t'$median_hom>>pe_metric_file.txt
echo -e "sd_het"'\t'$sd_het>>pe_metric_file.txt
echo -e "pe_count"'\t'$pe_count>"$batch.pe_metric_file.txt"
echo -e "median_hom"'\t'$median_hom>>"$batch.pe_metric_file.txt"
echo -e "sd_het"'\t'$sd_het>>"$batch.pe_metric_file.txt"

zcat ${PE_counts} \
| fgrep -v name \
Expand Down Expand Up @@ -228,7 +229,7 @@ zcat pe.geno.final.txt.gz|awk '{if ($NF>0) print}' \
-e "zfinal<-round((pmin(z1,z2)-z)* $normalization)" \
-e 'write.table(cbind(d[1],d[,2],d[,3],d[,4],zfinal),"wreads.geno.txt",col.names=FALSE,quote=FALSE,row.names=FALSE,sep = "\t")'

cat wreads.geno.txt null.wreads.geno.txt <(zcat null.geno.txt.gz)|awk '{if ($NF<0) print $1,$2,$3,$4,1;else if ($NF>999) print $1,$2,$3,$4,999 ;else print}' |tr ' ' '\t'|sort -k1,1 -k2,2|gzip>pe.geno.withquality.txt.gz
cat wreads.geno.txt null.wreads.geno.txt <(zcat null.geno.txt.gz)|awk '{if ($NF<0) print $1,$2,$3,$4,1;else if ($NF>999) print $1,$2,$3,$4,999 ;else print}' |tr ' ' '\t'|sort -k1,1 -k2,2|gzip>"$batch.pe.geno.withquality.txt.gz"

##Per variant quality scores##
##Assign a normalization factor to scale variant up to 999 if based on expected het median ##
Expand All @@ -253,5 +254,5 @@ awk '{print $1}' pe.variant.quality.final.txt \
awk -v var=$normalization_var '{if ($2*var>999) print $1 "\t" 999;else print $1 "\t" $2*var}' pe.variant.quality.final.txt \
|cat - pe.variant.quality.final.null.txt \
|sort -k1,1|gzip \
>pe.variant.quality.final.txt.gz
>"$batch.pe.variant.quality.final.txt.gz"

Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ RF_cutoffs=$5
whitelist=$6
petrainfile=$7
pegenotypes=$8
batch=$9

sr_pval=$( awk -F'\t' '{if ( $5=="SR_sum_log_pval") print $2}' $RF_cutoffs | head -n 1)
sr_count=$(/opt/sv-pipeline/04_variant_resolution/scripts/convert_poisson_p.py $sr_pval)
Expand Down Expand Up @@ -234,3 +235,4 @@ echo -e "rare_single"'\t'$rare_single>>sr_metric_file.txt
echo -e "rare_both"'\t'$rare_both>>sr_metric_file.txt
echo -e "common_single"'\t'$common_single>>sr_metric_file.txt
echo -e "common_both"'\t'$common_both>>sr_metric_file.txt
mv sr_metric_file.txt "$batch.sr_metric_file.txt"
15 changes: 11 additions & 4 deletions wdl/EvidenceQC.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ workflow EvidenceQC {
RuntimeAttr? wgd_build_runtime_attr
RuntimeAttr? wgd_score_runtime_attr
RuntimeAttr? runtime_attr_bincov_attr
RuntimeAttr? runtime_attr_mediancov_attr
RuntimeAttr? runtime_attr_mediancov # Memory ignored, use median_cov_mem_gb_per_sample
Float? median_cov_mem_gb_per_sample
}
call mbm.MakeBincovMatrix as MakeBincovMatrix {
Expand All @@ -68,13 +70,14 @@ workflow EvidenceQC {
runtime_attr_override = runtime_attr_bincov_attr
}
call mc.MedianCov as MedianCov{
Float median_cov_mem_gb = select_first([median_cov_mem_gb_per_sample, 0.5]) * length(samples) + 7.5
call mc.MedianCov as MedianCov {
input:
bincov_matrix = MakeBincovMatrix.merged_bincov,
cohort_id = batch,
sv_pipeline_qc_docker = sv_pipeline_qc_docker,
runtime_attr = runtime_attr_mediancov_attr
runtime_attr = runtime_attr_mediancov,
mem_gb_override = median_cov_mem_gb
}
if (run_ploidy) {
Expand Down Expand Up @@ -104,6 +107,7 @@ workflow EvidenceQC {
call vcfqc.RawVcfQC as RawVcfQC_Delly {
input:
vcfs = select_first([delly_vcfs]),
prefix = batch,
caller = "Delly",
runtime_attr_qc = runtime_attr_qc,
sv_pipeline_docker = sv_pipeline_docker,
Expand All @@ -114,6 +118,7 @@ workflow EvidenceQC {
call vcfqc.RawVcfQC as RawVcfQC_Manta {
input:
vcfs = select_first([manta_vcfs]),
prefix = batch,
caller = "Manta",
runtime_attr_qc = runtime_attr_qc,
sv_pipeline_docker = sv_pipeline_docker,
Expand All @@ -124,6 +129,7 @@ workflow EvidenceQC {
call vcfqc.RawVcfQC as RawVcfQC_Melt {
input:
vcfs = select_first([melt_vcfs]),
prefix = batch,
caller = "Melt",
runtime_attr_qc = runtime_attr_qc,
sv_pipeline_docker = sv_pipeline_docker,
Expand All @@ -134,6 +140,7 @@ workflow EvidenceQC {
call vcfqc.RawVcfQC as RawVcfQC_Wham {
input:
vcfs = select_first([wham_vcfs]),
prefix = batch,
caller = "Wham",
runtime_attr_qc = runtime_attr_qc,
sv_pipeline_docker = sv_pipeline_docker,
Expand Down
9 changes: 6 additions & 3 deletions wdl/RawVcfQC.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import "Structs.wdl"
workflow RawVcfQC {
input {
Array[File] vcfs
String prefix
String caller
String sv_pipeline_docker
RuntimeAttr? runtime_attr_qc
Expand All @@ -30,6 +31,7 @@ workflow RawVcfQC {
call PickOutliers {
input:
stat_files = RunIndividualQC.stat,
prefix = prefix,
caller = caller,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_outlier
Expand Down Expand Up @@ -84,6 +86,7 @@ task RunIndividualQC {
task PickOutliers {
input {
Array[File] stat_files
String prefix
String caller
String sv_pipeline_docker
RuntimeAttr? runtime_attr_override
Expand All @@ -102,15 +105,15 @@ task PickOutliers {
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
output {
File low = "${caller}.QC.outlier.low"
File high = "${caller}.QC.outlier.high"
File low = "${prefix}.${caller}.QC.outlier.low"
File high = "${prefix}.${caller}.QC.outlier.high"
}
command <<<
set -eu -o pipefail
# concatenate all stat_files into one input file
xargs cat < ~{write_lines(stat_files)} > ~{caller}.QC.input
# pick outliers from input stats
/opt/sv-pipeline/pre_SVCalling_and_QC/raw_vcf_qc/calc_num_svs_pick_outlier.py ~{caller}.QC.input ~{caller}.QC.outlier -z
/opt/sv-pipeline/pre_SVCalling_and_QC/raw_vcf_qc/calc_num_svs_pick_outlier.py ~{caller}.QC.input ~{prefix}.~{caller}.QC.outlier -z

>>>
runtime {
Expand Down
12 changes: 7 additions & 5 deletions wdl/TrainPEGenotyping.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ workflow TrainPEGenotyping {
RD_genotypes = RD_genotypes,
RD_melted_genotypes = RD_melted_genotypes,
exclude_list = exclude_list,
batch = batch_ID,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_genotype
}
Expand Down Expand Up @@ -134,6 +135,7 @@ task GenotypePEPart1 {
File RD_genotypes
File RD_melted_genotypes
File exclude_list
String batch
String sv_pipeline_docker
RuntimeAttr? runtime_attr_override
}
Expand All @@ -149,10 +151,10 @@ task GenotypePEPart1 {
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
output {
File PE_train = "pe.train.include.txt"
File PE_metrics = "pe_metric_file.txt"
File genotypes = "pe.geno.withquality.txt.gz"
File varGQ = "pe.variant.quality.final.txt.gz"
File PE_train = "~{batch}.pe.train.include.txt"
File PE_metrics = "~{batch}.pe_metric_file.txt"
File genotypes = "~{batch}.pe.geno.withquality.txt.gz"
File varGQ = "~{batch}.pe.variant.quality.final.txt.gz"
}
command <<<

Expand All @@ -163,7 +165,7 @@ task GenotypePEPart1 {
~{RD_melted_genotypes} \
~{RF_cutoffs} \
~{exclude_list} \
/opt/RdTest/generate_cutoff_PE.R
~{batch}

>>>
runtime {
Expand Down
7 changes: 5 additions & 2 deletions wdl/TrainSRGenotyping.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ workflow TrainSRGenotyping {
samples = samples,
PE_train = PE_train,
PE_genotypes = PE_genotypes,
batch = batch_ID,
sv_pipeline_docker = sv_pipeline_docker,
runtime_attr_override = runtime_attr_genotype
}
Expand All @@ -89,6 +90,7 @@ task GenotypeSRPart1 {
Array[String] samples
File PE_train
File PE_genotypes
String batch
String sv_pipeline_docker
RuntimeAttr? runtime_attr_override
}
Expand All @@ -104,7 +106,7 @@ task GenotypeSRPart1 {
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
output {
File SR_metrics = "sr_metric_file.txt"
File SR_metrics = "~{batch}.sr_metric_file.txt"
}
command <<<

Expand All @@ -116,7 +118,8 @@ task GenotypeSRPart1 {
~{RF_cutoffs} \
~{write_lines(samples)} \
~{PE_train} \
~{PE_genotypes}
~{PE_genotypes} \
~{batch}

>>>
runtime {
Expand Down

0 comments on commit a1a13e3

Please sign in to comment.