From 0b9cf45d1d1fa44855d55ae479cb58ca1d6021a4 Mon Sep 17 00:00:00 2001 From: Karan Jaisingh Date: Tue, 4 Feb 2025 11:25:25 -0500 Subject: [PATCH] Fix IntegrateGQ.sh errors due to presence of variants of just one type (#760) * Initial commit * Added safe joining - include all from each side if none present * Removed join -a commands * Additional join i missed before * Final set of joins missed * All join changes resolved * Verify no join difference in integrategq * Further changes * Reset to init for ease of understanding * Minor updates, still WIP * Minor edits * Separated piped commands into distinct steps * Added extra line * Modified to use ARGIND * Removed extra line in pe test * Removed extra space at the end of script * Readded comments for user clarity --- .../scripts/IntegrateGQ.sh | 84 +++++++++++-------- 1 file changed, 51 insertions(+), 33 deletions(-) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh index 429fc5f61..3bdea0fb3 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh @@ -30,41 +30,60 @@ zcat $RD_melted_genotypes \ |gzip \ >rd_indiv_geno.txt.gz -##Deletions, need to PE-SR genotypes to match RD format (2==ref)## ##PE## -zcat $pegeno_indiv_file \ - | { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \ - |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ - |awk '!seen[$1"@"$2]++' \ - >pe_indiv_geno.txt - -##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)## -zcat $pegeno_indiv_file \ - | { fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true; } \ - |awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ - |awk '!seen[$1"@"$2]++' \ - >>pe_indiv_geno.txt -sort -k1,1 pe_indiv_geno.txt|gzip>pe_indiv_geno.txt.gz - -rm pe_indiv_geno.txt +zcat "$pegeno_indiv_file" | \ +awk -v OFS="\t" ' + ARGIND==1 { + if ($5 == "DEL") { + del[$4] + } else { + nodel[$4] + } + next + } + ARGIND==2 { + if ($1 in del) { + ##Deletions, need to PE-SR genotypes to match RD format (2==ref)## + final_gt = ($4>1 ? 0 : ($4==1 ? 1 : 2)) + print $1"@"$2, $1, $2, $4, final_gt, $5 + } else if ($1 in nodel) { + ##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)## + final_gt = ($4>0 ? $4+2 : 2) + print $1"@"$2, $1, $2, $4, final_gt, $5 + } + } +' int.bed - \ +| awk '!seen[$1]++' \ +| sort -k1,1 \ +| gzip > pe_indiv_geno.txt.gz ##SR## -zcat $srgeno_indiv_file \ - | { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \ - |awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ - |awk '!seen[$1"@"$2]++' \ - >sr_indiv_geno.txt - -##Duplications and other events, need to PE-SR genotysrs to match RD (2==ref)## -zcat $srgeno_indiv_file \ - | { fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true; } \ - |awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \ - |awk '!seen[$1"@"$2]++' \ - >>sr_indiv_geno.txt - -sort -k1,1 sr_indiv_geno.txt|gzip>sr_indiv_geno.txt.gz - -rm sr_indiv_geno.txt +zcat "$srgeno_indiv_file" | \ +awk -v OFS="\t" ' + ARGIND==1 { + if ($5 == "DEL") { + del[$4] + } else { + nodel[$4] + } + next + } + ARGIND==2 { + if ($1 in del) { + ##Deletions, need to PE-SR genotypes to match RD format (2==ref)## + final_gt = ($4>1 ? 0 : ($4==1 ? 1 : 2)) + print $1"@"$2, $1, $2, $4, final_gt, $5 + } else if ($1 in nodel) { + ##Duplications and other events, need to PE-SR genotysrs to match RD (2==ref)## + final_gt = ($4>0 ? $4+2 : 2) + print $1"@"$2, $1, $2, $4, final_gt, $5 + } + } +' int.bed - \ +| awk '!seen[$1]++' \ +| sort -k1,1 \ +| gzip > sr_indiv_geno.txt.gz + ##check to make sure PE and SR are same size which they should be## @@ -377,4 +396,3 @@ cat genotype.variant.txt \ |awk '{if ($2==0) $2=1;if ($3==0) $3=1; if ($4==0) $4=1; if ($5==0) $5=1; print}' OFS="\t" \ |gzip \ >genotype.variant.txt.gz -