Skip to content

Commit

Permalink
Fix IntegrateGQ.sh errors due to presence of variants of just one type (
Browse files Browse the repository at this point in the history
#760)

* Initial commit

* Added safe joining - include all from each side if none present

* Removed join -a commands

* Additional join i missed before

* Final set of joins missed

* All join changes resolved

* Verify no join difference in integrategq

* Further changes

* Reset to init for ease of understanding

* Minor updates, still WIP

* Minor edits

* Separated piped commands into distinct steps

* Added extra line

* Modified to use ARGIND

* Removed extra line in pe test

* Removed extra space at the end of script

* Readded comments for user clarity
  • Loading branch information
kjaisingh authored Feb 4, 2025
1 parent a69e7c7 commit 0b9cf45
Showing 1 changed file with 51 additions and 33 deletions.
84 changes: 51 additions & 33 deletions src/sv-pipeline/04_variant_resolution/scripts/IntegrateGQ.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,41 +30,60 @@ zcat $RD_melted_genotypes \
|gzip \
>rd_indiv_geno.txt.gz

##Deletions, need to PE-SR genotypes to match RD format (2==ref)##
##PE##
zcat $pegeno_indiv_file \
| { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \
|awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \
|awk '!seen[$1"@"$2]++' \
>pe_indiv_geno.txt

##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)##
zcat $pegeno_indiv_file \
| { fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true; } \
|awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \
|awk '!seen[$1"@"$2]++' \
>>pe_indiv_geno.txt
sort -k1,1 pe_indiv_geno.txt|gzip>pe_indiv_geno.txt.gz

rm pe_indiv_geno.txt
zcat "$pegeno_indiv_file" | \
awk -v OFS="\t" '
ARGIND==1 {
if ($5 == "DEL") {
del[$4]
} else {
nodel[$4]
}
next
}
ARGIND==2 {
if ($1 in del) {
##Deletions, need to PE-SR genotypes to match RD format (2==ref)##
final_gt = ($4>1 ? 0 : ($4==1 ? 1 : 2))
print $1"@"$2, $1, $2, $4, final_gt, $5
} else if ($1 in nodel) {
##Duplications and other events, need to PE-SR genotypes to match RD (2==ref)##
final_gt = ($4>0 ? $4+2 : 2)
print $1"@"$2, $1, $2, $4, final_gt, $5
}
}
' int.bed - \
| awk '!seen[$1]++' \
| sort -k1,1 \
| gzip > pe_indiv_geno.txt.gz

##SR##
zcat $srgeno_indiv_file \
| { fgrep -wf <(awk '{if ($5=="DEL") print $4}' int.bed) || true; } \
|awk '{if ($4>1) print $1"@"$2,$1,$2,$4,0,$5;else if ($4==1) print $1"@"$2,$1,$2,$4,1,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \
|awk '!seen[$1"@"$2]++' \
>sr_indiv_geno.txt

##Duplications and other events, need to PE-SR genotysrs to match RD (2==ref)##
zcat $srgeno_indiv_file \
| { fgrep -wf <(awk '{if ($5!="DEL") print $4}' int.bed) || true; } \
|awk '{if ($4>0) print $1"@"$2,$1,$2,$4,$4+2,$5 ;else print $1"@"$2,$1,$2,$4,2,$5}' OFS='\t' \
|awk '!seen[$1"@"$2]++' \
>>sr_indiv_geno.txt

sort -k1,1 sr_indiv_geno.txt|gzip>sr_indiv_geno.txt.gz

rm sr_indiv_geno.txt
zcat "$srgeno_indiv_file" | \
awk -v OFS="\t" '
ARGIND==1 {
if ($5 == "DEL") {
del[$4]
} else {
nodel[$4]
}
next
}
ARGIND==2 {
if ($1 in del) {
##Deletions, need to PE-SR genotypes to match RD format (2==ref)##
final_gt = ($4>1 ? 0 : ($4==1 ? 1 : 2))
print $1"@"$2, $1, $2, $4, final_gt, $5
} else if ($1 in nodel) {
##Duplications and other events, need to PE-SR genotysrs to match RD (2==ref)##
final_gt = ($4>0 ? $4+2 : 2)
print $1"@"$2, $1, $2, $4, final_gt, $5
}
}
' int.bed - \
| awk '!seen[$1]++' \
| sort -k1,1 \
| gzip > sr_indiv_geno.txt.gz


##check to make sure PE and SR are same size which they should be##

Expand Down Expand Up @@ -377,4 +396,3 @@ cat genotype.variant.txt \
|awk '{if ($2==0) $2=1;if ($3==0) $3=1; if ($4==0) $4=1; if ($5==0) $5=1; print}' OFS="\t" \
|gzip \
>genotype.variant.txt.gz

0 comments on commit 0b9cf45

Please sign in to comment.