Skip to content

Commit

Permalink
Optimize memory usage when merging genotypes (#175)
Browse files Browse the repository at this point in the history
* Optimize memory usage when merging genotypes.

# The following changes are implemented:
# 1. Remove rows where the genotype is set to `.`, so that the values
#    can safely be cast to a numeric type;
# 2. Set the type of genotype values to uint16;
# 3. Use DataFrame.stack instead of pd.melt so that categorical
#    representations can be used to minimize memory requirements.

* Read the files line by line and melt/merge them on the fly to minimize memory usage.

* Replace King's dead link with a currently working URL.

* Revert deleting shebang.

* Update links to the newly built docker images.
  • Loading branch information
VJalili authored Jun 15, 2021
1 parent 5440f12 commit 3ff61a7
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 38 deletions.
4 changes: 2 additions & 2 deletions input_values/dockers.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
"samtools_cloud_docker" : "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:mw-gnomad-02-6a66c96",
"sv_base_docker" : "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:mw-gnomad-02-6a66c96",
"sv_base_mini_docker" : "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:rlc_posthoc_filtering_cnv_mcnv_compatability_9a8561",
"sv_pipeline_base_docker" : "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline-base:rlc_posthoc_filtering_cnv_mcnv_compatability_9a8561",
"sv_pipeline_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-clean-vcf-4-sr-gq-bd75b73",
"sv_pipeline_base_docker" : "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline-base:merge-genotypes-fc8ba1b",
"sv_pipeline_docker" : "us.gcr.io/broad-dsde-methods/vjalili/sv-pipeline:merge-genotypes-fc8ba1b",
"sv_pipeline_qc_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-qc:mw-xz-fixes-7cbffee",
"sv_pipeline_rdtest_docker" : "us.gcr.io/broad-dsde-methods/cwhelan/sv-pipeline-rdtest:cw_rdtest_bin_fix_update_5f2c8f",
"wham_docker" : "us.gcr.io/broad-dsde-methods/wham:8645aa",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,36 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright © 2018 Matthew Stone <[email protected]>
# Distributed under terms of the MIT license.

"""
import argparse

"""

import argparse
import pandas as pd
DELIMITER = "\t"


def merge(genotypes_filename, gq_filename, merged_filename):
    """Melt and merge a genotypes table with a GQ table, one row at a time.

    Both inputs are read as DELIMITER-separated text whose first line is a
    header. The first four fields of every row identify the record; each
    remaining column is named by the header. For every data row and every
    column after the fourth, one line is written to ``merged_filename``:
    the four identifying fields, the column name, the genotypes value and
    the GQ value, joined by DELIMITER. No header is written.

    The files are streamed in lockstep, so only one row of each input is
    held in memory at a time.

    Args:
        genotypes_filename: Path of the genotypes table to read.
        gq_filename: Path of the GQ table to read.
        merged_filename: Path of the merged output; overwritten if present.

    Raises:
        ValueError: If the headers differ, if the identifying fields of a
            pair of rows differ, if a row has fewer fields than the header,
            or if the two files do not have the same number of rows.
    """
    with open(genotypes_filename, "r") as genotypes, \
            open(gq_filename, "r") as gq, \
            open(merged_filename, "w") as merged:

        # Integrity check: do the files have the same columns?
        # Only the line terminator is stripped (not all whitespace) so that
        # an empty trailing field is not silently dropped.
        genotypes_header = genotypes.readline().rstrip("\r\n").split(DELIMITER)
        gq_header = gq.readline().rstrip("\r\n").split(DELIMITER)
        if genotypes_header != gq_header:
            raise ValueError("The files do not have same number/order of columns")

        n_cols = len(gq_header)
        column_names = gq_header[4:]

        for genotypes_line in genotypes:
            # readline() returns "" only at end of file; a blank line is "\n".
            gq_line = gq.readline()
            if not gq_line:
                raise ValueError("The files do not have the same number of rows: "
                                 "the GQ file ended before the genotypes file.")

            x = genotypes_line.rstrip("\r\n").split(DELIMITER)
            y = gq_line.rstrip("\r\n").split(DELIMITER)

            # Check if lines in the files are in the correct order.
            if x[0:4] != y[0:4]:
                raise ValueError(f"The lines in the files are not in the same order; "
                                 f"expected the following lines to match.\n{x[0:4]}\n{y[0:4]}")

            # A short row would otherwise surface as a bare IndexError.
            if len(x) < n_cols or len(y) < n_cols:
                raise ValueError(f"Expected at least {n_cols} fields per row; "
                                 f"got {len(x)} and {len(y)} for\n{x[0:4]}")

            h = DELIMITER.join(x[0:4])
            # zip stops at the header length, so surplus fields are ignored.
            merged.writelines(
                DELIMITER.join((h, name, genotype, quality)) + "\n"
                for name, genotype, quality in zip(column_names, x[4:], y[4:]))

        # Anything left in the GQ file means the genotypes file was shorter.
        if gq.readline():
            raise ValueError("The files do not have the same number of rows: "
                             "the genotypes file ended before the GQ file.")


def main():
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.RawDescriptionHelpFormatter)
Expand All @@ -21,30 +39,4 @@ def main():
parser.add_argument('fout')
args = parser.parse_args()

# Load and melt genotypes
gt = pd.read_table(args.genotypes).drop_duplicates()
gt = pd.melt(gt, id_vars='chr start end cnvID'.split(),
var_name='sample', value_name='genotype')

# Round genotype copy states
# gt['genotype'] = gt['genotype'].round().astype(int)

# Pivot back out so input genotype matrix is all integers
# pivot = gt.pivot_table(index='chr start end cnvID'.split(),
# columns='sample', values='genotype').reset_index()
# samples = [c for c in pivot.columns if c not in 'chr start end cnvID'.split()]
# pivot[samples] = pivot[samples].astype(int)
# pivot.to_csv(args.genotypes, index=False, sep='\t')

# Load and melt GQ
gq = pd.read_table(args.GQ)
gq = pd.melt(gq, id_vars='chr start end cnvID'.split(),
var_name='sample', value_name='GQ')

# Merge genotypes with GQ
gt = pd.merge(gt, gq, on='chr start end cnvID sample'.split(), how='left')
gt.to_csv(args.fout, header=False, index=False, sep='\t')


if __name__ == '__main__':
main()
merge(args.genotypes, args.GQ, args.fout)

0 comments on commit 3ff61a7

Please sign in to comment.