Skip to content

Commit

Permalink
add low cov calls
Browse files Browse the repository at this point in the history
  • Loading branch information
ViralVerity committed Jun 27, 2024
1 parent cd28963 commit a07aaba
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ Within this, there will be:
- summary_all_samples.tsv: contains information about all possible virus options provided per sample
- alignment - contains alignments by virus type NB not to be used for phylogenetics because it is only rough for estimating coverage. If a trimmed bed file was provided then these are trimmed down, otherwise they are only untrimmed.
- consensus - consensus sequences of the called virus for each sample
- low coverage consensus - consensus sequences for samples which had less than 50% coverage against the reference sequence and so didn't receive a formal serotype call. Only those with more than 5% difference betwen the coverage of one serotype and the next highest coverage taken (sylvatics excluded) to give an estimate for which serotype they may be. We have found that low coverage genomes can still be successfully assigned lineages using genome detective, but we do not recommend using them for broader phylogenetic studies.
- depth - text files of each position of the genome and their sequencing depth by sample. NB positions are relative to reference sequence
- variants - contains a summary file of the number of variants by sample, and then a file for each sample containing additional information about variants

Expand Down
41 changes: 41 additions & 0 deletions denv_pipeline/scripts/make_summary_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
def summarise_files(config, per_sample_files, serotype_call_file, top_call_file, all_info_file):

serotypes = defaultdict(list)
all_coverage = defaultdict(dict)
all_lines = []
top_calls = []
serotype_call = []
Expand All @@ -16,6 +17,10 @@ def summarise_files(config, per_sample_files, serotype_call_file, top_call_file,
with open(file) as f:
data = csv.DictReader(f, delimiter="\t")
for l in data:
if not l['serotype_called'] in all_coverage[l['sample_id']]:
all_coverage[l['sample_id']][l['serotype_called']] = []
all_coverage[l['sample_id']][l['serotype_called']].append(l['coverage_untrimmed'])

possible_tops.append(l)
all_lines.append(l)

Expand Down Expand Up @@ -54,6 +59,7 @@ def summarise_files(config, per_sample_files, serotype_call_file, top_call_file,

sort_variant_files(config, serotypes)
get_right_serotype_files(config, serotypes)
pull_low_coverage_seqs(config, all_coverage, serotypes)
make_alignments(config, serotypes)

return
Expand Down Expand Up @@ -186,3 +192,38 @@ def make_alignments(config, serotypes):
new_file = f'{alignment_dir}/{virus_type}.untrim.aln'
cat_string = " ".join([f"{tempdir}/{aln}" for aln in alns])
os.system(f"cat {cat_string} >> {new_file}")


def pull_low_coverage_seqs(config, all_coverage, high_coverage):

top_call = {}

for name, cov_dict in all_coverage.items():
if name not in high_coverage:
lst = []
for sero, cov in cov_dict.items():
if "sylvatic" not in sero:
lst.append(cov)

in_order = sorted(lst, reverse=True)
if in_order[0] > (in_order[1] + 5):
top = [i for i in cov_dict if cov_dict[i]==in_order[0]][0]
else:
top = "NA"

top_call[name] = top

low_cov_seqs = set()
depth = config["depth"]
with open(os.path.join(config["outdir"], "results", "low_coverage_calls.csv"), 'w') as fw:
fw.write("sample_id,serotype\n")
for name,call in top_call.items():
if call != "NA":
fw.write(f"{name},{call}\n")

low_cov_seqs.add(f'{name}.{call}.{depth}.cons.fa')

for cons in low_cov_seqs:
source = os.path.join(config['tempdir'], cons)
dest = os.path.join(config["outdir"], "results", "low_coverage_consensus")
shutil.move(source, dest)
1 change: 1 addition & 0 deletions denv_pipeline/utils/set_up_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ def make_folders(config):
misc.make_directory(os.path.join(results_dir, "depth"))
misc.make_directory(os.path.join(results_dir, "consensus_sequences"))
misc.make_directory(os.path.join(results_dir, "alignments"))
misc.make_directory(os.path.join(results_dir, "low_coverage_consensus"))

if config["download"]:
misc.make_directory(os.path.join(config["outdir"], "downloads"))
Expand Down

0 comments on commit a07aaba

Please sign in to comment.