diff --git a/denv_pipeline/command.py b/denv_pipeline/command.py index 4a4e700..3e6c345 100644 --- a/denv_pipeline/command.py +++ b/denv_pipeline/command.py @@ -32,6 +32,7 @@ def main(sysargs = sys.argv[1:]): parser.add_argument("--outdir", dest="outdir", help="location where files will be stored.") parser.add_argument("--reference-directory", "-rd", help="location where bed files and reference genomes are") parser.add_argument("--depth", help="depth to map sequences to. Default=10") + parser.add_argument("--threshold", help="threshold to call consensus positions at, default=0.75",dest="threshold") parser.add_argument("--temp", dest="temp", action="store_true", help="keep intermediate files") parser.add_argument("--tempdir", dest="tempdir", help="where the temporary files go") @@ -72,6 +73,8 @@ def main(sysargs = sys.argv[1:]): if "indir" not in config: config["indir"] = config["outdir"] + error_checks.check_threshold(config) + config["outdir"] = config["outdir"].rstrip("/") config = set_up_scripts.set_up_temporary_directory_path(config) config = set_up_scripts.set_up_reference_directory(config) diff --git a/denv_pipeline/scripts/denv_pipeline.smk b/denv_pipeline/scripts/denv_pipeline.smk index 6fe2a1e..b494162 100644 --- a/denv_pipeline/scripts/denv_pipeline.smk +++ b/denv_pipeline/scripts/denv_pipeline.smk @@ -26,6 +26,7 @@ rule mapper: mapper_script = os.path.join(workflow.current_basedir,"mapper.sh"), primer_dir = config["reference_directory"], depth = config["depth"], + threshold = config["threshold"], tempdir = config["tempdir"], python_script = os.path.join(workflow.current_basedir,"serotype_caller.py"), python_script2 = os.path.join(workflow.current_basedir, "make_empty_files.py") @@ -35,7 +36,7 @@ rule mapper: cpus_per_task=2, runtime=300 run: - shell("{params.mapper_script} {wildcards.sample} {input.read_location}/*R1* {input.read_location}/*R2* {params.primer_dir} {params.python_script} {params.python_script2} {params.depth} {params.tempdir} {log.log} >> {log.log} 2>&1") + shell("{params.mapper_script} {wildcards.sample} {input.read_location}/*R1* {input.read_location}/*R2* {params.primer_dir} {params.python_script} {params.python_script2} {params.depth} {params.threshold} {params.tempdir} {log.log} >> {log.log} 2>&1") if not os.path.exists(os.path.join(params.tempdir,f"{wildcards.sample}_all_virustype_info.txt")): shell("touch {params.tempdir}/{wildcards.sample}_all_virustype_info.txt") @@ -80,10 +81,14 @@ rule make_qc_plots: serotype_dict, colour_dict, patch_list = visualisations.prepare_for_plots(input.serotype_calls_file) visualisations.variant_plot(params.results_dir, input.variant_summary_file, serotype_dict, colour_dict, patch_list) + if config["download"]: + source = os.path.join(params.results_dir, "variant_plot.pdf") + dest = os.path.join(config["outdir"], "downloads") + shell("cp -r {source} {dest}") if config["ct_file"] and config["ct_column"] and config["id_column"]: visualisations.ct_plot(params.results_dir, config["ct_file"], config["ct_column"], config["id_column"], input.serotype_calls_file, serotype_dict, colour_dict, patch_list) - - - - + if config["download"]: + source = os.path.join(params.results_dir, "ct_plot.pdf") + dest = os.path.join(config["outdir"], "downloads") + shell("cp -r {source} {dest}") diff --git a/denv_pipeline/scripts/mapper.sh b/denv_pipeline/scripts/mapper.sh index e9dbf00..b07c8a3 100755 --- a/denv_pipeline/scripts/mapper.sh +++ b/denv_pipeline/scripts/mapper.sh @@ -6,8 +6,9 @@ primer_dir=$4 serotype_caller=$5 empty_file_maker=$6 depth=$7 -tempdir=$8 -log=$9 +threshold=$8 +tempdir=$9 +log=$10 while IFS= read -r virustype || [[ -n "$virustype" ]]; do @@ -22,9 +23,11 @@ while IFS= read -r virustype || [[ -n "$virustype" ]]; do fi echo "----->>>>>Mapping reads against serotype "${virustype}" reference sequence" + which bwa bwa mem -v 1 -t 2 ${fasta} $read1 $read2 | samtools view -bS -F 4 -F 2048 | samtools sort -o ${tempdir}/${fname}.${virustype}.bam >> ${log} 2>&1 echo "----->>>>>Trimming bam file" + which ivar ivar trim -e -i ${tempdir}/${fname}.${virustype}.bam -b ${bed} -p ${tempdir}/${fname}.${virustype}.trimmed.bam >> ${log} 2>&1 if ! [ -s ${tempdir}/${fname%.*}.${virustype}.trimmed.bam ]; then @@ -42,7 +45,7 @@ while IFS= read -r virustype || [[ -n "$virustype" ]]; do samtools index ${tempdir}/${fname}.${virustype}.sort.bam >> ${log} 2>&1 echo "----->>>>>Generating consensus sequence" - samtools mpileup -aa --reference ${fasta} -A -d 10000 -Q 0 ${tempdir}/${fname}.${virustype}.sort.bam | ivar consensus -t 0.75 -m ${depth} -p ${tempdir}/${fname}.${virustype}.${depth}.cons -i ${consensus_name} >> ${log} 2>&1 + samtools mpileup -aa --reference ${fasta} -A -d 10000 -Q 0 ${tempdir}/${fname}.${virustype}.sort.bam | ivar consensus -t ${threshold} -m ${depth} -p ${tempdir}/${fname}.${virustype}.${depth}.cons -i ${consensus_name} >> ${log} 2>&1 echo "----->>>>>Aligning consensus cps sequence against the reference serotype "${virustype}" cps sequence" nextalign run --reference ${fasta} --output-fasta ${tempdir}/${fname}.${virustype}.${depth}.out.aln ${tempdir}/${fname}.${virustype}.${depth}.cons.fa >> ${log} 2>&1 diff --git a/denv_pipeline/scripts/visualisations.py b/denv_pipeline/scripts/visualisations.py index 445aa8e..498fb5a 100644 --- a/denv_pipeline/scripts/visualisations.py +++ b/denv_pipeline/scripts/visualisations.py @@ -20,14 +20,16 @@ def prepare_for_plots(final_serotype_calls): all_viruses.add(l['serotype_called']) colour_dict = {} - lst = mpl.colormaps['viridis'](range(len(all_viruses))) + custom_cmap = mpl.colors.LinearSegmentedColormap.from_list("", ["#567CBE","#D58A80", "#ADB3D9"], len(all_viruses)) + + lst = custom_cmap(range(len(all_viruses))) for i,j in enumerate(all_viruses): colour_dict[j] = rgb2hex(lst[i]) patch_list = [] for serotype,hexc in colour_dict.items(): patch_list.append(mpatches.Patch(color=hexc, label=serotype)) - + return virus_dict, colour_dict, patch_list def variant_plot(results_dir, variants_summary_file, virus_dict, colour_dict, patch_list): @@ -113,3 +115,4 @@ def ct_plot(results_dir, ct_file, ct_column, id_column, final_serotype_calls, vi plt.legend(handles=patch_list,fontsize=15,frameon=False) plt.savefig(os.path.join(results_dir, "ct_plot.pdf"), bbox_inches="tight") + diff --git a/denv_pipeline/utils/error_checks.py b/denv_pipeline/utils/error_checks.py index 916ff76..5c106a2 100644 --- a/denv_pipeline/utils/error_checks.py +++ b/denv_pipeline/utils/error_checks.py @@ -118,22 +118,22 @@ def check_primer_dir(config): def check_env_activated(): if pkgutil.find_loader('snakemake') is None or pkgutil.find_loader('Bio') is None: - sys.stderr.write(green(f"Error: installation not correct. Ensure that environment is activated and you have run 'pip install .'")) + sys.stderr.write(green(f"Error: installation not correct. Ensure that environment is activated and you have run 'pip install .'\n")) sys.exit(-1) def check_ct_file(config): if config["ct_file"] and not config["ct_column"]: - sys.stderr.write(green(f"Error: ct_file specified but no ct_column for ct vs coverage plot. Please provide Ct column name and id column name.")) + sys.stderr.write(green(f"Error: ct_file specified but no ct_column for ct vs coverage plot. Please provide Ct column name and id column name.\n")) sys.exit(-1) if config["ct_file"] and not config["id_column"]: - sys.stderr.write(green(f"Error: ct_file specified but no id_column for ct vs coverage plot. Please provide Ct column name and id column name.")) + sys.stderr.write(green(f"Error: ct_file specified but no id_column for ct vs coverage plot. Please provide Ct column name and id column name.\n")) sys.exit(-1) if (config["ct_column"] or config["id_column"]) and not config["ct_file"]: - sys.stderr.write(green(f"Error: ct_column or id_column specified but no ct_file for ct vs coverage plot. Please provide file containing Ct information.")) + sys.stderr.write(green(f"Error: ct_column or id_column specified but no ct_file for ct vs coverage plot. Please provide file containing Ct information.\n")) sys.exit(-1) if not os.path.exists(config["ct_file"]): @@ -145,8 +145,15 @@ def check_ct_file(config): data = csv.DictReader(f) headers = data.fieldnames if config["ct_column"] not in headers: - sys.stderr.write(green(f"Error: {config['ct_column']} not found in ct_file")) + sys.stderr.write(green(f"Error: {config['ct_column']} not found in ct_file\n")) sys.exit(-1) if config["id_column"] not in headers: - sys.stderr.write(green(f"Error: {config['id_column']} not found in ct_file")) - sys.exit(-1) \ No newline at end of file + sys.stderr.write(green(f"Error: {config['id_column']} not found in ct_file\n")) + sys.exit(-1) + + +def check_threshold(config): + + if float(config["threshold"]) > 1: + sys.stderr.write(green(f"Error: consensus threshold must be between 0 and 1\n")) + sys.exit(-1) \ No newline at end of file diff --git a/denv_pipeline/utils/set_up_scripts.py b/denv_pipeline/utils/set_up_scripts.py index 358ffe1..7c94e26 100644 --- a/denv_pipeline/utils/set_up_scripts.py +++ b/denv_pipeline/utils/set_up_scripts.py @@ -13,6 +13,7 @@ def get_defaults(config): config["config"] = False config["depth"] = 10 + config["threshold"] = 0.75 config["tempdir"] = "temporary_files" config['ct_file'] = False @@ -194,6 +195,7 @@ def get_valid_keys(): valid_keys.append("outdir") valid_keys.append("reference_directory") valid_keys.append("depth") + valid_keys.append("threshold") valid_keys.append("temp") valid_keys.append("tempdir") valid_keys.append("download") diff --git a/tests/dry_run_config.yml b/tests/dry_run_config.yml index 6b9bd36..21b4ca2 100644 --- a/tests/dry_run_config.yml +++ b/tests/dry_run_config.yml @@ -1,4 +1,5 @@ depth: 50 +threshold: 0.75 ct_file: "test_ct_file.csv" ct_column: "test_ct" id_column: "test_id"