Merge branch 'main' of https://github.com/grubaughlab/DENV_pipeline

merge
grubaughlab · Aug 11, 2023 · 2d66630 · 2d66630
2 parents 0f6b8f5 + e7ce897
commit 2d66630
Show file tree

Hide file tree

Showing 7 changed files with 41 additions and 17 deletions.
diff --git a/denv_pipeline/command.py b/denv_pipeline/command.py
@@ -32,6 +32,7 @@ def main(sysargs = sys.argv[1:]):
     parser.add_argument("--outdir", dest="outdir", help="location where files will be stored.")
     parser.add_argument("--reference-directory", "-rd", help="location where bed files and reference genomes are")
     parser.add_argument("--depth", help="depth to map sequences to. Default=10")
+    parser.add_argument("--threshold", help="threshold to call consensus positions at, default=0.75",dest="threshold")
 
     parser.add_argument("--temp", dest="temp", action="store_true", help="keep intermediate files")
     parser.add_argument("--tempdir", dest="tempdir", help="where the temporary files go")
@@ -72,6 +73,8 @@ def main(sysargs = sys.argv[1:]):
     if "indir" not in config:
         config["indir"] = config["outdir"]
 
+    error_checks.check_threshold(config)
+
     config["outdir"] = config["outdir"].rstrip("/")
     config = set_up_scripts.set_up_temporary_directory_path(config)
     config = set_up_scripts.set_up_reference_directory(config)

diff --git a/denv_pipeline/scripts/denv_pipeline.smk b/denv_pipeline/scripts/denv_pipeline.smk
@@ -26,6 +26,7 @@ rule mapper:
         mapper_script = os.path.join(workflow.current_basedir,"mapper.sh"),
         primer_dir = config["reference_directory"],
         depth = config["depth"],
+        threshold = config["threshold"],
         tempdir = config["tempdir"],
         python_script = os.path.join(workflow.current_basedir,"serotype_caller.py"),
         python_script2 = os.path.join(workflow.current_basedir, "make_empty_files.py")
@@ -35,7 +36,7 @@ rule mapper:
         cpus_per_task=2,
         runtime=300
     run:
-        shell("{params.mapper_script} {wildcards.sample} {input.read_location}/*R1* {input.read_location}/*R2* {params.primer_dir} {params.python_script} {params.python_script2} {params.depth} {params.tempdir} {log.log}  >> {log.log} 2>&1")
+        shell("{params.mapper_script} {wildcards.sample} {input.read_location}/*R1* {input.read_location}/*R2* {params.primer_dir} {params.python_script} {params.python_script2} {params.depth} {params.threshold} {params.tempdir} {log.log}  >> {log.log} 2>&1")
 
         if not os.path.exists(os.path.join(params.tempdir,f"{wildcards.sample}_all_virustype_info.txt")):
             shell("touch {params.tempdir}/{wildcards.sample}_all_virustype_info.txt")
@@ -80,10 +81,14 @@ rule make_qc_plots:
         serotype_dict, colour_dict, patch_list = visualisations.prepare_for_plots(input.serotype_calls_file)
 
         visualisations.variant_plot(params.results_dir, input.variant_summary_file, serotype_dict, colour_dict, patch_list)
+        if config["download"]:
+            source = os.path.join(params.results_dir, "variant_plot.pdf")
+            dest = os.path.join(config["outdir"], "downloads")
+            shell("cp -r {source} {dest}")
 
         if config["ct_file"] and config["ct_column"] and config["id_column"]:
             visualisations.ct_plot(params.results_dir, config["ct_file"], config["ct_column"], config["id_column"], input.serotype_calls_file, serotype_dict, colour_dict, patch_list)
-
-
-
-
+            if config["download"]:
+                source = os.path.join(params.results_dir, "ct_plot.pdf")
+                dest = os.path.join(config["outdir"], "downloads")
+                shell("cp -r {source} {dest}")
diff --git a/denv_pipeline/scripts/mapper.sh b/denv_pipeline/scripts/mapper.sh
@@ -6,8 +6,9 @@ primer_dir=$4
 serotype_caller=$5
 empty_file_maker=$6
 depth=$7
-tempdir=$8
-log=$9
+threshold=$8
+tempdir=$9
+log=$10
 
 while IFS= read -r virustype || [[ -n "$virustype" ]]; do 
 
@@ -22,9 +23,11 @@ while IFS= read -r virustype || [[ -n "$virustype" ]]; do
     fi
 
     echo "----->>>>>Mapping reads against serotype "${virustype}" reference sequence"
+    which bwa
     bwa mem -v 1 -t 2 ${fasta} $read1 $read2 | samtools view -bS -F 4 -F 2048 | samtools sort -o ${tempdir}/${fname}.${virustype}.bam >> ${log} 2>&1
 
     echo "----->>>>>Trimming bam file"
+    which ivar
     ivar trim -e -i ${tempdir}/${fname}.${virustype}.bam -b ${bed} -p ${tempdir}/${fname}.${virustype}.trimmed.bam >> ${log} 2>&1
 
     if ! [ -s  ${tempdir}/${fname%.*}.${virustype}.trimmed.bam ]; then
@@ -42,7 +45,7 @@ while IFS= read -r virustype || [[ -n "$virustype" ]]; do
     samtools index ${tempdir}/${fname}.${virustype}.sort.bam >> ${log} 2>&1
 
     echo "----->>>>>Generating consensus sequence"
-    samtools mpileup -aa --reference ${fasta} -A -d 10000 -Q 0 ${tempdir}/${fname}.${virustype}.sort.bam | ivar consensus -t 0.75 -m ${depth} -p ${tempdir}/${fname}.${virustype}.${depth}.cons -i ${consensus_name} >> ${log} 2>&1
+    samtools mpileup -aa --reference ${fasta} -A -d 10000 -Q 0 ${tempdir}/${fname}.${virustype}.sort.bam | ivar consensus -t ${threshold} -m ${depth} -p ${tempdir}/${fname}.${virustype}.${depth}.cons -i ${consensus_name} >> ${log} 2>&1
 
     echo "----->>>>>Aligning consensus cps sequence against the reference serotype "${virustype}" cps sequence"
     nextalign run  --reference ${fasta} --output-fasta ${tempdir}/${fname}.${virustype}.${depth}.out.aln ${tempdir}/${fname}.${virustype}.${depth}.cons.fa >> ${log} 2>&1

diff --git a/denv_pipeline/scripts/visualisations.py b/denv_pipeline/scripts/visualisations.py
@@ -20,14 +20,16 @@ def prepare_for_plots(final_serotype_calls):
             all_viruses.add(l['serotype_called'])
 
     colour_dict = {}
-    lst = mpl.colormaps['viridis'](range(len(all_viruses)))
+    custom_cmap = mpl.colors.LinearSegmentedColormap.from_list("", ["#567CBE","#D58A80", "#ADB3D9"], len(all_viruses))
+
+    lst = custom_cmap(range(len(all_viruses)))
     for i,j in enumerate(all_viruses):
         colour_dict[j] = rgb2hex(lst[i])
 
     patch_list = []
     for serotype,hexc in colour_dict.items():
         patch_list.append(mpatches.Patch(color=hexc, label=serotype))
-
+ 
     return virus_dict, colour_dict, patch_list
 
 def variant_plot(results_dir, variants_summary_file, virus_dict, colour_dict, patch_list):
@@ -113,3 +115,4 @@ def ct_plot(results_dir, ct_file, ct_column, id_column, final_serotype_calls, vi
     plt.legend(handles=patch_list,fontsize=15,frameon=False)
 
     plt.savefig(os.path.join(results_dir, "ct_plot.pdf"), bbox_inches="tight")
+
diff --git a/denv_pipeline/utils/error_checks.py b/denv_pipeline/utils/error_checks.py
@@ -118,22 +118,22 @@ def check_primer_dir(config):
 def check_env_activated():
 
     if pkgutil.find_loader('snakemake') is None or pkgutil.find_loader('Bio') is None:
-        sys.stderr.write(green(f"Error: installation not correct. Ensure that environment is activated and you have run 'pip install .'"))
+        sys.stderr.write(green(f"Error: installation not correct. Ensure that environment is activated and you have run 'pip install .'\n"))
         sys.exit(-1)
 
 
 def check_ct_file(config):
 
     if config["ct_file"] and not config["ct_column"]:
-        sys.stderr.write(green(f"Error: ct_file specified but no ct_column for ct vs coverage plot. Please provide Ct column name and id column name."))
+        sys.stderr.write(green(f"Error: ct_file specified but no ct_column for ct vs coverage plot. Please provide Ct column name and id column name.\n"))
         sys.exit(-1)
 
     if config["ct_file"] and not config["id_column"]:
-        sys.stderr.write(green(f"Error: ct_file specified but no id_column for ct vs coverage plot. Please provide Ct column name and id column name."))
+        sys.stderr.write(green(f"Error: ct_file specified but no id_column for ct vs coverage plot. Please provide Ct column name and id column name.\n"))
         sys.exit(-1)
 
     if (config["ct_column"] or config["id_column"]) and not config["ct_file"]:
-        sys.stderr.write(green(f"Error: ct_column or id_column specified but no ct_file for ct vs coverage plot. Please provide file containing Ct information."))
+        sys.stderr.write(green(f"Error: ct_column or id_column specified but no ct_file for ct vs coverage plot. Please provide file containing Ct information.\n"))
         sys.exit(-1)
 
     if not os.path.exists(config["ct_file"]):
@@ -145,8 +145,15 @@ def check_ct_file(config):
         data = csv.DictReader(f)
         headers = data.fieldnames
         if config["ct_column"] not in headers:
-            sys.stderr.write(green(f"Error: {config['ct_column']} not found in ct_file"))
+            sys.stderr.write(green(f"Error: {config['ct_column']} not found in ct_file\n"))
             sys.exit(-1)
         if config["id_column"] not in headers:
-            sys.stderr.write(green(f"Error: {config['id_column']} not found in ct_file"))
-            sys.exit(-1)
+            sys.stderr.write(green(f"Error: {config['id_column']} not found in ct_file\n"))
+            sys.exit(-1)
+
+
+def check_threshold(config):
+
+    if float(config["threshold"]) > 1:
+        sys.stderr.write(green(f"Error: consensus threshold must be between 0 and 1\n"))
+        sys.exit(-1)
diff --git a/denv_pipeline/utils/set_up_scripts.py b/denv_pipeline/utils/set_up_scripts.py
@@ -13,6 +13,7 @@ def get_defaults(config):
     config["config"] = False
 
     config["depth"] = 10
+    config["threshold"] = 0.75
     config["tempdir"] = "temporary_files"
 
     config['ct_file'] = False
@@ -194,6 +195,7 @@ def get_valid_keys():
     valid_keys.append("outdir")
     valid_keys.append("reference_directory")
     valid_keys.append("depth")
+    valid_keys.append("threshold")
     valid_keys.append("temp")
     valid_keys.append("tempdir")
     valid_keys.append("download")

diff --git a/tests/dry_run_config.yml b/tests/dry_run_config.yml
@@ -1,4 +1,5 @@
 depth: 50
+threshold: 0.75
 ct_file: "test_ct_file.csv"
 ct_column: "test_ct"
 id_column: "test_id"