Merge pull request #1 from asaravia-butler/main

Updates to accommodate updated GL processes
AstrobioMike · Jan 2, 2024 · 34d0511 · 34d0511
2 parents 03b4f0f + 0b2c2a3
commit 34d0511
Show file tree

Hide file tree

Showing 2 changed files with 75 additions and 12 deletions.
diff --git a/bin/GL-est-rRNA-percentages b/bin/GL-est-rRNA-percentages
@@ -31,6 +31,7 @@ required.add_argument('--ref', help = "The primary target organism/clade",
                                                                     'B-distachyon',
                                                                     'R-norvegicus',
                                                                     'L-heterotoma',
+                                                                    'L-boulardi',
                                                                     'Escherichia',
                                                                     'Mycobacterium',
                                                                     'Pseudomonas',
@@ -50,6 +51,9 @@ parser.add_argument("--reads-to-scan", help = "In the case with some datasets, e
 parser.add_argument("--single-ended", help = "Add this flag if data are single-end sequencing", action = "store_true")
 
 parser.add_argument("--slurm", help = "Add this flag to submit the job through slurm", action = "store_true")
+parser.add_argument("--nodename", help = "Add this flag to specify slurm node to use", action = "store", default = "")
+parser.add_argument("--exclude_nodename", help = "Add this flag to specify slurm nodes to exclude. This overrides any --nodename supplied.", action = "store", default = "")
+parser.add_argument("--queue", help = "Add this flag to specify slurm queue to use", action = "store", default = "normal")
 
 
 if len(sys.argv)==1:
@@ -72,7 +76,8 @@ expected_reference_dict = {"A-thaliana": "rrna-Athaliana.fasta",
                            "Mus-musculus": "rrna-Mmusculus.fasta",
                            "R-norvegicus": "rrna-Rnorvegicus.fasta",
                            "Streptococcus": "rrna-Streptococcus.fasta",
-                           "L-heterotoma": "rrna-Lheterotoma.fasta"}
+                           "L-heterotoma": "rrna-Lheterotoma.fasta",
+                           "L-boulardi": "rrna-Lboulardi.fasta"}
 
 # setting date retrieved (will eventually need to adjust/efficiency-ize this as more are added)
 if args.ref == "Mycobacterium":
@@ -81,6 +86,8 @@ elif args.ref == "G-hirsutum":
     data_of_rRNA_refs_fetched = "10-Apr-2023"
 elif args.ref == "L-heterotoma":
     data_of_rRNA_refs_fetched = "14-Jun-2023"
+elif args.ref == "L-boulardi":
+    data_of_rRNA_refs_fetched = "19-Oct-2023"
 else:
     data_of_rRNA_refs_fetched = "01-Sep-2021"
 

diff --git a/bin/GL-validate-raw-data b/bin/GL-validate-raw-data
@@ -14,24 +14,52 @@ import re
 import subprocess
 import shutil
 
+assay_suffixes = {
+    "microarray": "GLmicroarray",
+    "bulkRNAseq": "GLbulkRNAseq",
+    "MethylSeq": "GLMethylSeq",
+    "scRNAseq": "GLscRNAseq",
+    "snRNAseq": "GLsnRNAseq",
+    "scATACseq": "GLscATACseq",
+    "snATACseq": "GLsnATACseq",
+    "smallRNAseq": "GLsmallRNAseq",
+    "AmpSeq": "GLAmpSeq",
+    "metagenomics": "GLmetagenomics",
+    "metatranscriptomics": "GLmetatranscriptomics",
+    "targetSeq": "GLtargetSeq",
+    "targetRNAseq": "GLtargetRNAseq",
+    "nanoporeRNAseq": "GLnanoporeRNAseq",
+    "ST": "GLSpatialTranscriptomics",
+    "WGS": "GLwgs",
+    "proteomics": "GLproteomics",
+    "metabolomics": "GLmetabolomics"
+}
+
 parser = argparse.ArgumentParser(description = "This program validates GeneLab raw datasets, renames files if needed \
                                                 to follow GL convention, checks and/or creates md5s, and \
                                                 runs fastqc and combines its outputs with multiqc. It is meant to be \
                                                 executed in the directory holding the read files. \
                                                 For version info, run `GL-version`.",
-                                 epilog = "Ex. usage: GL-validate-raw-data -g GLDS-480 -n 3")
+                                 epilog = "Ex. usage: GL-validate-raw-data -g GLDS-480 -n 3 -a bulkRNAseq")
 
 required = parser.add_argument_group('required arguments')
 
 required.add_argument("-g", "--GLDS-ID", help = 'GLDS ID (e.g. "GLDS-480")', action = "store", required = True)
 required.add_argument("-n", "--number-of-samples", help = "The expected number of samples here", action = "store", type = int, required = True)
+required.add_argument("-a", "--assay", help = f"Specify the assay type. Available options are: {', '.join(assay_suffixes.keys())}.", action = "store", type = str, choices = list(assay_suffixes), required = True)  # validated by argparse so the module-level assay_suffixes[args.assay] lookup can't raise a KeyError
 
 parser.add_argument("-m", "--md5-file", help = "Include a file holding all md5sum results if avaiable (one will be created if not, or if the filenames are changed)", \
                     action = "store", default = "")
 parser.add_argument("-s", "--single-ended", help = "Add this flag if data are single-end sequencing", action = "store_true")
 parser.add_argument("--ATACseq", help = "Add this flag if the data are ATACseq (in which case R3 is expected to be the reverse read, and R2 holds barcodes)", action = "store_true")
 parser.add_argument("-k", "--keep-fastqc-files", help = "Add this flag if wanting to keep individual-sample fastqc files", action = "store_true")
 parser.add_argument("--slurm", help = "Add this flag to submit the job through slurm", action = "store_true")
+parser.add_argument("--nodename", help = "Add this flag to specify slurm node to use", action = "store", default = "")
+parser.add_argument("--mem", help = "Add this flag to specify the amount of memory to use", action = "store", default = "")
+parser.add_argument("--exclude_nodename", help = "Add this flag to specify slurm nodes to exclude. This overrides any --nodename supplied.", action = "store", default = "")
+parser.add_argument("--queue", help = "Add this flag to specify slurm queue to use", action = "store", default = "normal")
+parser.add_argument('--HRremoved-suffix', action='store_true', 
+                    help='Set this flag to indicate that "HRremoved" suffix is expected.')
 
 if len(sys.argv)==1:
     parser.print_help(sys.stderr)
@@ -40,21 +68,22 @@ if len(sys.argv)==1:
 args = parser.parse_args()
 
 # currently hard-coded things we may want to adjust
-R1_designations = ["_R1_", "_R1.", "-R1.", "-R1-"]
-R2_designations = ["_R2_", "_R2.", "-R2.", "-R2-"]
+R1_designations = ["_R1_", "_R1.", "-R1.", "-R1-", ".R1.", "_1."]
+R2_designations = ["_R2_", "_R2.", "-R2.", "-R2-", ".R2.", "_2."]
 extensions = [".fq", ".fastq"]
-standard_GL_R1_suffix = "_R1_raw.fastq.gz"
-standard_GL_R2_suffix = "_R2_raw.fastq.gz"
-standard_GL_SE_suffix = "_raw.fastq.gz"
+standard_GL_R1_suffix = "_R1_raw.fastq.gz" if not args.HRremoved_suffix else "_R1_HRremoved_raw.fastq.gz"
+standard_GL_R2_suffix = "_R2_raw.fastq.gz" if not args.HRremoved_suffix else "_R2_HRremoved_raw.fastq.gz"
+standard_GL_SE_suffix = "_raw.fastq.gz" if not args.HRremoved_suffix else "_HRremoved_raw.fastq.gz"
 
 # additional ones for ATACseq
-R3_designations = ["_R3_", "_R3.", "-R3.", "-R3-"]
-standard_GL_R3_suffix = "_R3_raw.fastq.gz"
+R3_designations = ["_R3_", "_R3.", "-R3.", "-R3-", ".R3.", "_3."]
+standard_GL_R3_suffix = "_R3_raw.fastq.gz" # NOTE: --HRremoved-suffix is intentionally not applied to R3 (ATACseq); if that is ever requested, append: if not args.HRremoved_suffix else "_R3_HRremoved_raw.fastq.gz"
+print(f"Standard GL Suffixes: R1: '{standard_GL_R1_suffix}'; R2: '{standard_GL_R2_suffix}'; R_SE: '{standard_GL_SE_suffix}'; R3 (ATACseq) '{standard_GL_R3_suffix}'")
 
 md5_output_file = "raw_md5sum.txt"
 md5_check_output_file = str(args.GLDS_ID) + "-raw-md5-check-results.txt"
 fastqc_threads = 8 # allocates 250 MB memory per thread, so 8 is 2 GB
-multiqc_output_prefix = "raw_multiqc"
+multiqc_output_prefix = "_".join(["raw_multiqc", assay_suffixes[args.assay]])
 multiqc_data_dir = multiqc_output_prefix + "_data"
 multiqc_html = multiqc_output_prefix + ".html"
 # multiqc_html = multiqc_output_prefix + "_report.html"
@@ -73,7 +102,22 @@ if args.slurm:
     sbatch_file = str(args.GLDS_ID) + "-raw-validation.slurm"
     slurm_out_file = str(args.GLDS_ID) + "-raw-validation-slurm.out"
     slurm_job_name = str(args.GLDS_ID) + "-raw-validation"
-    slurm_mem = 3000
+    slurm_queue = str(args.queue)
+
+    # This section makes memory allocated via the --mem option override the default of 20GB
+    if args.mem:
+        slurm_mem = str(args.mem)
+    else:
+        slurm_mem = 20000
+
+    # This section makes `--exclude_nodename` override any supplied `--nodename`
+    if args.exclude_nodename:
+        slurm_node_exclusions = str(args.exclude_nodename)
+        slurm_node = False
+    else:
+        slurm_node = str(args.nodename)
+        slurm_node_exclusions = False
+
 
 ################################################################################
 
@@ -85,6 +129,10 @@ def main():
         print_notification("Arguments '--single-ended' and '--ATACseq' can't both be specified. Exiting for now.")
         sys.exit(1)
 
+    if args.assay not in assay_suffixes.keys():
+        print_notification(f"Argument --assay must be one of the following: {', '.join(assay_suffixes.keys())}.")
+        sys.exit(1)
+
     if args.slurm:
 
         # pre-flight check so things aren't passed to slurm before finding a problem
@@ -1250,6 +1298,9 @@ def run_fastqc_and_multiqc(map_dict, problem_files_list):
 
     print_notification("Running fastqc...")
 
+    # set JAVA options to allow fastqc to use up to 8GB of heap memory
+    os.environ["_JAVA_OPTIONS"] = "-Xmx8g"
+
     # making list of all files
     list_of_all_files = []
 
@@ -1660,8 +1711,13 @@ def submit_slurm():
 
         out.write('#SBATCH --job-name=' + str(slurm_job_name) + "\n")
         out.write('#SBATCH --output=' + str(slurm_out_file) + "\n")
+        if slurm_node_exclusions:
+            out.write('#SBATCH --exclude=' + str(slurm_node_exclusions) + "\n")
+        if slurm_node:
+            out.write('#SBATCH --nodelist=' + str(slurm_node) + "\n")
+        out.write('#SBATCH --partition=' + str(slurm_queue) + "\n")
         out.write('#SBATCH --mem=' + str(slurm_mem) + "\n\n")
-
+	
         out.write('# sourcing user profile\n')
         out.write('. ~/.profile\n\n')