Skip to content

Commit

Permalink
Merge pull request #1 from asaravia-butler/main
Browse files Browse the repository at this point in the history
Updates to accommodate updated GL processes
  • Loading branch information
AstrobioMike authored Jan 2, 2024
2 parents 03b4f0f + 0b2c2a3 commit 34d0511
Show file tree
Hide file tree
Showing 2 changed files with 75 additions and 12 deletions.
9 changes: 8 additions & 1 deletion bin/GL-est-rRNA-percentages
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ required.add_argument('--ref', help = "The primary target organism/clade",
'B-distachyon',
'R-norvegicus',
'L-heterotoma',
'L-boulardi',
'Escherichia',
'Mycobacterium',
'Pseudomonas',
Expand All @@ -50,6 +51,9 @@ parser.add_argument("--reads-to-scan", help = "In the case with some datasets, e
parser.add_argument("--single-ended", help = "Add this flag if data are single-end sequencing", action = "store_true")

parser.add_argument("--slurm", help = "Add this flag to submit the job through slurm", action = "store_true")
parser.add_argument("--nodename", help = "Add this flag to specify slurm node to use", action = "store", default = "")
parser.add_argument("--exclude_nodename", help = "Add this flag to specify slurm nodes to exclude. This overrides any --nodename supplied.", action = "store", default = "")
parser.add_argument("--queue", help = "Add this flag to specify slurm queue to use", action = "store", default = "normal")


if len(sys.argv)==1:
Expand All @@ -72,7 +76,8 @@ expected_reference_dict = {"A-thaliana": "rrna-Athaliana.fasta",
"Mus-musculus": "rrna-Mmusculus.fasta",
"R-norvegicus": "rrna-Rnorvegicus.fasta",
"Streptococcus": "rrna-Streptococcus.fasta",
"L-heterotoma": "rrna-Lheterotoma.fasta"}
"L-heterotoma": "rrna-Lheterotoma.fasta",
"L-boulardi": "rrna-Lboulardi.fasta"}

# setting date retrieved (will eventually need to adjust/efficiency-ize this as more are added)
if args.ref == "Mycobacterium":
Expand All @@ -81,6 +86,8 @@ elif args.ref == "G-hirsutum":
data_of_rRNA_refs_fetched = "10-Apr-2023"
elif args.ref == "L-heterotoma":
data_of_rRNA_refs_fetched = "14-Jun-2023"
elif args.ref == "L-boulardi":
data_of_rRNA_refs_fetched = "19-Oct-2023"
else:
data_of_rRNA_refs_fetched = "01-Sep-2021"

Expand Down
78 changes: 67 additions & 11 deletions bin/GL-validate-raw-data
Original file line number Diff line number Diff line change
Expand Up @@ -14,24 +14,52 @@ import re
import subprocess
import shutil

# Supported assay types (the values accepted by --assay) mapped to their
# GL-prefixed suffix; the suffix is appended to the multiqc output prefix.
assay_suffixes = dict(
    microarray="GLmicroarray",
    bulkRNAseq="GLbulkRNAseq",
    MethylSeq="GLMethylSeq",
    scRNAseq="GLscRNAseq",
    snRNAseq="GLsnRNAseq",
    scATACseq="GLscATACseq",
    snATACseq="GLsnATACseq",
    smallRNAseq="GLsmallRNAseq",
    AmpSeq="GLAmpSeq",
    metagenomics="GLmetagenomics",
    metatranscriptomics="GLmetatranscriptomics",
    targetSeq="GLtargetSeq",
    targetRNAseq="GLtargetRNAseq",
    nanoporeRNAseq="GLnanoporeRNAseq",
    ST="GLSpatialTranscriptomics",
    WGS="GLwgs",
    proteomics="GLproteomics",
    metabolomics="GLmetabolomics",
)

parser = argparse.ArgumentParser(description = "This program validates GeneLab raw datasets, renames files if needed \
to follow GL convention, checks and/or creates md5s, and \
runs fastqc and combines its outputs with multiqc. It is meant to be \
executed in the directory holding the read files. \
For version info, run `GL-version`.",
epilog = "Ex. usage: GL-validate-raw-data -g GLDS-480 -n 3")
epilog = "Ex. usage: GL-validate-raw-data -g GLDS-480 -n 3 -a bulkRNAseq")

# Arguments that must always be supplied; grouped so --help lists them under
# their own "required arguments" heading.
required = parser.add_argument_group('required arguments')

required.add_argument("-g", "--GLDS-ID", help = 'GLDS ID (e.g. "GLDS-480")', action = "store", required = True)
required.add_argument("-n", "--number-of-samples", help = "The expected number of samples here", action = "store", type = int, required = True)
# --assay is enforced by argparse itself (required + choices): the value is used
# as a key into assay_suffixes at module level, so a missing or unknown assay
# would otherwise raise a KeyError before any friendlier check could run.
# metavar keeps the usage line short instead of expanding every choice.
required.add_argument("-a", "--assay", help = f"Specify the assay type. Available options are: {', '.join(assay_suffixes.keys())}.", action = "store", type = str,
                      choices = list(assay_suffixes), metavar = "ASSAY", required = True)

# Optional arguments.

# md5 handling
parser.add_argument("-m", "--md5-file", help = "Include a file holding all md5sum results if available (one will be created if not, or if the filenames are changed)",
                    action = "store", default = "")

# read layout / library type
parser.add_argument("-s", "--single-ended", help = "Add this flag if data are single-end sequencing", action = "store_true")
parser.add_argument("--ATACseq", help = "Add this flag if the data are ATACseq (in which case R3 is expected to be the reverse read, and R2 holds barcodes)", action = "store_true")
parser.add_argument("-k", "--keep-fastqc-files", help = "Add this flag if wanting to keep individual-sample fastqc files", action = "store_true")

# slurm submission controls
parser.add_argument("--slurm", help = "Add this flag to submit the job through slurm", action = "store_true")
parser.add_argument("--nodename", help = "Add this flag to specify slurm node to use", action = "store", default = "")
# NOTE(review): an empty --mem falls back to 20000 in the slurm setup; a supplied
# value is passed through as a string — confirm the units users are expected to give.
parser.add_argument("--mem", help = "Add this flag to specify the amount of memory to use", action = "store", default = "")
parser.add_argument("--exclude_nodename", help = "Add this flag to specify slurm nodes to exclude. This overrides any --nodename supplied.", action = "store", default = "")
parser.add_argument("--queue", help = "Add this flag to specify slurm queue to use", action = "store", default = "normal")

# switches the expected standard GL read-file suffixes to their "HRremoved" variants
parser.add_argument('--HRremoved-suffix', action='store_true',
                    help='Set this flag to indicate that "HRremoved" suffix is expected.')

if len(sys.argv)==1:
parser.print_help(sys.stderr)
Expand All @@ -40,21 +68,22 @@ if len(sys.argv)==1:
args = parser.parse_args()

# currently hard-coded things we may want to adjust
R1_designations = ["_R1_", "_R1.", "-R1.", "-R1-"]
R2_designations = ["_R2_", "_R2.", "-R2.", "-R2-"]
R1_designations = ["_R1_", "_R1.", "-R1.", "-R1-", ".R1.", "_1."]
R2_designations = ["_R2_", "_R2.", "-R2.", "-R2-", ".R2.", "_2."]
extensions = [".fq", ".fastq"]
standard_GL_R1_suffix = "_R1_raw.fastq.gz"
standard_GL_R2_suffix = "_R2_raw.fastq.gz"
standard_GL_SE_suffix = "_raw.fastq.gz"
standard_GL_R1_suffix = "_R1_raw.fastq.gz" if not args.HRremoved_suffix else "_R1_HRremoved_raw.fastq.gz"
standard_GL_R2_suffix = "_R2_raw.fastq.gz" if not args.HRremoved_suffix else "_R2_HRremoved_raw.fastq.gz"
standard_GL_SE_suffix = "_raw.fastq.gz" if not args.HRremoved_suffix else "_HRremoved_raw.fastq.gz"

# additional ones for ATACseq
R3_designations = ["_R3_", "_R3.", "-R3.", "-R3-"]
standard_GL_R3_suffix = "_R3_raw.fastq.gz"
R3_designations = ["_R3_", "_R3.", "-R3.", "-R3-", ".R3.", "_3."]
standard_GL_R3_suffix = "_R3_raw.fastq.gz" # ADD THIS IF ATACseq also needs to allow `HRremoved` in suffix at user request: if not args.HRremoved_suffix else "_R3_HRremoved_raw.fastq.gz"
print(f"Standard GL Suffixes: R1: '{standard_GL_R1_suffix}'; R2: '{standard_GL_R2_suffix}'; R_SE: '{standard_GL_SE_suffix}'; R3 (ATACseq) '{standard_GL_R3_suffix}'")

md5_output_file = "raw_md5sum.txt"
md5_check_output_file = str(args.GLDS_ID) + "-raw-md5-check-results.txt"
fastqc_threads = 8 # allocates 250 MB memory per thread, so 8 is 2 GB
multiqc_output_prefix = "raw_multiqc"
multiqc_output_prefix = "_".join(["raw_multiqc", assay_suffixes[args.assay]])
multiqc_data_dir = multiqc_output_prefix + "_data"
multiqc_html = multiqc_output_prefix + ".html"
# multiqc_html = multiqc_output_prefix + "_report.html"
Expand All @@ -73,7 +102,22 @@ if args.slurm:
sbatch_file = str(args.GLDS_ID) + "-raw-validation.slurm"
slurm_out_file = str(args.GLDS_ID) + "-raw-validation-slurm.out"
slurm_job_name = str(args.GLDS_ID) + "-raw-validation"
slurm_mem = 3000
slurm_queue = str(args.queue)

# This section makes memory allocated via the --mem option override the default of 20GB
if args.mem:
slurm_mem = str(args.mem)
else:
slurm_mem = 20000

# This section makes `exclusions` override any supplied nodelist!
if args.exclude_nodename:
slurm_node_exclusions = str(args.exclude_nodename)
slurm_node = False
else:
slurm_node = str(args.nodename)
slurm_node_exclusions = False


################################################################################

Expand All @@ -85,6 +129,10 @@ def main():
print_notification("Arguments '--single-ended' and '--ATACseq' can't both be specified. Exiting for now.")
sys.exit(1)

if args.assay not in assay_suffixes.keys():
print_notification(f"Argument --assay must be one of the following: {', '.join(assay_suffixes.keys())}.")
sys.exit(1)

if args.slurm:

# pre-flight check so things aren't passed to slurm before finding a problem
Expand Down Expand Up @@ -1250,6 +1298,9 @@ def run_fastqc_and_multiqc(map_dict, problem_files_list):

print_notification("Running fastqc...")

# set JAVA options to allow fastqc to use up to 8GB of heap memory
os.environ["_JAVA_OPTIONS"] = "-Xmx8g"

# making list of all files
list_of_all_files = []

Expand Down Expand Up @@ -1660,8 +1711,13 @@ def submit_slurm():

out.write('#SBATCH --job-name=' + str(slurm_job_name) + "\n")
out.write('#SBATCH --output=' + str(slurm_out_file) + "\n")
if slurm_node_exclusions:
out.write('#SBATCH --exclude=' + str(slurm_node_exclusions) + "\n")
if slurm_node:
out.write('#SBATCH --nodelist=' + str(slurm_node) + "\n")
out.write('#SBATCH --partition=' + str(slurm_queue) + "\n")
out.write('#SBATCH --mem=' + str(slurm_mem) + "\n\n")

out.write('# sourcing user profile\n')
out.write('. ~/.profile\n\n')

Expand Down

0 comments on commit 34d0511

Please sign in to comment.