-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adding --directRNAseq option to
GL-est-rRNA-percentages
- Loading branch information
1 parent
eda608c
commit 44907b8
Showing
1 changed file
with
242 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,7 +16,7 @@ parser = argparse.ArgumentParser(description = "This is currently just an intern | |
It is not currently written to work properly anywhere other than GeneLab's main cluster, and is not robust to \ | ||
things like varying file extensions (it is expected to have files already renamed in GL's convention). \ | ||
For help reach out to Mike. For version info, run `GL-version`.", | ||
epilog="Ex. usage: GL-est-rRNA-percentages -s samplex.txt --ref Mus-musculus --GLDS-ID GLDS-XXX --slurm\n") | ||
epilog="Ex. usage: GL-est-rRNA-percentages -s samples.txt --ref Mus-musculus --GLDS-ID GLDS-XXX --slurm\n") | ||
|
||
required = parser.add_argument_group('required arguments') | ||
|
||
|
@@ -49,6 +49,7 @@ parser.add_argument("--reads-to-scan", help = "In the case with some datasets, e | |
choices = ["R1", "R2"]) | ||
|
||
parser.add_argument("--single-ended", help = "Add this flag if data are single-end sequencing", action = "store_true") | ||
parser.add_argument("--directRNAseq", help = "Add this flag if data are actualy RNA sequences that have U's instead of T's", action = "store_true") | ||
|
||
parser.add_argument("--slurm", help = "Add this flag to submit the job through slurm", action = "store_true") | ||
parser.add_argument("--nodename", help = "Add this flag to specify slurm node to use", action = "store", default = "") | ||
|
@@ -299,6 +300,7 @@ def build_snakemake_call(): | |
ref_dir = os.environ['rRNA_refs'] | ||
GLDS_ID = args.GLDS_ID | ||
single_ended = args.single_ended | ||
directRNAseq = args.directRNAseq | ||
|
||
if args.reads_to_scan == "R1": | ||
expected_suffix = "_R1_raw.fastq.gz" | ||
|
@@ -363,25 +365,257 @@ def write_blurb(): | |
|
||
# getting version of htstream (the programs all give 1.3.2, but the installed conda version is 1.3.3, so getting that with this mess) | ||
htstream_version = subprocess.check_output('conda list | grep htstream | tr -s " " "\t" | cut -f 2', shell = True).decode('utf-8').strip() | ||
# writing out versions used to file | ||
with open(blurb_file, "w") as out_file: | ||
|
||
# writing out versions used to file, and processing blurb (which slightly varies if these were RNA sequences instead of cDNA) | ||
if args.directRNAseq: | ||
|
||
out_file.write(f"htstream v{htstream_version} utilized\n") | ||
with open(blurb_file, "w") as out_file: | ||
|
||
out_file.write("\nProtocol text:\n\n") | ||
out_file.write(f"rRNA percentages of reads were estimated with the 'hts_SeqScreener' program of htstream v{htstream_version} screened against a fasta file of {args.ref} rRNA sequences retrieved from NCBI on {data_of_rRNA_refs_fetched}.\n\n") | ||
out_file.write(f"htstream v{htstream_version} utilized\n") | ||
|
||
out_file.write("\nProtocol text:\n\n") | ||
out_file.write(f"rRNA percentages of reads were estimated with the 'hts_SeqScreener' program of htstream v{htstream_version} screened against a fasta file of {args.ref} rRNA sequences retrieved from NCBI on {data_of_rRNA_refs_fetched}.\n") | ||
out_file.write(f"Since 'hts_SeqScreener' does not deal with uracil bases, they were first changed to thymines in temporary files just for the sake of rRNA estimation - original files remain unchanged.\n\n") | ||
|
||
|
||
else: | ||
|
||
with open(blurb_file, "w") as out_file: | ||
|
||
out_file.write(f"htstream v{htstream_version} utilized\n") | ||
|
||
out_file.write("\nProtocol text:\n\n") | ||
out_file.write(f"rRNA percentages of reads were estimated with the 'hts_SeqScreener' program of htstream v{htstream_version} screened against a fasta file of {args.ref} rRNA sequences retrieved from NCBI on {data_of_rRNA_refs_fetched}.\n\n") | ||
|
||
|
||
def write_snakefile(snakefile_text): | ||
|
||
""" this creates the needed Snakefile based on the variable below """ | ||
""" | ||
This creates the needed Snakefile based on the variables below. There | ||
are two possible versions, one for if the input files are actual RNA sequencing, | ||
so they have U's instead of T's, and one where the input are DNA. | ||
If there are U's, hts_SeqScreener can't deal with them. So we are converting U's to T's prior to running. | ||
""" | ||
|
||
with open("Snakefile", "w") as out_file: | ||
|
||
out_file.write(snakefile_text) | ||
|
||
|
||
snakefile_text = ''' | ||
# this version is if data are RNA, and therefore have U's instead of T's | ||
if args.directRNAseq: | ||
|
||
snakefile_text = ''' | ||
#################################################################################################### | ||
## Snakefile for GeneLab for estimating percentages of rRNA in RNAseq datasets ## | ||
## This is currently written for internal use only, and likely won't work as-is for non-GL folks ## | ||
## This Snakefile was generated by the genelab-utils `GL-est-rRNA-percentages` program ## | ||
## For help contact Michael D. Lee ([email protected]) ## | ||
#################################################################################################### | ||
import os | ||
import json | ||
import pandas as pd | ||
## reading samples into list and setting some variables from command-line parameters | ||
sample_ID_list = [line.strip() for line in open(config["sample_IDs_file"])] | ||
ref_dir = config["ref_dir"] | ||
primary_target_ref_filename = config["primary_target_ref_filename"] | ||
primary_target_ref_path = os.path.join(ref_dir, primary_target_ref_filename) | ||
logs_dir = config["logs_dir"] | ||
######################################## | ||
############# Rules start ############## | ||
######################################## | ||
rule all: | ||
input: | ||
config["GLDS_ID"] + "-estimated-rRNA-percentages.tsv" | ||
if config["single_ended"] != True: | ||
if not config["expected_suffix"]: | ||
rule hts_Seq_Screen: | ||
input: | ||
R1 = config["input_reads_dir"] + "{ID}_R1_raw.fastq.gz", | ||
R2 = config["input_reads_dir"] + "{ID}_R2_raw.fastq.gz" | ||
params: | ||
temp_R1 = config["input_reads_dir"] + "{ID}_R1_raw-U-to-T.temp.fastq", | ||
temp_R2 = config["input_reads_dir"] + "{ID}_R2_raw-U-to-T.temp.fastq" | ||
output: | ||
json_log = logs_dir + "/{ID}-rRNA-screen.log" | ||
shell: | ||
""" | ||
# decompressing and converting U's to T's | ||
zcat {input.R1} | tr "U" "T" > {params.temp_R1} | ||
zcat {input.R2} | tr "U" "T" > {params.temp_R2} | ||
hts_SeqScreener -1 {params.temp_R1} -2 {params.temp_R2} -C -r -L {output.json_log} \ | ||
| hts_SeqScreener -s {primary_target_ref_path} -C -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Hsapiens.fasta -C -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Escherichia.fasta -C -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Pseudomonas.fasta -C -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Staphylococcus.fasta -C -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Streptococcus.fasta -C -r -A {output.json_log} > /dev/null | ||
# removing intermediate files | ||
rm {params.temp_R1} {params.temp_R2} | ||
""" | ||
else: | ||
rule hts_Seq_Screen: | ||
input: | ||
reads = config["input_reads_dir"] + "{ID}" + config["expected_suffix"] | ||
params: | ||
temp_reads = config["input_reads_dir"] + "{ID}-U-to-T.temp" + config["expected_suffix"] + ".fastq" | ||
output: | ||
json_log = logs_dir + "/{ID}-rRNA-screen.log" | ||
shell: | ||
""" | ||
# decompressing and converting U's to T's | ||
zcat {input.reads} | tr "U" "T" > {params.temp_reads} | ||
hts_SeqScreener -U {params.temp_reads} -r -L {output.json_log} \ | ||
| hts_SeqScreener -s {primary_target_ref_path} -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Hsapiens.fasta -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Escherichia.fasta -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Pseudomonas.fasta -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Staphylococcus.fasta -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Streptococcus.fasta -r -A {output.json_log} > /dev/null | ||
# removing intermediate file | ||
rm {params.temp_reads} | ||
""" | ||
else: | ||
rule hts_Seq_Screen: | ||
input: | ||
reads = config["input_reads_dir"] + "{ID}_raw.fastq.gz" | ||
params: | ||
temp_reads = config["input_reads_dir"] + "{ID}_raw-U-to-T.temp.fastq" | ||
output: | ||
json_log = logs_dir + "/{ID}-rRNA-screen.log" | ||
shell: | ||
""" | ||
# decompressing and converting U's to T's | ||
zcat {input.reads} | tr "U" "T" > {params.temp_reads} | ||
hts_SeqScreener -U {params.temp_reads} -r -L {output.json_log} \ | ||
| hts_SeqScreener -s {primary_target_ref_path} -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Hsapiens.fasta -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Escherichia.fasta -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Pseudomonas.fasta -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Staphylococcus.fasta -r -A {output.json_log} \ | ||
| hts_SeqScreener -s {ref_dir}/rrna-Streptococcus.fasta -r -A {output.json_log} > /dev/null | ||
# removing intermediate files | ||
rm {params.temp_reads} | ||
""" | ||
rule combine_outputs: | ||
input: | ||
expand(logs_dir + "/{ID}-rRNA-screen.log", ID = sample_ID_list) | ||
output: | ||
out_file = config["GLDS_ID"] + "-estimated-rRNA-percentages.tsv" | ||
run: | ||
out_tab = make_tables(input) | ||
out_tab.to_csv(output.out_file, sep = "\t", index = False) | ||
######################################## | ||
############## Functions ############### | ||
######################################## | ||
def make_tables(list_of_files): | ||
""" | ||
this takes a list of input paths to the json log files, parses them and calculates percentages of hits | ||
""" | ||
# starting empty list of rows we combine at end | ||
list_of_rows = [] | ||
for input_file in list_of_files: | ||
# reading in json log file | ||
with open(input_file) as json_file: | ||
log_json = json.load(json_file) | ||
# getting sample ID | ||
sample_name = log_json[0]["Program_details"]["options"]["stats-file"].replace("-rRNA-screen.log", "").replace(logs_dir, "").replace("/", "") | ||
# getting number of reads in sample | ||
num_sample_total_reads = log_json[0]["Fragment"]["in"] | ||
# making dictionary to hold info for this sample | ||
sample_dict = {"sample": sample_name, "num_reads": num_sample_total_reads} | ||
# getting the hits to each target ref | ||
for element in log_json: | ||
# getting target ref | ||
if element["Program_details"]["options"].get("seq") is not None: | ||
target_ref = os.path.basename(element["Program_details"]["options"]["seq"]).replace(".fasta", "").replace("rrna-", "") | ||
else: | ||
target_ref = "phiX" | ||
# getting hits | ||
if config["single_ended"] == True or config["expected_suffix"]: | ||
num_hits = element["Single_end"]["hits"] | ||
else: | ||
num_hits = element["Paired_end"]["hits"] | ||
# getting percent of total | ||
perc_hits = round(num_hits / num_sample_total_reads * 100, 2) | ||
# adding to dictionary | ||
sample_dict[f"perc_{target_ref}_rRNA_hits"] = perc_hits | ||
# turning into dataframe (single row) | ||
curr_tab = pd.DataFrame(sample_dict, index = [sample_name]) | ||
# combining with previous ones if any | ||
list_of_rows.append(curr_tab) | ||
# combining rows into one table | ||
out_tab = pd.concat(list_of_rows) | ||
# appending "-primary-target" to 4th column name | ||
out_tab.rename(columns = {out_tab.columns[3]: out_tab.columns[3] + "-primary-target"}, inplace = True) | ||
return(out_tab) | ||
''' | ||
|
||
# this version is if it is straight cDNA, and has no U's | ||
else: | ||
|
||
snakefile_text = ''' | ||
#################################################################################################### | ||
## Snakefile for GeneLab for estimating percentages of rRNA in RNAseq datasets ## | ||
## This is currently written for internal use only, and likely won't work as-is for non-GL folks ## | ||
|