From 44907b84795acd4822916dea27f2eb0debcc5295 Mon Sep 17 00:00:00 2001
From: AstrobioMike <michael.lee0517@gmail.com>
Date: Tue, 9 Jan 2024 18:12:30 -0800
Subject: [PATCH] adding --directRNAseq option to `GL-est-rRNA-percentages`

---
 bin/GL-est-rRNA-percentages | 250 ++++++++++++++++++++++++++++++++++--
 1 file changed, 242 insertions(+), 8 deletions(-)

diff --git a/bin/GL-est-rRNA-percentages b/bin/GL-est-rRNA-percentages
index ed4420b..d8e940f 100755
--- a/bin/GL-est-rRNA-percentages
+++ b/bin/GL-est-rRNA-percentages
@@ -16,7 +16,7 @@ parser = argparse.ArgumentParser(description = "This is currently just an intern
                                                 It is not currently written to work properly anywhere other than GeneLab's main cluster, and is not robust to \
                                                 things like varying file extensions (it is expected to have files already renamed in GL's convention). \
                                                 For help reach out to Mike. For version info, run `GL-version`.",
-                                 epilog="Ex. usage: GL-est-rRNA-percentages -s samplex.txt --ref Mus-musculus --GLDS-ID GLDS-XXX --slurm\n")
+                                 epilog="Ex. usage: GL-est-rRNA-percentages -s samples.txt --ref Mus-musculus --GLDS-ID GLDS-XXX --slurm\n")
 
 required = parser.add_argument_group('required arguments')
 
@@ -49,6 +49,7 @@ parser.add_argument("--reads-to-scan", help = "In the case with some datasets, e
                                        choices = ["R1", "R2"])
 
 parser.add_argument("--single-ended", help = "Add this flag if data are single-end sequencing", action = "store_true")
+parser.add_argument("--directRNAseq", help = "Add this flag if data are actualy RNA sequences that have U's instead of T's", action = "store_true")
 
 parser.add_argument("--slurm", help = "Add this flag to submit the job through slurm", action = "store_true")
 parser.add_argument("--nodename", help = "Add this flag to specify slurm node to use", action = "store", default = "")
@@ -299,6 +300,7 @@ def build_snakemake_call():
     ref_dir = os.environ['rRNA_refs']
     GLDS_ID = args.GLDS_ID
     single_ended = args.single_ended
+    directRNAseq = args.directRNAseq
 
     if args.reads_to_scan == "R1":
         expected_suffix = "_R1_raw.fastq.gz"
@@ -363,25 +365,257 @@ def write_blurb():
 
     # getting version of htstream (the programs all give 1.3.2, but the installed conda version is 1.3.3, so getting that with this mess)
     htstream_version = subprocess.check_output('conda list | grep htstream | tr -s " " "\t" | cut -f 2', shell = True).decode('utf-8').strip()
-    # writing out versions used to file
-    with open(blurb_file, "w") as out_file:
+    
+    # writing out versions used to file, and processing blurb (which slightly varies if these were RNA sequences instead of cDNA)
+    if args.directRNAseq:
 
-        out_file.write(f"htstream v{htstream_version} utilized\n")
+        with open(blurb_file, "w") as out_file:
 
-        out_file.write("\nProtocol text:\n\n")
-        out_file.write(f"rRNA percentages of reads were estimated with the 'hts_SeqScreener' program of htstream v{htstream_version} screened against a fasta file of {args.ref} rRNA sequences retrieved from NCBI on {data_of_rRNA_refs_fetched}.\n\n")
+            out_file.write(f"htstream v{htstream_version} utilized\n")
+
+            out_file.write("\nProtocol text:\n\n")
+            out_file.write(f"rRNA percentages of reads were estimated with the 'hts_SeqScreener' program of htstream v{htstream_version} screened against a fasta file of {args.ref} rRNA sequences retrieved from NCBI on {data_of_rRNA_refs_fetched}.\n")
+            out_file.write(f"Since 'hts_SeqScreener' does not deal with uracil bases, they were first changed to thymines in temporary files just for the sake of rRNA estimation - original files remain unchanged.\n\n")
+
+
+    else:
+        
+        with open(blurb_file, "w") as out_file:
+
+            out_file.write(f"htstream v{htstream_version} utilized\n")
+
+            out_file.write("\nProtocol text:\n\n")
+            out_file.write(f"rRNA percentages of reads were estimated with the 'hts_SeqScreener' program of htstream v{htstream_version} screened against a fasta file of {args.ref} rRNA sequences retrieved from NCBI on {data_of_rRNA_refs_fetched}.\n\n")
 
 
 def write_snakefile(snakefile_text):
 
-    """ this creates the needed Snakefile based on the variable below """
+    """
+    This creates the needed Snakefile based on the variables below. There
+    are two possible versions, one for if the input files are actual RNA sequencing, 
+    so they have U's instead of T's, and one where the input are DNA.
+
+    If there are U's, hts_SeqScreener can't deal with them. So we are converting U's to T's prior to running.
+    """
 
     with open("Snakefile", "w") as out_file:
 
         out_file.write(snakefile_text)
 
 
-snakefile_text = '''
+# this version is if data are RNA, and therefore have U's instead of T's
+if args.directRNAseq:
+
+    snakefile_text = '''
+####################################################################################################
+## Snakefile for GeneLab for estimating percentages of rRNA in RNAseq datasets                    ##
+## This is currently written for internal use only, and likely won't work as-is for non-GL folks  ##
+## This Snakefile was generated by the genelab-utils `GL-est-rRNA-percentages` program            ##
+## For help contact Michael D. Lee (Mike.Lee@nasa.gov)                                            ##
+####################################################################################################
+
+import os
+import json
+import pandas as pd
+
+## reading samples into list and setting some variables from command-line parameters
+sample_ID_list = [line.strip() for line in open(config["sample_IDs_file"])]
+ref_dir = config["ref_dir"]
+primary_target_ref_filename = config["primary_target_ref_filename"]
+primary_target_ref_path = os.path.join(ref_dir, primary_target_ref_filename)
+logs_dir = config["logs_dir"]
+
+
+########################################
+############# Rules start ##############
+########################################
+
+rule all:
+    input:
+        config["GLDS_ID"] + "-estimated-rRNA-percentages.tsv"
+
+if config["single_ended"] != True:
+
+    if not config["expected_suffix"]:
+
+        rule hts_Seq_Screen:
+            input:
+                R1 = config["input_reads_dir"] + "{ID}_R1_raw.fastq.gz",
+                R2 = config["input_reads_dir"] + "{ID}_R2_raw.fastq.gz"
+
+            params:
+                temp_R1 = config["input_reads_dir"] + "{ID}_R1_raw-U-to-T.temp.fastq",
+                temp_R2 = config["input_reads_dir"] + "{ID}_R2_raw-U-to-T.temp.fastq"
+
+            output:
+                json_log = logs_dir + "/{ID}-rRNA-screen.log"
+
+            shell:
+                """
+                # decompressing and converting U's to T's
+                zcat {input.R1} | tr "U" "T" > {params.temp_R1}
+                zcat {input.R2} | tr "U" "T" > {params.temp_R2}
+
+                hts_SeqScreener -1 {params.temp_R1} -2 {params.temp_R2} -C -r -L {output.json_log} \
+                    | hts_SeqScreener -s {primary_target_ref_path} -C -r -A {output.json_log} \
+                    | hts_SeqScreener -s {ref_dir}/rrna-Hsapiens.fasta -C -r -A {output.json_log} \
+                    | hts_SeqScreener -s {ref_dir}/rrna-Escherichia.fasta -C -r -A {output.json_log} \
+                    | hts_SeqScreener -s {ref_dir}/rrna-Pseudomonas.fasta -C -r -A {output.json_log} \
+                    | hts_SeqScreener -s {ref_dir}/rrna-Staphylococcus.fasta -C -r -A {output.json_log} \
+                    | hts_SeqScreener -s {ref_dir}/rrna-Streptococcus.fasta -C -r -A {output.json_log} > /dev/null
+
+                # removing intermediate files
+                rm {params.temp_R1} {params.temp_R2}
+                """
+
+    else:
+
+        rule hts_Seq_Screen:
+            input:
+                reads = config["input_reads_dir"] + "{ID}" + config["expected_suffix"]
+
+            params:
+                temp_reads = config["input_reads_dir"] + "{ID}-U-to-T.temp" + config["expected_suffix"] + ".fastq"
+
+            output:
+                json_log = logs_dir + "/{ID}-rRNA-screen.log"
+
+            shell:
+                """
+                # decompressing and converting U's to T's
+                zcat {input.reads} | tr "U" "T" > {params.temp_reads}
+
+                hts_SeqScreener -U {params.temp_reads} -r -L {output.json_log} \
+                    | hts_SeqScreener -s {primary_target_ref_path} -r -A {output.json_log} \
+                    | hts_SeqScreener -s {ref_dir}/rrna-Hsapiens.fasta -r -A {output.json_log} \
+                    | hts_SeqScreener -s {ref_dir}/rrna-Escherichia.fasta -r -A {output.json_log} \
+                    | hts_SeqScreener -s {ref_dir}/rrna-Pseudomonas.fasta -r -A {output.json_log} \
+                    | hts_SeqScreener -s {ref_dir}/rrna-Staphylococcus.fasta -r -A {output.json_log} \
+                    | hts_SeqScreener -s {ref_dir}/rrna-Streptococcus.fasta -r -A {output.json_log} > /dev/null
+
+                # removing intermediate file
+                rm {params.temp_reads}
+                """
+
+else:
+
+    rule hts_Seq_Screen:
+        input:
+            reads = config["input_reads_dir"] + "{ID}_raw.fastq.gz"
+
+        params:
+            temp_reads = config["input_reads_dir"] + "{ID}_raw-U-to-T.temp.fastq"
+
+        output:
+            json_log = logs_dir + "/{ID}-rRNA-screen.log"
+
+        shell:
+            """
+            # decompressing and converting U's to T's
+            zcat {input.reads} | tr "U" "T" > {params.temp_reads}
+
+            hts_SeqScreener -U {params.temp_reads} -r -L {output.json_log} \
+                | hts_SeqScreener -s {primary_target_ref_path} -r -A {output.json_log} \
+                | hts_SeqScreener -s {ref_dir}/rrna-Hsapiens.fasta -r -A {output.json_log} \
+                | hts_SeqScreener -s {ref_dir}/rrna-Escherichia.fasta -r -A {output.json_log} \
+                | hts_SeqScreener -s {ref_dir}/rrna-Pseudomonas.fasta -r -A {output.json_log} \
+                | hts_SeqScreener -s {ref_dir}/rrna-Staphylococcus.fasta -r -A {output.json_log} \
+                | hts_SeqScreener -s {ref_dir}/rrna-Streptococcus.fasta -r -A {output.json_log} > /dev/null
+
+            # removing intermediate files
+            rm {params.temp_reads}
+            """
+
+
+rule combine_outputs:
+    input:
+        expand(logs_dir + "/{ID}-rRNA-screen.log", ID = sample_ID_list)
+
+    output:
+        out_file = config["GLDS_ID"] + "-estimated-rRNA-percentages.tsv"
+
+    run:
+        out_tab = make_tables(input)
+        out_tab.to_csv(output.out_file, sep = "\t", index = False)
+
+
+
+########################################
+############## Functions ###############
+########################################
+
+def make_tables(list_of_files):
+
+    """
+    this takes a list of input paths to the json log files, parses them and calculates percentages of hits
+    """
+
+    # starting empty list of rows we combine at end
+    list_of_rows = []
+
+    for input_file in list_of_files:
+
+        # reading in json log file
+        with open(input_file) as json_file:
+
+            log_json = json.load(json_file)
+
+        # getting sample ID
+        sample_name = log_json[0]["Program_details"]["options"]["stats-file"].replace("-rRNA-screen.log", "").replace(logs_dir, "").replace("/", "")
+
+        # getting number of reads in sample
+        num_sample_total_reads = log_json[0]["Fragment"]["in"]
+
+        # making dictionary to hold info for this sample
+        sample_dict = {"sample": sample_name, "num_reads": num_sample_total_reads}
+
+        # getting the hits to each target ref
+        for element in log_json:
+
+            # getting target ref
+            if element["Program_details"]["options"].get("seq") is not None:
+
+                target_ref = os.path.basename(element["Program_details"]["options"]["seq"]).replace(".fasta", "").replace("rrna-", "")
+
+            else:
+
+                target_ref = "phiX"
+
+            # getting hits
+            if config["single_ended"] == True or config["expected_suffix"]:
+
+                num_hits = element["Single_end"]["hits"]
+
+            else:
+
+                num_hits = element["Paired_end"]["hits"]
+
+
+            # getting percent of total
+            perc_hits = round(num_hits / num_sample_total_reads * 100, 2)
+
+            # adding to dictionary
+            sample_dict[f"perc_{target_ref}_rRNA_hits"] = perc_hits
+
+        # turning into dataframe (single row)
+        curr_tab = pd.DataFrame(sample_dict, index = [sample_name])
+
+        # combining with previous ones if any
+        list_of_rows.append(curr_tab)
+
+    # combining rows into one table
+    out_tab = pd.concat(list_of_rows)
+
+    # appending "-primary-target" to 4th column name
+    out_tab.rename(columns = {out_tab.columns[3]: out_tab.columns[3] + "-primary-target"}, inplace = True)
+
+    return(out_tab)
+'''
+
+# this version is if it is straight cDNA, and has no U's
+else:
+
+    snakefile_text = '''
 ####################################################################################################
 ## Snakefile for GeneLab for estimating percentages of rRNA in RNAseq datasets                    ##
 ## This is currently written for internal use only, and likely won't work as-is for non-GL folks  ##