Skip to content

Commit

Permalink
Adding Tests, prototype model, and refactoring (#40)
Browse files Browse the repository at this point in the history
* Added in the mas8 prototype model for completeness.
* Added version field to Longbow Models for reproducibility.
- Added segment scores calculation for each non-random longbow modeled segment
- Segment scores quality now added to output reads in new tag (XQ)
- Overall segment score quality now added to output reads in new tag (YQ).  This can be used as a proxy for read quality.
- Updated test-requirements.txt to include all packages required to run tests
- Added integration test for annotate, including test data and expected data
- Added test utils module
- Added integration test setting in tox
  • Loading branch information
jonn-smith authored May 8, 2021
1 parent 17f3f13 commit 80ff193
Show file tree
Hide file tree
Showing 15 changed files with 290 additions and 90 deletions.
74 changes: 13 additions & 61 deletions src/longbow/annotate/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,22 @@
import itertools
import re
import time
import collections
import os

import click
import click_log
import tqdm

import ssw

import pysam
import multiprocessing as mp

import gzip
from construct import *

from ..utils import bam_utils
from ..utils.model import reverse_complement
from ..utils.bam_utils import SegmentInfo
from ..utils.model import LibraryModel


Expand All @@ -26,26 +27,6 @@
click_log.basic_config(logger)


# Named tuple to store alignment information:
class SegmentInfo(collections.namedtuple("SegmentInfo", ["name", "start", "end"])):
    """A (name, start, end) annotation for one modeled segment of a read.

    Serializes to / parses from the "name:start-end" string form used in
    BAM tags (see to_tag / from_tag).
    """

    # Parses "name:start-end"; the name is matched non-greedily.
    _tag_regex = re.compile(r"(.*?):(\d+)-(\d+)")

    def __len__(self):
        # Length measured as end - start.
        return self.end - self.start

    def __str__(self):
        return f"SegmentInfo({self.to_tag()})"

    def to_tag(self):
        """Serialize this segment as a "name:start-end" tag string."""
        return f"{self.name}:{self.start}-{self.end}"

    @classmethod
    def from_tag(cls, tag_string):
        """Parse a "name:start-end" tag string into a segment object.

        Raises TypeError if the string does not match the expected format
        (the regex match is None and cannot be indexed).
        """
        match = cls._tag_regex.match(tag_string)
        # Use cls (not the class name) so subclasses construct instances
        # of themselves.
        return cls(match[1], int(match[2]), int(match[3]))


@click.command(name=logger.name)
@click_log.simple_verbosity_option(logger)
@click.option(
Expand Down Expand Up @@ -129,7 +110,7 @@ def main(pbi, threads, output_bam, m10, input_bam):
res = manager.dict({"num_reads_annotated": 0, "num_sections": 0})
output_worker = mp.Process(
target=_write_thread_fn,
args=(results, out_header, output_bam, not sys.stdin.isatty(), res, read_count, m.name)
args=(results, out_header, output_bam, not sys.stdin.isatty(), res, read_count, m)
)
output_worker.start()

Expand Down Expand Up @@ -172,7 +153,7 @@ def get_segments(read):
]


def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar, res, read_count, model_name):
def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar, res, read_count, model):
"""Thread / process fn to write out all our data."""

with pysam.AlignmentFile(
Expand All @@ -186,6 +167,8 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar,
total=read_count
) as pbar:

ssw_aligner = ssw.Aligner()

while True:
# Wait for some output data:
raw_data = out_queue.get()
Expand All @@ -199,10 +182,11 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar,

# Unpack data:
read, ppath, logp, is_rc = raw_data
read = pysam.AlignedSegment.fromstring(read, out_bam_header)

# Condense the output annotations so we can write them out with indices:
segments = _collapse_annotations(ppath)
segments = bam_utils.collapse_annotations(ppath)

read = pysam.AlignedSegment.fromstring(read, out_bam_header)

# Obligatory log message:
logger.debug(
Expand All @@ -213,22 +197,8 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar,
segments,
)

# Set our tag and write out the read to the annotated file:
read.set_tag(bam_utils.SEGMENTS_TAG, bam_utils.SEGMENT_TAG_DELIMITER.join([s.to_tag() for s in segments]))

# Set the model info tags:
read.set_tag(bam_utils.READ_MODEL_SCORE_TAG, logp)
read.set_tag(bam_utils.READ_MODEL_NAME_TAG, model_name)

# If we're reverse complemented, we make it easy and just reverse complement the read and add a tag saying
# that the read was RC:
read.set_tag(bam_utils.SEGMENTS_RC_TAG, is_rc)
if is_rc:
quals = read.query_qualities[::-1]
seq = reverse_complement(read.query_sequence)
read.query_sequence = seq
read.query_qualities = quals
out_bam_file.write(read)
# Write out our read:
bam_utils.write_annotated_read(read, segments, is_rc, logp, model, ssw_aligner, out_bam_file)

# Increment our counters:
res["num_reads_annotated"] += 1
Expand Down Expand Up @@ -272,29 +242,11 @@ def _worker_segmentation_fn(in_queue, out_queue, worker_num, model):
logger.debug(f"Worker %d: Num reads segmented: %d", worker_num, num_reads_segmented)


def _collapse_annotations(path):
"""Collapses given path into a list of SegmentInfo objects."""
last = ""
start = 0
segments = []
i = 0
for i, seg in enumerate(path):
if seg != last:
if i != 0:
segments.append(SegmentInfo(last, start, i - 1))
last = seg
start = i
# Don't forget the last one:
segments.append(SegmentInfo(last, start, i))

return segments


def _segment_read(read, model):
is_rc = False
logp, ppath = model.annotate(read.query_sequence)

rc_logp, rc_ppath = model.annotate(reverse_complement(read.query_sequence))
rc_logp, rc_ppath = model.annotate(bam_utils.reverse_complement(read.query_sequence))
if rc_logp > logp:
logp = rc_logp
ppath = rc_ppath
Expand Down
48 changes: 20 additions & 28 deletions src/longbow/discriminate/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@
import click_log
import tqdm

import ssw

import pysam
import multiprocessing as mp

from ..annotate.command import SegmentInfo
from ..annotate.command import _collapse_annotations
from ..utils import bam_utils
from ..utils.model import reverse_complement
from ..utils.model import LibraryModel


logging.basicConfig(stream=sys.stderr)
logger = logging.getLogger("discriminate")
click_log.basic_config(logger)
Expand Down Expand Up @@ -158,6 +159,8 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_base_name, model_list, r
total=read_count
) as pbar:

ssw_aligner = ssw.Aligner()

while True:
# Wait for some output data:
raw_data = out_queue.get()
Expand All @@ -171,36 +174,25 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_base_name, model_list, r

# Unpack data:
read, ppath, logp, is_rc, model_name = raw_data
read = pysam.AlignedSegment.fromstring(read, out_bam_header)

# Get our model for this read.
# NOTE: We should always have a model after this code because all models that can be assigned to the
# reads are in the model_list.
model = None
for m in model_list:
if m.name == model_name:
model = m
break

# Condense the output annotations so we can write them out with indices:
segments = _collapse_annotations(ppath)

# Obligatory log message:
logger.debug(
"Path for read %s (%2.2f)%s: %s",
read.query_name,
logp,
" (RC)" if is_rc else "",
segments,
)
segments = bam_utils.collapse_annotations(ppath)

# Set our tag and write out the read to the annotated file:
read.set_tag(
bam_utils.SEGMENTS_TAG, bam_utils.SEGMENT_TAG_DELIMITER.join([s.to_tag() for s in segments])
)
read = pysam.AlignedSegment.fromstring(read, out_bam_header)

# If we're reverse complemented, we make it easy and just reverse complement the read and add a
# tag saying that the read was RC:
read.set_tag(bam_utils.SEGMENTS_RC_TAG, is_rc)
read.set_tag(bam_utils.READ_MODEL_SCORE_TAG, logp)
read.set_tag(bam_utils.READ_MODEL_NAME_TAG, model_name)
if is_rc:
quals = read.query_qualities[::-1]
seq = reverse_complement(read.query_sequence)
read.query_sequence = seq
read.query_qualities = quals
out_file_dict[model_name].write(read)
# Write out our read:
bam_utils.write_annotated_read(
read, segments, is_rc, logp, model, ssw_aligner, out_file_dict[model_name]
)

# Increment our counters:
res["num_reads_annotated"] += 1
Expand Down
111 changes: 111 additions & 0 deletions src/longbow/utils/bam_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
import re
import logging
import click_log
import collections
import re

from construct import *
from inspect import getframeinfo, currentframe, getdoc

import pysam

from ..meta import VERSION
from ..utils.model import reverse_complement

logging.basicConfig(stream=sys.stderr)
logger = logging.getLogger("bam_utils")
Expand All @@ -20,17 +23,39 @@

# Constants for bam file reading / writing:
SEGMENTS_TAG = "SG"
SEGMENTS_QUAL_TAG = "XQ"
SEGMENTS_RC_TAG = "RC"
SEGMENT_TAG_DELIMITER = ","
READ_MODEL_NAME_TAG = "YN"
READ_MODEL_SCORE_TAG = "YS"
READ_IS_VALID_FOR_MODEL_TAG = "YV"
READ_FIRST_KEY_SEG_TAG = "YK"
READ_NUM_KEY_SEGMENTS_TAG = "YG"
READ_APPROX_QUAL_TAG = "YQ"

READ_UMI_TAG = ""


# Named tuple to store alignment information:
class SegmentInfo(collections.namedtuple("SegmentInfo", ["name", "start", "end"])):
    """A (name, start, end) annotation for one modeled segment of a read.

    Serializes to / parses from the "name:start-end" string form used in
    BAM tags (see to_tag / from_tag).
    """

    # Parses "name:start-end"; the name is matched non-greedily.
    _tag_regex = re.compile(r"(.*?):(\d+)-(\d+)")

    def __len__(self):
        # Length measured as end - start.
        return self.end - self.start

    def __str__(self):
        return f"SegmentInfo({self.to_tag()})"

    def to_tag(self):
        """Serialize this segment as a "name:start-end" tag string."""
        return f"{self.name}:{self.start}-{self.end}"

    @classmethod
    def from_tag(cls, tag_string):
        """Parse a "name:start-end" tag string into a segment object.

        Raises TypeError if the string does not match the expected format
        (the regex match is None and cannot be indexed).
        """
        match = cls._tag_regex.match(tag_string)
        # Use cls (not the class name) so subclasses construct instances
        # of themselves.
        return cls(match[1], int(match[2]), int(match[3]))


def load_read_count(pbi_file):
"""Compute file offsets for specified read names"""

Expand Down Expand Up @@ -109,3 +134,89 @@ def check_for_preexisting_files(file_list, exist_ok=False):
do_files_exist = True
if do_files_exist:
sys.exit(1)


def get_segment_score(read_sequence, segment, model, ssw_aligner=None):
    """Score one annotated segment of a read against its model adapter.

    Aligns the read bases covered by `segment` to the model's adapter
    sequence for that segment name and returns (score, max_score), where
    max_score is a perfect match over the segment's span. Random segments
    are never scored and yield (0, 0).

    NOTE(review): the default-aligner fallback requires `ssw` to be
    imported at module level — confirm this module actually imports it.
    """
    # Random segments carry no adapter sequence to compare against:
    if segment.name == "random":
        return 0, 0

    # Fall back to a fresh default aligner when none was supplied:
    aligner = ssw_aligner if ssw_aligner else ssw.Aligner()

    # Align the covered read bases to the expected adapter sequence:
    query = read_sequence[segment.start:segment.end]
    reference = model.adapter_dict[segment.name]
    optimal_score = aligner.align(query, reference).score

    # Best achievable score: match score times the span length.
    max_score = (segment.end - segment.start) * aligner.matrix.get_match()

    return optimal_score, max_score


def collapse_annotations(path):
    """Collapse a per-position annotation path into SegmentInfo runs.

    Consecutive identical labels in `path` become one SegmentInfo whose
    start/end are the (inclusive) first and last indices of the run.

    Args:
        path: iterable of per-position segment labels.

    Returns:
        list of SegmentInfo, one per run; an empty list for empty input
        (the original emitted a bogus SegmentInfo("", 0, 0) in that case).
    """
    last = ""
    start = 0
    segments = []
    i = 0
    is_empty = True
    for i, seg in enumerate(path):
        is_empty = False
        if seg != last:
            # Close out the previous run (there is none before position 0):
            if i != 0:
                segments.append(SegmentInfo(last, start, i - 1))
            last = seg
            start = i
    # Don't forget the last one (only if there was any input at all):
    if not is_empty:
        segments.append(SegmentInfo(last, start, i))

    return segments


def write_annotated_read(read, segments, is_rc, logp, model, ssw_aligner, out_bam_file):
    """Write the given pysam.AlignedSegment read object to the given file with the given metadata.

    Sets the segment (SG), segment-quality (XQ), RC (RC), model name/score
    (YN/YS), and approximate read quality (YQ) tags on `read`, reverse
    complements the read when `is_rc` is set, and writes it out.

    Args:
        read: pysam.AlignedSegment to tag and write (mutated in place).
        segments: list of SegmentInfo annotations for this read.
        is_rc: True if the model matched the reverse complement of the read.
        logp: log-probability score of the model annotation.
        model: annotation model; provides `.name` here and `.adapter_dict`
            via get_segment_score.
        ssw_aligner: aligner passed through to get_segment_score.
        out_bam_file: open pysam.AlignmentFile to write to.
    """

    # Obligatory log message:
    logger.debug(
        "Path for read %s (%2.2f)%s: %s",
        read.query_name,
        logp,
        " (RC)" if is_rc else "",
        segments,
    )

    # Set our tag and write out the read to the annotated file:
    read.set_tag(SEGMENTS_TAG, SEGMENT_TAG_DELIMITER.join([s.to_tag() for s in segments]))

    # Set the model info tags:
    read.set_tag(READ_MODEL_SCORE_TAG, logp)
    read.set_tag(READ_MODEL_NAME_TAG, model.name)

    # If we're reverse complemented, we make it easy and just reverse complement the read and add a tag saying
    # that the read was RC:
    read.set_tag(SEGMENTS_RC_TAG, is_rc)
    if is_rc:
        quals = read.query_qualities[::-1]
        seq = reverse_complement(read.query_sequence)
        read.query_sequence = seq
        read.query_qualities = quals

    # Get our segment scores and set them. Scoring happens AFTER the RC step
    # above, so the sequence orientation matches the segment coordinates:
    total_score = 0
    total_max_score = 0
    score_strings = []
    for s in segments:
        score, max_score = get_segment_score(read.query_sequence, s, model, ssw_aligner)
        score_strings.append(f"{score}/{max_score}")
        total_score += score
        total_max_score += max_score

    read.set_tag(SEGMENTS_QUAL_TAG, SEGMENT_TAG_DELIMITER.join(score_strings))
    if total_max_score != 0:
        read.set_tag(READ_APPROX_QUAL_TAG, f"{total_score / total_max_score:.4f}")
    else:
        # Guard against division by zero (e.g. all segments were "random").
        # Plain string literal — the original used a pointless f-string (F541).
        read.set_tag(READ_APPROX_QUAL_TAG, "0.0")

    out_bam_file.write(read)
Loading

0 comments on commit 80ff193

Please sign in to comment.