Skip to content

Commit

Permalink
Adding Tests, prototype model, and refactoring (#40)
Browse files Browse the repository at this point in the history
* Added in the mas8 prototype model for completeness.
* Added version field to Longbow Models for reproducibility.
- Added segment scores calculation for each non-random longbow modeled segment
- Segment scores quality now added to output reads in new tag (XQ)
- Overall segment score quality now added to output reads in new tag (YQ).  This can be used as a proxy for read quality.
- Updated test-requirements.txt to include all packages required to run tests
- Added integration test for annotate, including test data and expected data
- Added test utils module
- Added integration test setting in tox
  • Loading branch information
jonn-smith authored May 8, 2021
1 parent 17f3f13 commit 80ff193
Show file tree
Hide file tree
Showing 15 changed files with 290 additions and 90 deletions.
74 changes: 13 additions & 61 deletions src/longbow/annotate/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,21 +3,22 @@
import itertools
import re
import time
import collections
import os

import click
import click_log
import tqdm

import ssw

import pysam
import multiprocessing as mp

import gzip
from construct import *

from ..utils import bam_utils
from ..utils.model import reverse_complement
from ..utils.bam_utils import SegmentInfo
from ..utils.model import LibraryModel


Expand All @@ -26,26 +27,6 @@
click_log.basic_config(logger)


# Named tuple to store alignment information:
class SegmentInfo(collections.namedtuple("SegmentInfo", ["name", "start", "end"])):
    """A (name, start, end) annotation for one modeled segment of a read.

    Serializes to / parses from the "name:start-end" string form used in
    BAM tags (see to_tag / from_tag).
    """

    # Parses "name:start-end"; the name is matched non-greedily.
    _tag_regex = re.compile(r"(.*?):(\d+)-(\d+)")

    def __len__(self):
        # Length measured as end - start.
        return self.end - self.start

    def __str__(self):
        return f"SegmentInfo({self.to_tag()})"

    def to_tag(self):
        """Serialize this segment as a "name:start-end" tag string."""
        return f"{self.name}:{self.start}-{self.end}"

    @classmethod
    def from_tag(cls, tag_string):
        """Parse a "name:start-end" tag string into a segment object.

        Raises TypeError if the string does not match the expected format
        (the regex match is None and cannot be indexed).
        """
        match = cls._tag_regex.match(tag_string)
        # Use cls (not the class name) so subclasses construct instances
        # of themselves.
        return cls(match[1], int(match[2]), int(match[3]))


@click.command(name=logger.name)
@click_log.simple_verbosity_option(logger)
@click.option(
Expand Down Expand Up @@ -129,7 +110,7 @@ def main(pbi, threads, output_bam, m10, input_bam):
res = manager.dict({"num_reads_annotated": 0, "num_sections": 0})
output_worker = mp.Process(
target=_write_thread_fn,
args=(results, out_header, output_bam, not sys.stdin.isatty(), res, read_count, m.name)
args=(results, out_header, output_bam, not sys.stdin.isatty(), res, read_count, m)
)
output_worker.start()

Expand Down Expand Up @@ -172,7 +153,7 @@ def get_segments(read):
]


def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar, res, read_count, model_name):
def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar, res, read_count, model):
"""Thread / process fn to write out all our data."""

with pysam.AlignmentFile(
Expand All @@ -186,6 +167,8 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar,
total=read_count
) as pbar:

ssw_aligner = ssw.Aligner()

while True:
# Wait for some output data:
raw_data = out_queue.get()
Expand All @@ -199,10 +182,11 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar,

# Unpack data:
read, ppath, logp, is_rc = raw_data
read = pysam.AlignedSegment.fromstring(read, out_bam_header)

# Condense the output annotations so we can write them out with indices:
segments = _collapse_annotations(ppath)
segments = bam_utils.collapse_annotations(ppath)

read = pysam.AlignedSegment.fromstring(read, out_bam_header)

# Obligatory log message:
logger.debug(
Expand All @@ -213,22 +197,8 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar,
segments,
)

# Set our tag and write out the read to the annotated file:
read.set_tag(bam_utils.SEGMENTS_TAG, bam_utils.SEGMENT_TAG_DELIMITER.join([s.to_tag() for s in segments]))

# Set the model info tags:
read.set_tag(bam_utils.READ_MODEL_SCORE_TAG, logp)
read.set_tag(bam_utils.READ_MODEL_NAME_TAG, model_name)

# If we're reverse complemented, we make it easy and just reverse complement the read and add a tag saying
# that the read was RC:
read.set_tag(bam_utils.SEGMENTS_RC_TAG, is_rc)
if is_rc:
quals = read.query_qualities[::-1]
seq = reverse_complement(read.query_sequence)
read.query_sequence = seq
read.query_qualities = quals
out_bam_file.write(read)
# Write out our read:
bam_utils.write_annotated_read(read, segments, is_rc, logp, model, ssw_aligner, out_bam_file)

# Increment our counters:
res["num_reads_annotated"] += 1
Expand Down Expand Up @@ -272,29 +242,11 @@ def _worker_segmentation_fn(in_queue, out_queue, worker_num, model):
logger.debug(f"Worker %d: Num reads segmented: %d", worker_num, num_reads_segmented)


def _collapse_annotations(path):
"""Collapses given path into a list of SegmentInfo objects."""
last = ""
start = 0
segments = []
i = 0
for i, seg in enumerate(path):
if seg != last:
if i != 0:
segments.append(SegmentInfo(last, start, i - 1))
last = seg
start = i
# Don't forget the last one:
segments.append(SegmentInfo(last, start, i))

return segments


def _segment_read(read, model):
is_rc = False
logp, ppath = model.annotate(read.query_sequence)

rc_logp, rc_ppath = model.annotate(reverse_complement(read.query_sequence))
rc_logp, rc_ppath = model.annotate(bam_utils.reverse_complement(read.query_sequence))
if rc_logp > logp:
logp = rc_logp
ppath = rc_ppath
Expand Down
48 changes: 20 additions & 28 deletions src/longbow/discriminate/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,15 +9,16 @@
import click_log
import tqdm

import ssw

import pysam
import multiprocessing as mp

from ..annotate.command import SegmentInfo
from ..annotate.command import _collapse_annotations
from ..utils import bam_utils
from ..utils.model import reverse_complement
from ..utils.model import LibraryModel


logging.basicConfig(stream=sys.stderr)
logger = logging.getLogger("discriminate")
click_log.basic_config(logger)
Expand Down Expand Up @@ -158,6 +159,8 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_base_name, model_list, r
total=read_count
) as pbar:

ssw_aligner = ssw.Aligner()

while True:
# Wait for some output data:
raw_data = out_queue.get()
Expand All @@ -171,36 +174,25 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_base_name, model_list, r

# Unpack data:
read, ppath, logp, is_rc, model_name = raw_data
read = pysam.AlignedSegment.fromstring(read, out_bam_header)

# Get our model for this read.
# NOTE: We should always have a model after this code because all models that can be assigned to the
# reads are in the model_list.
model = None
for m in model_list:
if m.name == model_name:
model = m
break

# Condense the output annotations so we can write them out with indices:
segments = _collapse_annotations(ppath)

# Obligatory log message:
logger.debug(
"Path for read %s (%2.2f)%s: %s",
read.query_name,
logp,
" (RC)" if is_rc else "",
segments,
)
segments = bam_utils.collapse_annotations(ppath)

# Set our tag and write out the read to the annotated file:
read.set_tag(
bam_utils.SEGMENTS_TAG, bam_utils.SEGMENT_TAG_DELIMITER.join([s.to_tag() for s in segments])
)
read = pysam.AlignedSegment.fromstring(read, out_bam_header)

# If we're reverse complemented, we make it easy and just reverse complement the read and add a
# tag saying that the read was RC:
read.set_tag(bam_utils.SEGMENTS_RC_TAG, is_rc)
read.set_tag(bam_utils.READ_MODEL_SCORE_TAG, logp)
read.set_tag(bam_utils.READ_MODEL_NAME_TAG, model_name)
if is_rc:
quals = read.query_qualities[::-1]
seq = reverse_complement(read.query_sequence)
read.query_sequence = seq
read.query_qualities = quals
out_file_dict[model_name].write(read)
# Write out our read:
bam_utils.write_annotated_read(
read, segments, is_rc, logp, model, ssw_aligner, out_file_dict[model_name]
)

# Increment our counters:
res["num_reads_annotated"] += 1
Expand Down
111 changes: 111 additions & 0 deletions src/longbow/utils/bam_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,16 @@
import re
import logging
import click_log
import collections
import re

from construct import *
from inspect import getframeinfo, currentframe, getdoc

import pysam

from ..meta import VERSION
from ..utils.model import reverse_complement

logging.basicConfig(stream=sys.stderr)
logger = logging.getLogger("bam_utils")
Expand All @@ -20,17 +23,39 @@

# Constants for bam file reading / writing:
SEGMENTS_TAG = "SG"
SEGMENTS_QUAL_TAG = "XQ"
SEGMENTS_RC_TAG = "RC"
SEGMENT_TAG_DELIMITER = ","
READ_MODEL_NAME_TAG = "YN"
READ_MODEL_SCORE_TAG = "YS"
READ_IS_VALID_FOR_MODEL_TAG = "YV"
READ_FIRST_KEY_SEG_TAG = "YK"
READ_NUM_KEY_SEGMENTS_TAG = "YG"
READ_APPROX_QUAL_TAG = "YQ"

READ_UMI_TAG = ""


# Named tuple to store alignment information:
class SegmentInfo(collections.namedtuple("SegmentInfo", ["name", "start", "end"])):
    """A (name, start, end) annotation for one modeled segment of a read.

    Serializes to / parses from the "name:start-end" string form used in
    BAM tags (see to_tag / from_tag).
    """

    # Parses "name:start-end"; the name is matched non-greedily.
    _tag_regex = re.compile(r"(.*?):(\d+)-(\d+)")

    def __len__(self):
        # Length measured as end - start.
        return self.end - self.start

    def __str__(self):
        return f"SegmentInfo({self.to_tag()})"

    def to_tag(self):
        """Serialize this segment as a "name:start-end" tag string."""
        return f"{self.name}:{self.start}-{self.end}"

    @classmethod
    def from_tag(cls, tag_string):
        """Parse a "name:start-end" tag string into a segment object.

        Raises TypeError if the string does not match the expected format
        (the regex match is None and cannot be indexed).
        """
        match = cls._tag_regex.match(tag_string)
        # Use cls (not the class name) so subclasses construct instances
        # of themselves.
        return cls(match[1], int(match[2]), int(match[3]))


def load_read_count(pbi_file):
"""Compute file offsets for specified read names"""

Expand Down Expand Up @@ -109,3 +134,89 @@ def check_for_preexisting_files(file_list, exist_ok=False):
do_files_exist = True
if do_files_exist:
sys.exit(1)


def get_segment_score(read_sequence, segment, model, ssw_aligner=None):
    """Score one annotated segment of a read against its model adapter.

    Aligns the read bases covered by `segment` to the model's adapter
    sequence for that segment name and returns (score, max_score), where
    max_score is a perfect match over the segment's span. Random segments
    are never scored and yield (0, 0).

    NOTE(review): the default-aligner fallback requires `ssw` to be
    imported at module level — confirm this module actually imports it.
    """
    # Random segments carry no adapter sequence to compare against:
    if segment.name == "random":
        return 0, 0

    # Fall back to a fresh default aligner when none was supplied:
    aligner = ssw_aligner if ssw_aligner else ssw.Aligner()

    # Align the covered read bases to the expected adapter sequence:
    query = read_sequence[segment.start:segment.end]
    reference = model.adapter_dict[segment.name]
    optimal_score = aligner.align(query, reference).score

    # Best achievable score: match score times the span length.
    max_score = (segment.end - segment.start) * aligner.matrix.get_match()

    return optimal_score, max_score


def collapse_annotations(path):
    """Collapse a per-position annotation path into SegmentInfo runs.

    Consecutive identical labels in `path` become one SegmentInfo whose
    start/end are the (inclusive) first and last indices of the run.

    Args:
        path: iterable of per-position segment labels.

    Returns:
        list of SegmentInfo, one per run; an empty list for empty input
        (the original emitted a bogus SegmentInfo("", 0, 0) in that case).
    """
    last = ""
    start = 0
    segments = []
    i = 0
    is_empty = True
    for i, seg in enumerate(path):
        is_empty = False
        if seg != last:
            # Close out the previous run (there is none before position 0):
            if i != 0:
                segments.append(SegmentInfo(last, start, i - 1))
            last = seg
            start = i
    # Don't forget the last one (only if there was any input at all):
    if not is_empty:
        segments.append(SegmentInfo(last, start, i))

    return segments


def write_annotated_read(read, segments, is_rc, logp, model, ssw_aligner, out_bam_file):
    """Write the given pysam.AlignedSegment read object to the given file with the given metadata.

    Sets the segment (SG), segment-quality (XQ), RC (RC), model name/score
    (YN/YS), and approximate read quality (YQ) tags on `read`, reverse
    complements the read when `is_rc` is set, and writes it out.

    Args:
        read: pysam.AlignedSegment to tag and write (mutated in place).
        segments: list of SegmentInfo annotations for this read.
        is_rc: True if the model matched the reverse complement of the read.
        logp: log-probability score of the model annotation.
        model: annotation model; provides `.name` here and `.adapter_dict`
            via get_segment_score.
        ssw_aligner: aligner passed through to get_segment_score.
        out_bam_file: open pysam.AlignmentFile to write to.
    """

    # Obligatory log message:
    logger.debug(
        "Path for read %s (%2.2f)%s: %s",
        read.query_name,
        logp,
        " (RC)" if is_rc else "",
        segments,
    )

    # Set our tag and write out the read to the annotated file:
    read.set_tag(SEGMENTS_TAG, SEGMENT_TAG_DELIMITER.join([s.to_tag() for s in segments]))

    # Set the model info tags:
    read.set_tag(READ_MODEL_SCORE_TAG, logp)
    read.set_tag(READ_MODEL_NAME_TAG, model.name)

    # If we're reverse complemented, we make it easy and just reverse complement the read and add a tag saying
    # that the read was RC:
    read.set_tag(SEGMENTS_RC_TAG, is_rc)
    if is_rc:
        quals = read.query_qualities[::-1]
        seq = reverse_complement(read.query_sequence)
        read.query_sequence = seq
        read.query_qualities = quals

    # Get our segment scores and set them. Scoring happens AFTER the RC step
    # above, so the sequence orientation matches the segment coordinates:
    total_score = 0
    total_max_score = 0
    score_strings = []
    for s in segments:
        score, max_score = get_segment_score(read.query_sequence, s, model, ssw_aligner)
        score_strings.append(f"{score}/{max_score}")
        total_score += score
        total_max_score += max_score

    read.set_tag(SEGMENTS_QUAL_TAG, SEGMENT_TAG_DELIMITER.join(score_strings))
    if total_max_score != 0:
        read.set_tag(READ_APPROX_QUAL_TAG, f"{total_score / total_max_score:.4f}")
    else:
        # Guard against division by zero (e.g. all segments were "random").
        # Plain string literal — the original used a pointless f-string (F541).
        read.set_tag(READ_APPROX_QUAL_TAG, "0.0")

    out_bam_file.write(read)
Loading

0 comments on commit 80ff193

Please sign in to comment.