diff --git a/src/longbow/annotate/command.py b/src/longbow/annotate/command.py index f359f4ca..6d5879de 100644 --- a/src/longbow/annotate/command.py +++ b/src/longbow/annotate/command.py @@ -3,13 +3,14 @@ import itertools import re import time -import collections import os import click import click_log import tqdm +import ssw + import pysam import multiprocessing as mp @@ -17,7 +18,7 @@ from construct import * from ..utils import bam_utils -from ..utils.model import reverse_complement +from ..utils.bam_utils import SegmentInfo from ..utils.model import LibraryModel @@ -26,26 +27,6 @@ click_log.basic_config(logger) -# Named tuple to store alignment information: -class SegmentInfo(collections.namedtuple("SegmentInfo", ["name", "start", "end"])): - - _tag_regex = re.compile(r"(.*?):(\d+)-(\d+)") - - def __len__(self): - return self.end - self.start - - def __str__(self): - return f"SegmentInfo({self.to_tag()})" - - def to_tag(self): - return f"{self.name}:{self.start}-{self.end}" - - @classmethod - def from_tag(cls, tag_string): - match = cls._tag_regex.match(tag_string) - return SegmentInfo(match[1], int(match[2]), int(match[3])) - - @click.command(name=logger.name) @click_log.simple_verbosity_option(logger) @click.option( @@ -129,7 +110,7 @@ def main(pbi, threads, output_bam, m10, input_bam): res = manager.dict({"num_reads_annotated": 0, "num_sections": 0}) output_worker = mp.Process( target=_write_thread_fn, - args=(results, out_header, output_bam, not sys.stdin.isatty(), res, read_count, m.name) + args=(results, out_header, output_bam, not sys.stdin.isatty(), res, read_count, m) ) output_worker.start() @@ -172,7 +153,7 @@ def get_segments(read): ] -def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar, res, read_count, model_name): +def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar, res, read_count, model): """Thread / process fn to write out all our data.""" with pysam.AlignmentFile( @@ -186,6 +167,8 @@ def 
_write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar, total=read_count ) as pbar: + ssw_aligner = ssw.Aligner() + while True: # Wait for some output data: raw_data = out_queue.get() @@ -199,10 +182,11 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar, # Unpack data: read, ppath, logp, is_rc = raw_data - read = pysam.AlignedSegment.fromstring(read, out_bam_header) # Condense the output annotations so we can write them out with indices: - segments = _collapse_annotations(ppath) + segments = bam_utils.collapse_annotations(ppath) + + read = pysam.AlignedSegment.fromstring(read, out_bam_header) # Obligatory log message: logger.debug( @@ -213,22 +197,8 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_file_name, disable_pbar, segments, ) - # Set our tag and write out the read to the annotated file: - read.set_tag(bam_utils.SEGMENTS_TAG, bam_utils.SEGMENT_TAG_DELIMITER.join([s.to_tag() for s in segments])) - - # Set the model info tags: - read.set_tag(bam_utils.READ_MODEL_SCORE_TAG, logp) - read.set_tag(bam_utils.READ_MODEL_NAME_TAG, model_name) - - # If we're reverse complemented, we make it easy and just reverse complement the read and add a tag saying - # that the read was RC: - read.set_tag(bam_utils.SEGMENTS_RC_TAG, is_rc) - if is_rc: - quals = read.query_qualities[::-1] - seq = reverse_complement(read.query_sequence) - read.query_sequence = seq - read.query_qualities = quals - out_bam_file.write(read) + # Write out our read: + bam_utils.write_annotated_read(read, segments, is_rc, logp, model, ssw_aligner, out_bam_file) # Increment our counters: res["num_reads_annotated"] += 1 @@ -272,29 +242,11 @@ def _worker_segmentation_fn(in_queue, out_queue, worker_num, model): logger.debug(f"Worker %d: Num reads segmented: %d", worker_num, num_reads_segmented) -def _collapse_annotations(path): - """Collapses given path into a list of SegmentInfo objects.""" - last = "" - start = 0 - segments = [] - i = 0 - for i,
seg in enumerate(path): - if seg != last: - if i != 0: - segments.append(SegmentInfo(last, start, i - 1)) - last = seg - start = i - # Don't forget the last one: - segments.append(SegmentInfo(last, start, i)) - - return segments - - def _segment_read(read, model): is_rc = False logp, ppath = model.annotate(read.query_sequence) - rc_logp, rc_ppath = model.annotate(reverse_complement(read.query_sequence)) + rc_logp, rc_ppath = model.annotate(bam_utils.reverse_complement(read.query_sequence)) if rc_logp > logp: logp = rc_logp ppath = rc_ppath diff --git a/src/longbow/discriminate/command.py b/src/longbow/discriminate/command.py index 9c8eaf7e..0dc3491f 100644 --- a/src/longbow/discriminate/command.py +++ b/src/longbow/discriminate/command.py @@ -9,15 +9,16 @@ import click_log import tqdm +import ssw + import pysam import multiprocessing as mp -from ..annotate.command import SegmentInfo -from ..annotate.command import _collapse_annotations from ..utils import bam_utils from ..utils.model import reverse_complement from ..utils.model import LibraryModel + logging.basicConfig(stream=sys.stderr) logger = logging.getLogger("discriminate") click_log.basic_config(logger) @@ -158,6 +159,8 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_base_name, model_list, r total=read_count ) as pbar: + ssw_aligner = ssw.Aligner() + while True: # Wait for some output data: raw_data = out_queue.get() @@ -171,36 +174,25 @@ def _write_thread_fn(out_queue, out_bam_header, out_bam_base_name, model_list, r # Unpack data: read, ppath, logp, is_rc, model_name = raw_data - read = pysam.AlignedSegment.fromstring(read, out_bam_header) + + # Get our model for this read. + # NOTE: We should always have a model after this code because all models that can be assigned to the + # reads are in the model_list. 
+ model = None + for m in model_list: + if m.name == model_name: + model = m + break # Condense the output annotations so we can write them out with indices: - segments = _collapse_annotations(ppath) - - # Obligatory log message: - logger.debug( - "Path for read %s (%2.2f)%s: %s", - read.query_name, - logp, - " (RC)" if is_rc else "", - segments, - ) + segments = bam_utils.collapse_annotations(ppath) - # Set our tag and write out the read to the annotated file: - read.set_tag( - bam_utils.SEGMENTS_TAG, bam_utils.SEGMENT_TAG_DELIMITER.join([s.to_tag() for s in segments]) - ) + read = pysam.AlignedSegment.fromstring(read, out_bam_header) - # If we're reverse complemented, we make it easy and just reverse complement the read and add a - # tag saying that the read was RC: - read.set_tag(bam_utils.SEGMENTS_RC_TAG, is_rc) - read.set_tag(bam_utils.READ_MODEL_SCORE_TAG, logp) - read.set_tag(bam_utils.READ_MODEL_NAME_TAG, model_name) - if is_rc: - quals = read.query_qualities[::-1] - seq = reverse_complement(read.query_sequence) - read.query_sequence = seq - read.query_qualities = quals - out_file_dict[model_name].write(read) + # Write out our read: + bam_utils.write_annotated_read( + read, segments, is_rc, logp, model, ssw_aligner, out_file_dict[model_name] + ) # Increment our counters: res["num_reads_annotated"] += 1 diff --git a/src/longbow/utils/bam_utils.py b/src/longbow/utils/bam_utils.py index 40b8f70f..cb21d29d 100644 --- a/src/longbow/utils/bam_utils.py +++ b/src/longbow/utils/bam_utils.py @@ -4,6 +4,8 @@ import re import logging import click_log +import collections +import re from construct import * from inspect import getframeinfo, currentframe, getdoc @@ -11,6 +13,7 @@ import pysam from ..meta import VERSION +from ..utils.model import reverse_complement logging.basicConfig(stream=sys.stderr) logger = logging.getLogger("bam_utils") @@ -20,6 +23,7 @@ # Constants for bam file reading / writing: SEGMENTS_TAG = "SG" +SEGMENTS_QUAL_TAG = "XQ" SEGMENTS_RC_TAG = "RC"
SEGMENT_TAG_DELIMITER = "," READ_MODEL_NAME_TAG = "YN" @@ -27,10 +31,31 @@ READ_IS_VALID_FOR_MODEL_TAG = "YV" READ_FIRST_KEY_SEG_TAG = "YK" READ_NUM_KEY_SEGMENTS_TAG = "YG" +READ_APPROX_QUAL_TAG = "YQ" READ_UMI_TAG = "" +# Named tuple to store alignment information: +class SegmentInfo(collections.namedtuple("SegmentInfo", ["name", "start", "end"])): + + _tag_regex = re.compile(r"(.*?):(\d+)-(\d+)") + + def __len__(self): + return self.end - self.start + + def __str__(self): + return f"SegmentInfo({self.to_tag()})" + + def to_tag(self): + return f"{self.name}:{self.start}-{self.end}" + + @classmethod + def from_tag(cls, tag_string): + match = cls._tag_regex.match(tag_string) + return SegmentInfo(match[1], int(match[2]), int(match[3])) + + def load_read_count(pbi_file): """Compute file offsets for specified read names""" @@ -109,3 +134,89 @@ def check_for_preexisting_files(file_list, exist_ok=False): do_files_exist = True if do_files_exist: sys.exit(1) + + +def get_segment_score(read_sequence, segment, model, ssw_aligner=None): + """Get the alignment score of the given segment against the read sequence.""" + + # We don't score random segments: + if segment.name == "random": + return 0, 0 + + # Create a default aligner if we weren't given one: + if not ssw_aligner: + ssw_aligner = ssw.Aligner() + + # Get our alignment and our score: + alignment = ssw_aligner.align(read_sequence[segment.start:segment.end], model.adapter_dict[segment.name]) + optimal_score = alignment.score + + # The max score is the match score * the length of the reference segment + max_score = (segment.end - segment.start) * ssw_aligner.matrix.get_match() + + return optimal_score, max_score + + +def collapse_annotations(path): + """Collapses given path into a list of SegmentInfo objects.""" + last = "" + start = 0 + segments = [] + i = 0 + for i, seg in enumerate(path): + if seg != last: + if i != 0: + segments.append(SegmentInfo(last, start, i - 1)) + last = seg + start = i + # Don't forget the last 
one: + segments.append(SegmentInfo(last, start, i)) + + return segments + + +def write_annotated_read(read, segments, is_rc, logp, model, ssw_aligner, out_bam_file): + """Write the given pysam.AlignedSegment read object to the given file with the given metadata.""" + + # Obligatory log message: + logger.debug( + "Path for read %s (%2.2f)%s: %s", + read.query_name, + logp, + " (RC)" if is_rc else "", + segments, + ) + + # Set our tag and write out the read to the annotated file: + read.set_tag(SEGMENTS_TAG, SEGMENT_TAG_DELIMITER.join([s.to_tag() for s in segments])) + + # Set the model info tags: + read.set_tag(READ_MODEL_SCORE_TAG, logp) + read.set_tag(READ_MODEL_NAME_TAG, model.name) + + # If we're reverse complemented, we make it easy and just reverse complement the read and add a tag saying + # that the read was RC: + read.set_tag(SEGMENTS_RC_TAG, is_rc) + if is_rc: + quals = read.query_qualities[::-1] + seq = reverse_complement(read.query_sequence) + read.query_sequence = seq + read.query_qualities = quals + + # Get our segment scores and set them: + total_score = 0 + total_max_score = 0 + score_strings = [] + for s in segments: + score, max_score = get_segment_score(read.query_sequence, s, model, ssw_aligner) + score_strings.append(f"{score}/{max_score}") + total_score += score + total_max_score += max_score + + read.set_tag(SEGMENTS_QUAL_TAG, SEGMENT_TAG_DELIMITER.join(score_strings)) + if total_max_score != 0: + read.set_tag(READ_APPROX_QUAL_TAG, f"{total_score / total_max_score:.4f}") + else: + read.set_tag(READ_APPROX_QUAL_TAG, f"0.0") + + out_bam_file.write(read) diff --git a/src/longbow/utils/model.py b/src/longbow/utils/model.py index bd2679d8..8297b3ef 100644 --- a/src/longbow/utils/model.py +++ b/src/longbow/utils/model.py @@ -1,3 +1,4 @@ +import sys import re import json import logging @@ -43,6 +44,7 @@ class LibraryModel: def __init__(self, name, + version, array_element_structure, adapters, direct_connections, @@ -51,6 +53,7 @@ def __init__(self, 
do_build=True): self.name = name + self.version = version self.array_element_structure = array_element_structure self.adapter_dict = adapters @@ -330,6 +333,7 @@ def to_json(self, outfile=None, indent=4): model_data = { "name": self.name, + "version": self.version, "array_element_structure": self.array_element_structure, "adapters": self.adapter_dict, "direct_connections": {k: list(v) for k, v in self.direct_connections_dict.items()}, @@ -357,6 +361,7 @@ def from_json_file(json_file): m = LibraryModel( name=json_data["name"], + version=json_data["version"], array_element_structure=tuple(tuple(v) for v in json_data["array_element_structure"]), adapters=json_data["adapters"], direct_connections={k: set(v) for k, v in json_data["direct_connections"].items()}, @@ -371,6 +376,7 @@ def build_and_return_mas_seq_model(): """Create and return the model for the standard 15 element MAS-seq array.""" return LibraryModel( name="mas15", + version="1.0.0", array_element_structure=( # NOTE: the first element doesn't currently have the "A" adapter in this version of the library. ("A", "10x_Adapter", "random", "Poly_A", "3p_Adapter"), @@ -457,6 +463,7 @@ def build_and_return_mas_seq_10_model(): """Create and return the model for the 10 element MAS-seq array.""" return LibraryModel( name="mas10", + version="1.0.0", array_element_structure=( ("Q", "10x_Adapter", "random", "Poly_A", "3p_Adapter"), ("C", "10x_Adapter", "random", "Poly_A", "3p_Adapter"), @@ -517,6 +524,60 @@ def build_and_return_mas_seq_10_model(): end_element_names={"Poly_A", "R"}, ) + @staticmethod + def build_and_return_mas_seq_8_model(): + """Create and return the model for the prototype 8 element MAS-seq array.""" + return LibraryModel( + name="mas8prototype", + version="1.0.0", + array_element_structure=( + # NOTE: the first element may not have the "A" adapter in this version of the library. 
+ ("A", "10x_Adapter", "random", "Poly_T", "random", "TSO"), + ("B", "10x_Adapter", "random", "Poly_T", "random", "TSO"), + ("C", "10x_Adapter", "random", "Poly_T", "random", "TSO"), + ("D", "10x_Adapter", "random", "Poly_T", "random", "TSO"), + ("E", "10x_Adapter", "random", "Poly_T", "random", "TSO"), + ("F", "10x_Adapter", "random", "Poly_T", "random", "TSO"), + ("G", "10x_Adapter", "random", "Poly_T", "random", "TSO"), + ("H", "10x_Adapter", "random", "Poly_T", "random", "TSO", "A"), + ), + adapters={ + "10x_Adapter": "CTACACGACGCTCTTCCGATCT", + "Poly_T": "T" * 30, + "TSO": "CCCATGTACTCTGCGTTGATACCACTGCTT", + "A": "ACGTACAG", + "B": "AAACTGCA", + "C": "AGAGTCAC", + "D": "AGCAAGTT", + "E": "AAGTGGTG", + "F": "AGTGGACT", + "G": "ACAGGTTA", + "H": "ATCTCACA", + }, + direct_connections={ + "TSO": { + "A", + "B", + "C", + "D", + "E", + "F", + "G", + "H", + }, + "A": {"10x_Adapter"}, + "B": {"10x_Adapter"}, + "C": {"10x_Adapter"}, + "D": {"10x_Adapter"}, + "E": {"10x_Adapter"}, + "F": {"10x_Adapter"}, + "G": {"10x_Adapter"}, + "H": {"10x_Adapter"}, + }, + start_element_names={"A", "10x_Adapter"}, + end_element_names={"TSO", "A"}, + ) + # IUPAC RC's from: http://arep.med.harvard.edu/labgc/adnan/projects/Utilities/revcomp.html # and https://www.dnabaser.com/articles/IUPAC%20ambiguity%20codes.html diff --git a/test-requirements.txt b/test-requirements.txt index d3c2cd33..4556807f 100644 --- a/test-requirements.txt +++ b/test-requirements.txt @@ -1,3 +1,13 @@ pytest pytest-cov coverage >= 4.5 +click +click_log +tqdm +ssw +pysam +construct +numpy +pandas +pomegranate +matplotlib diff --git a/tests/integration/test_annotate.py b/tests/integration/test_annotate.py new file mode 100644 index 00000000..37b9afe6 --- /dev/null +++ b/tests/integration/test_annotate.py @@ -0,0 +1,35 @@ +import pytest +import os + +from click.testing import CliRunner + +from longbow.__main__ import main_entry as longbow + +from ..utils import assert_bam_files_equal + +TEST_DATA_FOLDER = path = 
os.path.abspath( + __file__ + os.path.sep + "../../" + os.path.sep + "test_data" +) + os.path.sep +EXPECTED_DATA_FOLDER = TEST_DATA_FOLDER + "annotate" + os.path.sep + + +@pytest.mark.parametrize("input_bam, expected_bam, use_mas10", [ + [TEST_DATA_FOLDER + "mas15_test_input.bam", EXPECTED_DATA_FOLDER + "mas15_expected.bam", False], + [TEST_DATA_FOLDER + "mas10_test_input.bam", EXPECTED_DATA_FOLDER + "mas10_expected.bam", True], +]) +def test_annotate(tmpdir, input_bam, expected_bam, use_mas10): + + if use_mas10: + actual_file = tmpdir.join("annotate_actual_out.mas10.bam") + args = ["annotate", "-t", 4, "-v", "INFO", "--m10", input_bam, "-o", actual_file] + else: + actual_file = tmpdir.join("annotate_actual_out.mas15.bam") + args = ["annotate", "-t", 4, "-v", "INFO", input_bam, "-o", actual_file] + + runner = CliRunner() + runner.invoke(longbow, args) + + # Equal files result as True: + assert_bam_files_equal(actual_file, expected_bam) + + diff --git a/tests/test_data/annotate/mas10_expected.bam b/tests/test_data/annotate/mas10_expected.bam new file mode 100644 index 00000000..39e98920 Binary files /dev/null and b/tests/test_data/annotate/mas10_expected.bam differ diff --git a/tests/test_data/annotate/mas15_expected.bam b/tests/test_data/annotate/mas15_expected.bam new file mode 100644 index 00000000..32d16494 Binary files /dev/null and b/tests/test_data/annotate/mas15_expected.bam differ diff --git a/tests/test_data/mas15_test_input.bam b/tests/test_data/mas15_test_input.bam new file mode 100644 index 00000000..12f505e9 Binary files /dev/null and b/tests/test_data/mas15_test_input.bam differ diff --git a/tests/test_data/mas15_test_input.bam.pbi b/tests/test_data/mas15_test_input.bam.pbi new file mode 100644 index 00000000..d378813d Binary files /dev/null and b/tests/test_data/mas15_test_input.bam.pbi differ diff --git a/tests/test_data/test_input.read_names.txt b/tests/test_data/mas15_test_input.read_names.txt similarity index 87% rename from 
tests/test_data/test_input.read_names.txt rename to tests/test_data/mas15_test_input.read_names.txt index 4adef1ac..b7d3964a 100644 --- a/tests/test_data/test_input.read_names.txt +++ b/tests/test_data/mas15_test_input.read_names.txt @@ -1,4 +1,4 @@ -m64020_201213_022403/1/ccs +m64020_201213_022403/1/ccs m64020_201213_022403/12/ccs m64020_201213_022403/18/ccs m64020_201213_022403/23/ccs diff --git a/tests/test_data/test_input.bam b/tests/test_data/test_input.bam deleted file mode 100644 index 1ce7bd20..00000000 Binary files a/tests/test_data/test_input.bam and /dev/null differ diff --git a/tests/test_data/test_input.bam.pbi b/tests/test_data/test_input.bam.pbi deleted file mode 100644 index b4cba2f8..00000000 Binary files a/tests/test_data/test_input.bam.pbi and /dev/null differ diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 00000000..cd69635a --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,34 @@ +import pysam +import filecmp + + +def assert_bam_files_equal(file1, file2, order_matters=False, compare_header=False): + """Assert that the contents of the two given bam files are equivalent.""" + + if order_matters and compare_header: + assert filecmp.cmp(file1, file2) + + with pysam.AlignmentFile(file1, "rb", check_sq=False, require_index=False) as bam1, \ + pysam.AlignmentFile(file2, "rb", check_sq=False, require_index=False) as bam2: + + if compare_header: + assert bam1.header == bam2.header + + if order_matters: + for read1, read2 in zip(bam1, bam2): + assert read1 == read2 + + else: + f1_reads = set() + + for r in bam1: + f1_reads.add(r) + + nreads_2 = 0 + for r in bam2: + assert r in f1_reads + nreads_2 += 1 + + assert nreads_2 == len(f1_reads) + + diff --git a/tox.ini b/tox.ini index d47c3f17..6529c7ac 100644 --- a/tox.ini +++ b/tox.ini @@ -28,6 +28,11 @@ deps = -rtest-requirements.txt commands = pytest tests/unit {posargs} +[testenv:integration] +deps = -rtest-requirements.txt +commands = + pytest tests/integration {posargs} + 
[testenv:lint] deps = pylint