Fixed issues in segment with bulk libraries. (#141)

* Fixed issues in `segment` with bulk libraries. - Added better logging in `segment` for model annotations. - `segment` now cues off the model `annotation_segments` for what annotations are required on output. - `segment` now has repeatable order for `it` tag output (READ_CLIPPED_SEQS_LIST_TAG) - Split `bam_utils.has_cbc_and_umi` into two methods. - Added two new tags for demultiplexing indices in bulk (and other) libraries. - Now `LibraryModel.has_umi_annotation` will cue off of both the raw and final UMI tags. - Now `LibraryModel.has_cell_barcode_annotation` will cue off of both the raw and final CBC tags. - Updated all models that annotate UMIs to annotate both the final and raw UMI tags. - Fixed typo in the `mas_15_bulk_10x5p_single_internal` `array_element_structure`. - Updated tests for `segment` to validate against expected output for 3 models (`mas_15_bulk_10x5p_single_internal`, `mas_10_sc_10x5p_single_none`, `mas_15_sc_10x5p_single_none`). - Added unit test file for the model.
broadinstitute · Jul 20, 2022 · 3ca9a7e · 3ca9a7e
1 parent f497d4a
commit 3ca9a7e
Show file tree

Hide file tree

Showing 12 changed files with 182 additions and 73 deletions.
diff --git a/src/longbow/extract/command.py b/src/longbow/extract/command.py
@@ -185,7 +185,7 @@ def main(pbi, output_bam, force, base_padding, create_barcode_conf_file,
         # Set up our barcode confidence file here:
         barcode_conf_file = None
         if create_barcode_conf_file:
-            if lb_model.has_barcode_annotation:
+            if lb_model.has_cell_barcode_annotation:
                 logger.info(f"Creating barcode confidence file: {longbow.utils.constants.BARCODE_CONF_FILE_NAME}")
                 barcode_conf_file = open(longbow.utils.constants.BARCODE_CONF_FILE_NAME, 'w')
             else:
@@ -230,6 +230,7 @@ def main(pbi, output_bam, force, base_padding, create_barcode_conf_file,
                             )
                         )
                 else:
+                    # Read is already segmented, so we don't have to do anything else:
                     segmented_reads = [read]
 
                 extracted_segment = False

diff --git a/src/longbow/segment/command.py b/src/longbow/segment/command.py
@@ -138,21 +138,20 @@ def main(threads, output_bam, create_barcode_conf_file, model, ignore_cbc_and_um
         else:
             # Check to see if the model has a CBC / UMI.  If not, we should turn on this flag
             # and warn the user:
+
             has_cbc_or_umi_annotation = False
-            if lb_model.annotation_segments:
-                for seg_tag_list in lb_model.annotation_segments.values():
-                    for seg_tag, seg_pos_tag in seg_tag_list:
-                        if seg_tag == longbow.utils.constants.READ_UMI_TAG or \
-                                seg_tag == longbow.utils.constants.READ_BARCODE_TAG or \
-                                seg_tag == longbow.utils.constants.READ_RAW_UMI_TAG or \
-                                seg_tag == longbow.utils.constants.READ_RAW_BARCODE_TAG:
-                            has_cbc_or_umi_annotation = True
-                            break
-                    if has_cbc_or_umi_annotation:
-                        break
+            if lb_model.has_umi_annotation:
+                logger.info("Model has UMI annotation.")
+            else:
+                has_cbc_or_umi_annotation = True
+
+            if lb_model.has_cell_barcode_annotation:
+                logger.info("Model has Cell Barcode annotation.")
+            else:
+                has_cbc_or_umi_annotation = True
 
             if not has_cbc_or_umi_annotation:
-                logger.warning("Model does not have CBC or UMI tags.  All segments will be emitted.")
+                logger.warning("Model does not have Cell Barcode or UMI tags.  All segments will be emitted.")
                 ignore_cbc_and_umi = True
 
         out_header = bam_utils.create_bam_header_with_program_group(logger.name, bam_file.header, models=[lb_model])
@@ -241,13 +240,16 @@ def _sub_process_write_fn(
 
     barcode_conf_file = None
     if create_barcode_conf_file:
-        if model.has_barcode_annotation:
+        if model.has_cell_barcode_annotation:
             logger.info(f"Creating barcode confidence file: {longbow.utils.constants.BARCODE_CONF_FILE_NAME}")
             barcode_conf_file = open(longbow.utils.constants.BARCODE_CONF_FILE_NAME, 'w')
         else:
             logger.warning(f"Model does not have a barcode output, but barcode creation flag was given.  "
                            f"Barcode confidence file will NOT be created.")
 
+    model_annotates_cbc = model.has_cell_barcode_annotation
+    model_annotates_umi = model.has_umi_annotation
+
     with pysam.AlignmentFile(
         out_bam_file_name, "wb", header=out_bam_header
     ) as out_bam_file:
@@ -280,6 +282,8 @@ def _sub_process_write_fn(
                 out_bam_file,
                 barcode_conf_file,
                 ignore_cbc_and_umi,
+                model_annotates_cbc,
+                model_annotates_umi
             )
 
             # Increment our counters:
@@ -514,7 +518,7 @@ def segment_read_with_bounded_region_algorithm(read, model, segments=None):
 
 
 def _write_segmented_read(
-    model, read, segments, delimiters, bam_out, barcode_conf_file, ignore_cbc_and_umi
+    model, read, segments, delimiters, bam_out, barcode_conf_file, ignore_cbc_and_umi, model_annotates_cbc, model_annotates_umi
 ):
     """Split and write out the segments of each read to the given bam output file.
 
@@ -546,6 +550,8 @@ def _write_segmented_read(
             delim_name,
             prev_delim_name,
             ignore_cbc_and_umi,
+            model_annotates_cbc,
+            model_annotates_umi,
             sri
         )
 
@@ -570,6 +576,8 @@ def _write_split_array_element(
     delim_name,
     prev_delim_name,
     ignore_cbc_and_umi,
+    model_annotates_cbc,
+    model_annotates_umi,
     split_read_index
 ):
     """Write out an individual array element that has been split out according to the given coordinates."""
@@ -579,10 +587,24 @@ def _write_split_array_element(
     if barcode_conf_file is not None and a.has_tag(longbow.utils.constants.READ_BARCODE_CONF_FACTOR_TAG):
         barcode_conf_file.write(f"{a.get_tag(longbow.utils.constants.READ_RAW_BARCODE_TAG)}\t{a.get_tag(longbow.utils.constants.READ_BARCODE_CONF_FACTOR_TAG)}\n")
 
-    has_cbc_and_umi = bam_utils.has_cbc_and_umi(a)
-
-    if has_cbc_and_umi or ignore_cbc_and_umi:
+    if ignore_cbc_and_umi:
         bam_out.write(a)
+        return True
+    else:
+        # We have to check the CBC and UMI:
+        # TODO: Expand this for other annotation segments.
+        cbc_ok = (not model_annotates_cbc) or bam_utils.has_cbc(a)
+        umi_ok = (not model_annotates_umi) or bam_utils.has_umi(a)
+
+        if not cbc_ok:
+            logger.warning(f"Read {a.query_name} has no CBC.  Ignoring.")
+            return False
+        elif not umi_ok:
+            logger.warning(f"Read {a.query_name} has no UMI.  Ignoring.")
+            return False
+        else:
+            bam_out.write(a)
+            return True
 
 
 def create_simple_split_array_element(delim_name, end_coord, model, prev_delim_name, read, segments, start_coord, split_read_index):
@@ -659,7 +681,7 @@ def create_simple_split_array_element(delim_name, end_coord, model, prev_delim_n
                 a.set_tag(longbow.utils.constants.READ_BARCODE_CONF_FACTOR_TAG, conf_factor)
 
     # Set IsoSeq3-compatible tags:
-    a.set_tag(longbow.utils.constants.READ_CLIPPED_SEQS_LIST_TAG, ','.join(clipped_tags))
+    a.set_tag(longbow.utils.constants.READ_CLIPPED_SEQS_LIST_TAG, ','.join(sorted(clipped_tags)))
     a.set_tag(longbow.utils.constants.READ_NUM_CONSENSUS_PASSES_TAG, 1)
     a.set_tag(longbow.utils.constants.READ_ZMW_NAMES_TAG, read.query_name)
     a.set_tag(longbow.utils.constants.READ_NUM_ZMWS_TAG, 1)

diff --git a/src/longbow/utils/bam_utils.py b/src/longbow/utils/bam_utils.py
@@ -410,8 +410,12 @@ def get_confidence_factor_raw_quals(quals: array.array, scale_factor: float = CO
     )
 
 
-def has_cbc_and_umi(read):
-    return read.has_tag(READ_RAW_BARCODE_TAG) and read.has_tag(READ_RAW_UMI_TAG)
+def has_cbc(read):
+    return read.has_tag(READ_RAW_BARCODE_TAG)
+
+
+def has_umi(read):
+    return read.has_tag(READ_RAW_UMI_TAG)
 
 
 def get_model_name_from_bam_header(header):

diff --git a/src/longbow/utils/constants.py b/src/longbow/utils/constants.py
@@ -72,6 +72,9 @@
 READ_SPATIAL_BARCODE2_TAG = "X2"
 READ_SPATIAL_BARCODE2_POS_TAG = "XQ"
 
+READ_DEMUX_TAG = "id"
+READ_DEMUX_POS_TAG = "ip"
+
 READ_BGZF_VIRTUAL_OFFSET_TAG = "vo"
 
 COULD_CORRECT_BARCODE_TAG = "YC"  # True IFF barcode correction was able to be performed (including "correction" where the original barcode did not change).  False otherwise.

diff --git a/src/longbow/utils/model.py b/src/longbow/utils/model.py
@@ -100,21 +100,19 @@ def __init__(self,
 
     @property
     def has_umi_annotation(self):
-        return self._has_annotation(longbow.utils.constants.READ_RAW_UMI_TAG)
+        return self._has_annotation(longbow.utils.constants.READ_RAW_UMI_TAG) or self._has_annotation(longbow.utils.constants.READ_UMI_TAG)
 
     @property
-    def has_barcode_annotation(self):
-        return self._has_annotation(longbow.utils.constants.READ_RAW_BARCODE_TAG)
+    def has_cell_barcode_annotation(self):
+        return self._has_annotation(longbow.utils.constants.READ_RAW_BARCODE_TAG) or self._has_annotation(longbow.utils.constants.READ_BARCODE_TAG)
 
     def _has_annotation(self, annotation_tag):
-        if self.annotation_segments is None:
-            return False
-
-        # Get all annotation tags:
-        for k, tag_tuple_list in self.annotation_segments.items():
-            for tag_tuple in tag_tuple_list:
-                if annotation_tag in tag_tuple:
-                    return True
+        if self.annotation_segments:
+            # Get all annotation tags:
+            for _, tag_tuple_list in self.annotation_segments.items():
+                for tag_tuple in tag_tuple_list:
+                    if annotation_tag in tag_tuple:
+                        return True
 
         # If we're still running, we didn't find our annotation tag.
         # Therefore we don't have the given annotation.
@@ -1308,7 +1306,7 @@ def build_pre_configured_model(model_name):
     # Model naming convention:
     #     prefix (mas, isoseq)
     #     modality (bulk, sc, spatial)
-    #     input library type (10x5p, 103p, slideseq)
+    #     input library type (10x5p, 10x3p, slideseq)
     #     umi style (none, single, dual)
     #     plexity (pbkit, internal, none)
     #
@@ -1411,8 +1409,8 @@ def build_pre_configured_model(model_name):
             "named_random_segments": {"UMI", "cDNA", "CBC"},
             "coding_region": "cDNA",
             "annotation_segments": {
-                "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG), (
-                longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
+                "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG),
+                        (longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
                 "CBC": [(longbow.utils.constants.READ_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG), (
                 longbow.utils.constants.READ_RAW_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG)],
             },
@@ -1510,8 +1508,8 @@ def build_pre_configured_model(model_name):
             "named_random_segments": {"UMI", "cDNA", "CBC"},
             "coding_region": "cDNA",
             "annotation_segments": {
-                "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG), (
-                longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
+                "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG),
+                        (longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
                 "CBC": [(longbow.utils.constants.READ_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG), (
                 longbow.utils.constants.READ_RAW_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG)],
             },
@@ -1536,7 +1534,7 @@ def build_pre_configured_model(model_name):
                 ("L", "5p_Adapter", "UMI", "SLS", "cDNA", "Poly_A", "sample_index", "3p_Adapter"),
                 ("M", "5p_Adapter", "UMI", "SLS", "cDNA", "Poly_A", "sample_index", "3p_Adapter"),
                 ("N", "5p_Adapter", "UMI", "SLS", "cDNA", "Poly_A", "sample_index", "3p_Adapter"),
-                ("O", "5p_Adapter", "UMI", "SLS", "cDNA", "Poly_A", "3p_Adapter", "sample_index", "P"),
+                ("O", "5p_Adapter", "UMI", "SLS", "cDNA", "Poly_A", "sample_index", "3p_Adapter", "P"),
             ),
             "adapters": {
                 "5p_Adapter": "TCTACACGACGCTCTTCCGATCT",
@@ -1610,8 +1608,9 @@ def build_pre_configured_model(model_name):
             "named_random_segments": {"UMI", "cDNA", "sample_index"},
             "coding_region": "cDNA",
             "annotation_segments": {
-                "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
-                "sample_index": [(longbow.utils.constants.READ_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG)],
+                "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG),
+                        (longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
+                "sample_index": [(longbow.utils.constants.READ_DEMUX_TAG, longbow.utils.constants.READ_DEMUX_POS_TAG)],
             },
             "deprecated": False,
         },
@@ -1688,8 +1687,8 @@ def build_pre_configured_model(model_name):
             "named_random_segments": {"UMI", "cDNA", "CBC"},
             "coding_region": "cDNA",
             "annotation_segments": {
-                "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG), (
-                    longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
+                "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG),
+                        (longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
                 "CBC": [(longbow.utils.constants.READ_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG), (
                     longbow.utils.constants.READ_RAW_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG)],
             },
@@ -1796,7 +1795,8 @@ def build_pre_configured_model(model_name):
             "named_random_segments": {"UMI", "SBC2", "SBC1", "cDNA"},
             "coding_region": "cDNA",
             "annotation_segments": {
-                "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
+                "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG),
+                        (longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
                 "SBC1": [(longbow.utils.constants.READ_SPATIAL_BARCODE1_TAG,
                          longbow.utils.constants.READ_SPATIAL_BARCODE1_POS_TAG)],
                 "SBC2": [(longbow.utils.constants.READ_SPATIAL_BARCODE2_TAG,
@@ -1923,8 +1923,8 @@ def build_pre_configured_model(model_name):
             "named_random_segments": {"UMI", "cDNA", "CBC"},
             "coding_region": "cDNA",
             "annotation_segments": {
-                "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG), (
-                longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
+                "UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG),
+                        (longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
                 "CBC": [(longbow.utils.constants.READ_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG), (
                 longbow.utils.constants.READ_RAW_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG)],
             },

diff --git a/tests/integration/test_segment.py b/tests/integration/test_segment.py
@@ -9,52 +9,85 @@
 from longbow.__main__ import main_entry as longbow
 
 from ..utils import cat_file_to_pipe
+from ..utils import convert_sam_to_bam
+from ..utils import assert_reads_files_equal
+
+################################################################################
+
+TOOL_NAME = "segment"
 
 TEST_DATA_FOLDER = path = os.path.abspath(
     __file__ + os.path.sep + "../../" + os.path.sep + "test_data"
 ) + os.path.sep
+EXPECTED_DATA_FOLDER = TEST_DATA_FOLDER + TOOL_NAME + os.path.sep
+
+################################################################################
 
 
 @pytest.fixture(scope="module", params=[
-    (TEST_DATA_FOLDER + "mas15_test_input.bam", "mas_15_sc_10x5p_single_none"),
-    (TEST_DATA_FOLDER + "mas10_test_input.bam", "mas_10_sc_10x5p_single_none"),
+    (TEST_DATA_FOLDER + "mas15_test_input.bam",
+     EXPECTED_DATA_FOLDER + "mas_15_sc_10x5p_single_none.expected.bam",
+     "mas_15_sc_10x5p_single_none"),
+
+    (TEST_DATA_FOLDER + "mas10_test_input.bam",
+     EXPECTED_DATA_FOLDER + "mas_10_sc_10x5p_single_none.expected.bam",
+     "mas_10_sc_10x5p_single_none"),
+
+    (TEST_DATA_FOLDER + "mas_15_bulk_10x5p_single_internal.bam",
+     EXPECTED_DATA_FOLDER + "mas_15_bulk_10x5p_single_internal.expected.bam",
+     "mas_15_bulk_10x5p_single_internal"),
 ])
 def filtered_bam_file_from_pipeline(request):
-    input_bam, model_name = request.param
+    input_file, expected_file, model_name = request.param
+
+    if input_file.endswith(".bam"):
+        input_bam = input_file
+    else:
+        # Convert test file to bam:
+        with tempfile.NamedTemporaryFile(delete=True) as input_bam:
+            convert_sam_to_bam(input_file, input_bam)
 
     with tempfile.NamedTemporaryFile(delete=True) as annotate_bam, \
          tempfile.NamedTemporaryFile(delete=True) as filter_bam:
 
         runner = CliRunner()
 
-        result_annotate = runner.invoke(longbow, ["annotate", "-m", model_name, "-f", "-o", annotate_bam.name, input_bam])
+        result_annotate = runner.invoke(longbow, ["annotate", "-t", 1, "-m", model_name,
+                                                  "-f", "-o", annotate_bam.name, input_bam])
         assert result_annotate.exit_code == 0
 
-        result_filter = runner.invoke(longbow, ["filter",   "-m", model_name, "-f", "-o", filter_bam.name,   annotate_bam.name])
+        result_filter = runner.invoke(longbow, ["filter", "-m", model_name,
+                                                "-f", "-o", filter_bam.name, annotate_bam.name])
         assert result_filter.exit_code == 0
 
         # Yield file here so that when we return, we get to clean up automatically
-        yield filter_bam.name
+        yield filter_bam.name, expected_file, model_name
 
 
 def test_segment_from_file(tmpdir, filtered_bam_file_from_pipeline):
+    input_bam_file, expected_bam, model_name = filtered_bam_file_from_pipeline
+
     actual_file = tmpdir.join(f"segment_actual_out.bam")
-    args = ["segment", "-f", "-o", actual_file, filtered_bam_file_from_pipeline]
+    args = ["segment", "-t", 1, "-f", "-o", actual_file, input_bam_file]
 
     runner = CliRunner()
     result = runner.invoke(longbow, args)
 
     assert result.exit_code == 0
+    assert_reads_files_equal(actual_file, expected_bam, order_matters=True)
 
 
 def test_segment_from_pipe(tmpdir, filtered_bam_file_from_pipeline):
+    input_bam_file, expected_bam, model_name = filtered_bam_file_from_pipeline
+
     actual_file = tmpdir.join(f"filter_actual_out.pipe.bam")
 
     proc = subprocess.Popen(
-        [ sys.executable, "-m", "longbow", "segment", "-f", "-o", actual_file ],
+        [sys.executable, "-m", "longbow", "segment", "-t", "1", "-f", "-o", actual_file],
         stdin=subprocess.PIPE
     )
 
-    cat_file_to_pipe(filtered_bam_file_from_pipeline, proc)
+    cat_file_to_pipe(input_bam_file, proc)
 
     assert proc.returncode == 0
+    assert_reads_files_equal(actual_file, expected_bam, order_matters=True)
diff --git a/tests/test_data/mas_15_bulk_10x5p_single_internal.bam b/tests/test_data/mas_15_bulk_10x5p_single_internal.bam