Skip to content

Commit

Permalink
Fixed issues in segment with bulk libraries. (#141)
Browse files Browse the repository at this point in the history
* Fixed issues in `segment` with bulk libraries.

- Added better logging in `segment` for model annotations.
- `segment` now cues off the model `annotation_segments` for what annotations are required on output.
- `segment` now has repeatable order for `it` tag output (READ_CLIPPED_SEQS_LIST_TAG)
- Split `bam_utils.has_cbc_and_umi` into two methods.
- Added two new tags for demultiplexing indices in bulk (and other) libraries.
- Now `LibraryModel.has_umi_annotation` will cue off of both the raw and final UMI tags.
- Now `LibraryModel.has_cell_barcode_annotation` will cue off of both the raw and final CBC tags.
- Updated all models that annotate UMIs to annotate both the final and raw UMI tags.
- Fixed typo in the `mas_15_bulk_10x5p_single_internal` `array_element_structure`.
- Updated tests for `segment` to validate against expected output for 3 models (`mas_15_bulk_10x5p_single_internal`, `mas_10_sc_10x5p_single_none`, `mas_15_sc_10x5p_single_none`).
- Added unit test file for the model.
  • Loading branch information
jonn-smith authored Jul 20, 2022
1 parent f497d4a commit 3ca9a7e
Show file tree
Hide file tree
Showing 12 changed files with 182 additions and 73 deletions.
3 changes: 2 additions & 1 deletion src/longbow/extract/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def main(pbi, output_bam, force, base_padding, create_barcode_conf_file,
# Set up our barcode confidence file here:
barcode_conf_file = None
if create_barcode_conf_file:
if lb_model.has_barcode_annotation:
if lb_model.has_cell_barcode_annotation:
logger.info(f"Creating barcode confidence file: {longbow.utils.constants.BARCODE_CONF_FILE_NAME}")
barcode_conf_file = open(longbow.utils.constants.BARCODE_CONF_FILE_NAME, 'w')
else:
Expand Down Expand Up @@ -230,6 +230,7 @@ def main(pbi, output_bam, force, base_padding, create_barcode_conf_file,
)
)
else:
# Read is already segmented, so we don't have to do anything else:
segmented_reads = [read]

extracted_segment = False
Expand Down
58 changes: 40 additions & 18 deletions src/longbow/segment/command.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,21 +138,20 @@ def main(threads, output_bam, create_barcode_conf_file, model, ignore_cbc_and_um
else:
# Check to see if the model has a CBC / UMI. If not, we should turn on this flag
# and warn the user:

has_cbc_or_umi_annotation = False
if lb_model.annotation_segments:
for seg_tag_list in lb_model.annotation_segments.values():
for seg_tag, seg_pos_tag in seg_tag_list:
if seg_tag == longbow.utils.constants.READ_UMI_TAG or \
seg_tag == longbow.utils.constants.READ_BARCODE_TAG or \
seg_tag == longbow.utils.constants.READ_RAW_UMI_TAG or \
seg_tag == longbow.utils.constants.READ_RAW_BARCODE_TAG:
has_cbc_or_umi_annotation = True
break
if has_cbc_or_umi_annotation:
break
if lb_model.has_umi_annotation:
logger.info("Model has UMI annotation.")
else:
has_cbc_or_umi_annotation = True

if lb_model.has_cell_barcode_annotation:
logger.info("Model has Cell Barcode annotation.")
else:
has_cbc_or_umi_annotation = True

if not has_cbc_or_umi_annotation:
logger.warning("Model does not have CBC or UMI tags. All segments will be emitted.")
logger.warning("Model does not have Cell Barcode or UMI tags. All segments will be emitted.")
ignore_cbc_and_umi = True

out_header = bam_utils.create_bam_header_with_program_group(logger.name, bam_file.header, models=[lb_model])
Expand Down Expand Up @@ -241,13 +240,16 @@ def _sub_process_write_fn(

barcode_conf_file = None
if create_barcode_conf_file:
if model.has_barcode_annotation:
if model.has_cell_barcode_annotation:
logger.info(f"Creating barcode confidence file: {longbow.utils.constants.BARCODE_CONF_FILE_NAME}")
barcode_conf_file = open(longbow.utils.constants.BARCODE_CONF_FILE_NAME, 'w')
else:
logger.warning(f"Model does not have a barcode output, but barcode creation flag was given. "
f"Barcode confidence file will NOT be created.")

model_annotates_cbc = model.has_cell_barcode_annotation
model_annotates_umi = model.has_umi_annotation

with pysam.AlignmentFile(
out_bam_file_name, "wb", header=out_bam_header
) as out_bam_file:
Expand Down Expand Up @@ -280,6 +282,8 @@ def _sub_process_write_fn(
out_bam_file,
barcode_conf_file,
ignore_cbc_and_umi,
model_annotates_cbc,
model_annotates_umi
)

# Increment our counters:
Expand Down Expand Up @@ -514,7 +518,7 @@ def segment_read_with_bounded_region_algorithm(read, model, segments=None):


def _write_segmented_read(
model, read, segments, delimiters, bam_out, barcode_conf_file, ignore_cbc_and_umi
model, read, segments, delimiters, bam_out, barcode_conf_file, ignore_cbc_and_umi, model_annotates_cbc, model_annotates_umi
):
"""Split and write out the segments of each read to the given bam output file.
Expand Down Expand Up @@ -546,6 +550,8 @@ def _write_segmented_read(
delim_name,
prev_delim_name,
ignore_cbc_and_umi,
model_annotates_cbc,
model_annotates_umi,
sri
)

Expand All @@ -570,6 +576,8 @@ def _write_split_array_element(
delim_name,
prev_delim_name,
ignore_cbc_and_umi,
model_annotates_cbc,
model_annotates_umi,
split_read_index
):
"""Write out an individual array element that has been split out according to the given coordinates."""
Expand All @@ -579,10 +587,24 @@ def _write_split_array_element(
if barcode_conf_file is not None and a.has_tag(longbow.utils.constants.READ_BARCODE_CONF_FACTOR_TAG):
barcode_conf_file.write(f"{a.get_tag(longbow.utils.constants.READ_RAW_BARCODE_TAG)}\t{a.get_tag(longbow.utils.constants.READ_BARCODE_CONF_FACTOR_TAG)}\n")

has_cbc_and_umi = bam_utils.has_cbc_and_umi(a)

if has_cbc_and_umi or ignore_cbc_and_umi:
if ignore_cbc_and_umi:
bam_out.write(a)
return True
else:
# We have to check the CBC and UMI:
# TODO: Expand this for other annotation segments.
cbc_ok = (not model_annotates_cbc) or bam_utils.has_cbc(a)
umi_ok = (not model_annotates_umi) or bam_utils.has_umi(a)

if not cbc_ok:
logger.warning(f"Read {a.query_name} has no CBC. Ignoring.")
return False
elif not umi_ok:
logger.warning(f"Read {a.query_name} has no UMI. Ignoring.")
return False
else:
bam_out.write(a)
return True


def create_simple_split_array_element(delim_name, end_coord, model, prev_delim_name, read, segments, start_coord, split_read_index):
Expand Down Expand Up @@ -659,7 +681,7 @@ def create_simple_split_array_element(delim_name, end_coord, model, prev_delim_n
a.set_tag(longbow.utils.constants.READ_BARCODE_CONF_FACTOR_TAG, conf_factor)

# Set IsoSeq3-compatible tags:
a.set_tag(longbow.utils.constants.READ_CLIPPED_SEQS_LIST_TAG, ','.join(clipped_tags))
a.set_tag(longbow.utils.constants.READ_CLIPPED_SEQS_LIST_TAG, ','.join(sorted(clipped_tags)))
a.set_tag(longbow.utils.constants.READ_NUM_CONSENSUS_PASSES_TAG, 1)
a.set_tag(longbow.utils.constants.READ_ZMW_NAMES_TAG, read.query_name)
a.set_tag(longbow.utils.constants.READ_NUM_ZMWS_TAG, 1)
Expand Down
8 changes: 6 additions & 2 deletions src/longbow/utils/bam_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,8 +410,12 @@ def get_confidence_factor_raw_quals(quals: array.array, scale_factor: float = CO
)


def has_cbc_and_umi(read):
return read.has_tag(READ_RAW_BARCODE_TAG) and read.has_tag(READ_RAW_UMI_TAG)
def has_cbc(read):
return read.has_tag(READ_RAW_BARCODE_TAG)


def has_umi(read):
return read.has_tag(READ_RAW_UMI_TAG)


def get_model_name_from_bam_header(header):
Expand Down
3 changes: 3 additions & 0 deletions src/longbow/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,9 @@
READ_SPATIAL_BARCODE2_TAG = "X2"
READ_SPATIAL_BARCODE2_POS_TAG = "XQ"

READ_DEMUX_TAG = "id"
READ_DEMUX_POS_TAG = "ip"

READ_BGZF_VIRTUAL_OFFSET_TAG = "vo"

COULD_CORRECT_BARCODE_TAG = "YC" # True IFF barcode correction was able to be performed (including "correction" where the original barcode did not change). False otherwise.
Expand Down
48 changes: 24 additions & 24 deletions src/longbow/utils/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,21 +100,19 @@ def __init__(self,

@property
def has_umi_annotation(self):
return self._has_annotation(longbow.utils.constants.READ_RAW_UMI_TAG)
return self._has_annotation(longbow.utils.constants.READ_RAW_UMI_TAG) or self._has_annotation(longbow.utils.constants.READ_UMI_TAG)

@property
def has_barcode_annotation(self):
return self._has_annotation(longbow.utils.constants.READ_RAW_BARCODE_TAG)
def has_cell_barcode_annotation(self):
return self._has_annotation(longbow.utils.constants.READ_RAW_BARCODE_TAG) or self._has_annotation(longbow.utils.constants.READ_BARCODE_TAG)

def _has_annotation(self, annotation_tag):
if self.annotation_segments is None:
return False

# Get all annotation tags:
for k, tag_tuple_list in self.annotation_segments.items():
for tag_tuple in tag_tuple_list:
if annotation_tag in tag_tuple:
return True
if self.annotation_segments:
# Get all annotation tags:
for _, tag_tuple_list in self.annotation_segments.items():
for tag_tuple in tag_tuple_list:
if annotation_tag in tag_tuple:
return True

# If we're still running, we didn't find our annotation tag.
# Therefore we don't have the given annotation.
Expand Down Expand Up @@ -1308,7 +1306,7 @@ def build_pre_configured_model(model_name):
# Model naming convention:
# prefix (mas, isoseq)
# modality (bulk, sc, spatial)
# input library type (10x5p, 103p, slideseq)
# input library type (10x5p, 10x3p, slideseq)
# umi style (none, single, dual)
# plexity (pbkit, internal, none)
#
Expand Down Expand Up @@ -1411,8 +1409,8 @@ def build_pre_configured_model(model_name):
"named_random_segments": {"UMI", "cDNA", "CBC"},
"coding_region": "cDNA",
"annotation_segments": {
"UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG), (
longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
"UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG),
(longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
"CBC": [(longbow.utils.constants.READ_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG), (
longbow.utils.constants.READ_RAW_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG)],
},
Expand Down Expand Up @@ -1510,8 +1508,8 @@ def build_pre_configured_model(model_name):
"named_random_segments": {"UMI", "cDNA", "CBC"},
"coding_region": "cDNA",
"annotation_segments": {
"UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG), (
longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
"UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG),
(longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
"CBC": [(longbow.utils.constants.READ_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG), (
longbow.utils.constants.READ_RAW_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG)],
},
Expand All @@ -1536,7 +1534,7 @@ def build_pre_configured_model(model_name):
("L", "5p_Adapter", "UMI", "SLS", "cDNA", "Poly_A", "sample_index", "3p_Adapter"),
("M", "5p_Adapter", "UMI", "SLS", "cDNA", "Poly_A", "sample_index", "3p_Adapter"),
("N", "5p_Adapter", "UMI", "SLS", "cDNA", "Poly_A", "sample_index", "3p_Adapter"),
("O", "5p_Adapter", "UMI", "SLS", "cDNA", "Poly_A", "3p_Adapter", "sample_index", "P"),
("O", "5p_Adapter", "UMI", "SLS", "cDNA", "Poly_A", "sample_index", "3p_Adapter", "P"),
),
"adapters": {
"5p_Adapter": "TCTACACGACGCTCTTCCGATCT",
Expand Down Expand Up @@ -1610,8 +1608,9 @@ def build_pre_configured_model(model_name):
"named_random_segments": {"UMI", "cDNA", "sample_index"},
"coding_region": "cDNA",
"annotation_segments": {
"UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
"sample_index": [(longbow.utils.constants.READ_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG)],
"UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG),
(longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
"sample_index": [(longbow.utils.constants.READ_DEMUX_TAG, longbow.utils.constants.READ_DEMUX_POS_TAG)],
},
"deprecated": False,
},
Expand Down Expand Up @@ -1688,8 +1687,8 @@ def build_pre_configured_model(model_name):
"named_random_segments": {"UMI", "cDNA", "CBC"},
"coding_region": "cDNA",
"annotation_segments": {
"UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG), (
longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
"UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG),
(longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
"CBC": [(longbow.utils.constants.READ_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG), (
longbow.utils.constants.READ_RAW_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG)],
},
Expand Down Expand Up @@ -1796,7 +1795,8 @@ def build_pre_configured_model(model_name):
"named_random_segments": {"UMI", "SBC2", "SBC1", "cDNA"},
"coding_region": "cDNA",
"annotation_segments": {
"UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
"UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG),
(longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
"SBC1": [(longbow.utils.constants.READ_SPATIAL_BARCODE1_TAG,
longbow.utils.constants.READ_SPATIAL_BARCODE1_POS_TAG)],
"SBC2": [(longbow.utils.constants.READ_SPATIAL_BARCODE2_TAG,
Expand Down Expand Up @@ -1923,8 +1923,8 @@ def build_pre_configured_model(model_name):
"named_random_segments": {"UMI", "cDNA", "CBC"},
"coding_region": "cDNA",
"annotation_segments": {
"UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG), (
longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
"UMI": [(longbow.utils.constants.READ_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG),
(longbow.utils.constants.READ_RAW_UMI_TAG, longbow.utils.constants.READ_UMI_POS_TAG)],
"CBC": [(longbow.utils.constants.READ_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG), (
longbow.utils.constants.READ_RAW_BARCODE_TAG, longbow.utils.constants.READ_BARCODE_POS_TAG)],
},
Expand Down
51 changes: 42 additions & 9 deletions tests/integration/test_segment.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,52 +9,85 @@
from longbow.__main__ import main_entry as longbow

from ..utils import cat_file_to_pipe
from ..utils import convert_sam_to_bam
from ..utils import assert_reads_files_equal

################################################################################

TOOL_NAME = "segment"

TEST_DATA_FOLDER = path = os.path.abspath(
__file__ + os.path.sep + "../../" + os.path.sep + "test_data"
) + os.path.sep
EXPECTED_DATA_FOLDER = TEST_DATA_FOLDER + TOOL_NAME + os.path.sep

################################################################################


@pytest.fixture(scope="module", params=[
(TEST_DATA_FOLDER + "mas15_test_input.bam", "mas_15_sc_10x5p_single_none"),
(TEST_DATA_FOLDER + "mas10_test_input.bam", "mas_10_sc_10x5p_single_none"),
(TEST_DATA_FOLDER + "mas15_test_input.bam",
EXPECTED_DATA_FOLDER + "mas_15_sc_10x5p_single_none.expected.bam",
"mas_15_sc_10x5p_single_none"),
(TEST_DATA_FOLDER + "mas10_test_input.bam",
EXPECTED_DATA_FOLDER + "mas_10_sc_10x5p_single_none.expected.bam",
"mas_10_sc_10x5p_single_none"),
(TEST_DATA_FOLDER + "mas_15_bulk_10x5p_single_internal.bam",
EXPECTED_DATA_FOLDER + "mas_15_bulk_10x5p_single_internal.expected.bam",
"mas_15_bulk_10x5p_single_internal"),
])
def filtered_bam_file_from_pipeline(request):
input_bam, model_name = request.param
input_file, expected_file, model_name = request.param

if input_file.endswith(".bam"):
input_bam = input_file
else:
# Convert test file to bam:
with tempfile.NamedTemporaryFile(delete=True) as input_bam:
convert_sam_to_bam(input_file, input_bam)

with tempfile.NamedTemporaryFile(delete=True) as annotate_bam, \
tempfile.NamedTemporaryFile(delete=True) as filter_bam:

runner = CliRunner()

result_annotate = runner.invoke(longbow, ["annotate", "-m", model_name, "-f", "-o", annotate_bam.name, input_bam])
result_annotate = runner.invoke(longbow, ["annotate", "-t", 1, "-m", model_name,
"-f", "-o", annotate_bam.name, input_bam])
assert result_annotate.exit_code == 0

result_filter = runner.invoke(longbow, ["filter", "-m", model_name, "-f", "-o", filter_bam.name, annotate_bam.name])
result_filter = runner.invoke(longbow, ["filter", "-m", model_name,
"-f", "-o", filter_bam.name, annotate_bam.name])
assert result_filter.exit_code == 0

# Yield file here so that when we return, we get to clean up automatically
yield filter_bam.name
yield filter_bam.name, expected_file, model_name


def test_segment_from_file(tmpdir, filtered_bam_file_from_pipeline):
input_bam_file, expected_bam, model_name = filtered_bam_file_from_pipeline

actual_file = tmpdir.join(f"segment_actual_out.bam")
args = ["segment", "-f", "-o", actual_file, filtered_bam_file_from_pipeline]
args = ["segment", "-t", 1, "-f", "-o", actual_file, input_bam_file]

runner = CliRunner()
result = runner.invoke(longbow, args)

assert result.exit_code == 0
assert_reads_files_equal(actual_file, expected_bam, order_matters=True)


def test_segment_from_pipe(tmpdir, filtered_bam_file_from_pipeline):
input_bam_file, expected_bam, model_name = filtered_bam_file_from_pipeline

actual_file = tmpdir.join(f"filter_actual_out.pipe.bam")

proc = subprocess.Popen(
[ sys.executable, "-m", "longbow", "segment", "-f", "-o", actual_file ],
[sys.executable, "-m", "longbow", "segment", "-t", "1", "-f", "-o", actual_file],
stdin=subprocess.PIPE
)

cat_file_to_pipe(filtered_bam_file_from_pipeline, proc)
cat_file_to_pipe(input_bam_file, proc)

assert proc.returncode == 0
assert_reads_files_equal(actual_file, expected_bam, order_matters=True)
Binary file not shown.
Loading

0 comments on commit 3ca9a7e

Please sign in to comment.