Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement simple match extend #183

Merged
merged 5 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 8 additions & 3 deletions big_scape/cli/cli_common_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -305,10 +305,15 @@ def common_cluster_query(fn):
),
click.option(
"--extend_strategy",
type=click.Choice(["legacy", "greedy"]),
type=click.Choice(["legacy", "greedy", "simple_match"]),
default="legacy",
help="Strategy to extend BGCs. 'legacy' will use the original BiG-SCAPE extension strategy, "
"while 'greedy' will use a new greedy extension strategy. (default: legacy).",
help="Strategy to extend BGCs. (default: legacy)\n"
" legacy: will use the original BiG-SCAPE extension strategy\n"
" greedy: will use an approach where the broadest comparable region is selected "
"that contains matching domains between the two regions in a pair. "
"See the documentation for more details\n"
" simple_match: Will use an extension approach with only matches and gaps, ignoring "
"the type of domain, occurence or position of the domain. See the documentation for more details.",
nlouwen marked this conversation as resolved.
Show resolved Hide resolved
),
# networking parameters
click.option(
Expand Down
104 changes: 104 additions & 0 deletions big_scape/comparison/extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,3 +592,107 @@ def extend_greedy(pair: RecordPair) -> None:

logging.debug("after greedy extend:")
logging.debug(pair.comparable_region)


def extend_simple_match(pair: RecordPair, match, gap):
"""Performs extension by first creating a simple match matrix, then
performing a match/gap extentsion similar to legaxy expansion
nlouwen marked this conversation as resolved.
Show resolved Hide resolved

This method expects LCS to have been performed on the pair, and will
do all four directions at once

Args:
pair: The record pair to extend
match: The score for a match
gap: The score for a gap
"""

# I *think* the simplest way to do this is to create a list of domains with their source cds. for reasons.
a_domains = []
b_domains = []

# so we'll do a loop through cds and through domains to keep track of everything
for cds_idx, cds in enumerate(pair.record_a.get_cds()):
nlouwen marked this conversation as resolved.
Show resolved Hide resolved
for domain in cds.hsps:
a_domains.append((domain, cds_idx))

for cds_idx, cds in enumerate(pair.record_b.get_cds()):
nlouwen marked this conversation as resolved.
Show resolved Hide resolved
for domain in cds.hsps:
b_domains.append((domain, cds_idx))

# get the common domains
common_domains = set([a[0] for a in a_domains]).intersection(
set([b[0] for b in b_domains])
)

# TODO: this is obviously repetitive, but beyond extracting a function I'm
# not sure how to make it better
# for now I'd prefer this to be obvious and clear over looking good

# a forward
score = 0
max_score = 0
for i in range(pair.comparable_region.lcs_a_stop + 1, len(a_domains)):
nlouwen marked this conversation as resolved.
Show resolved Hide resolved
domain = a_domains[i][0]

if domain not in common_domains:
score += gap
else:
score += match

if score > max_score:
cds_idx = a_domains[i][1]
max_score = score
pair.comparable_region.a_stop = cds_idx
pair.comparable_region.domain_a_stop = i
nlouwen marked this conversation as resolved.
Show resolved Hide resolved

# a reverse
score = 0
max_score = 0
for i in range(pair.comparable_region.lcs_a_start - 1, -1, -1):
nlouwen marked this conversation as resolved.
Show resolved Hide resolved
domain = a_domains[i][0]

if domain not in common_domains:
score += gap
else:
score += match

if score > max_score:
cds_idx = a_domains[i][1]
max_score = score
pair.comparable_region.a_start = cds_idx
pair.comparable_region.domain_a_start = i

# b forward
score = 0
max_score = 0
for i in range(pair.comparable_region.lcs_b_stop + 1, len(b_domains)):
nlouwen marked this conversation as resolved.
Show resolved Hide resolved
domain = b_domains[i][0]

if domain not in common_domains:
score += gap
else:
score += match

if score > max_score:
cds_idx = b_domains[i][1]
max_score = score
pair.comparable_region.b_stop = cds_idx
pair.comparable_region.domain_b_stop = i
nlouwen marked this conversation as resolved.
Show resolved Hide resolved

# b reverse
score = 0
max_score = 0
for i in range(pair.comparable_region.lcs_b_start - 1, -1, -1):
nlouwen marked this conversation as resolved.
Show resolved Hide resolved
domain = b_domains[i][0]

if domain not in common_domains:
score += gap
else:
score += match

if score > max_score:
cds_idx = b_domains[i][1]
max_score = score
pair.comparable_region.b_start = cds_idx
pair.comparable_region.domain_b_start = i
9 changes: 9 additions & 0 deletions big_scape/comparison/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
from .extend import (
extend,
extend_greedy,
extend_simple_match,
reset,
len_check,
biosynthetic_check,
Expand Down Expand Up @@ -319,9 +320,17 @@ def expand_pair(pair: RecordPair, alignment_mode: bs_enums.ALIGNMENT_MODE) -> bo
BigscapeConfig.EXPAND_GAP_SCORE,
BigscapeConfig.EXPAND_MAX_MATCH_PERC,
)

if click_context.obj["extend_strategy"] == "greedy":
extend_greedy(pair)

if click_context.obj["extend_strategy"] == "simple_match":
extend_simple_match(
pair,
BigscapeConfig.EXPAND_MATCH_SCORE,
BigscapeConfig.EXPAND_GAP_SCORE,
)

# after local expansion, additionally expand shortest arms in glocal/auto
if alignment_mode != bs_enums.ALIGNMENT_MODE.LOCAL:
expand_glocal(pair)
Expand Down
51 changes: 51 additions & 0 deletions test/comparison/test_extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,3 +882,54 @@ def test_expand_greedy(self):
]

self.assertTrue(all(conditions))

def test_match_extend(self):
"""Tests the new match extend implementation

This implmentation will be more lenient for domain matches which are
present on the other side of the lcs. Altogether this should be a more relaxed
approach to extension and will result in larger comparable regions

The following example:

A: XXXXXCXXABXXXX
B: XXXXABCXXX
LCS: ^^

Would fail to extend to C in the legacy extend implementation, but should extend
in the new implementation given a match score of 5 and mismatch of -2
"""

a_cds, b_cds = generate_mock_cds_lists(14, 10, [6, 9, 10], [5, 6, 7], False)
record_a = generate_mock_region(a_cds)
record_b = generate_mock_region(b_cds)
pair = big_scape.comparison.record_pair.RecordPair(record_a, record_b)

# emulate LCS
pair.comparable_region = bs_comp.ComparableRegion(
9, 10, 6, 7, 9, 10, 6, 7, False
nlouwen marked this conversation as resolved.
Show resolved Hide resolved
)

bs_comp.extend.extend_simple_match(pair, 5, -2)

expected_comparable_region = bs_comp.ComparableRegion(
6, 10, 5, 7, 6, 10, 5, 7, False
nlouwen marked this conversation as resolved.
Show resolved Hide resolved
)

self.assertEqual(pair.comparable_region, expected_comparable_region)
self.assertEqual(
pair.comparable_region.domain_a_start,
expected_comparable_region.domain_a_start,
)
self.assertEqual(
pair.comparable_region.domain_b_start,
expected_comparable_region.domain_b_start,
)
self.assertEqual(
pair.comparable_region.domain_a_stop,
expected_comparable_region.domain_a_stop,
)
self.assertEqual(
pair.comparable_region.domain_b_stop,
expected_comparable_region.domain_b_stop,
)
nlouwen marked this conversation as resolved.
Show resolved Hide resolved
Loading