Skip to content

Commit

Permalink
Merge pull request #183 from medema-group/feature/match-extend
Browse files Browse the repository at this point in the history
Implement simple match extend
  • Loading branch information
adraismawur authored Oct 1, 2024
2 parents c5f60df + 8b2f21e commit 6a07618
Show file tree
Hide file tree
Showing 5 changed files with 207 additions and 2 deletions.
4 changes: 2 additions & 2 deletions big_scape/cli/cli_common_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,11 +309,11 @@ def common_cluster_query(fn):
),
click.option(
"--extend_strategy",
type=click.Choice(["legacy", "greedy"]),
type=click.Choice(["legacy", "greedy", "simple_match"]),
default="legacy",
callback=validate_extend_strategy,
help="Strategy to extend BGCs. 'legacy' will use the original BiG-SCAPE extension strategy, "
"while 'greedy' will use a new greedy extension strategy. For an in depth description,"
"while 'greedy' or 'simple_match' will use new extension strategies. For an in depth description,"
" see the wiki. (default: legacy).",
),
# networking parameters
Expand Down
104 changes: 104 additions & 0 deletions big_scape/comparison/extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,3 +592,107 @@ def extend_greedy(pair: RecordPair) -> None:

logging.debug("after greedy extend:")
logging.debug(pair.comparable_region)


def extend_simple_match(pair: RecordPair, match, gap):
"""Performs extension by first creating a simple match matrix, then
performing a match/gap extentsion similar to legacy expansion
This method expects LCS to have been performed on the pair, and will
do all four directions at once
Args:
pair: The record pair to extend
match: The score for a match
gap: The score for a gap
"""

# I *think* the simplest way to do this is to create a list of domains with their source cds. for reasons.
a_domains = []
b_domains = []

# so we'll do a loop through cds and through domains to keep track of everything
for cds_idx, cds in enumerate(pair.record_a.get_cds_with_domains()):
for domain in cds.hsps:
a_domains.append((domain, cds_idx))

for cds_idx, cds in enumerate(pair.record_b.get_cds_with_domains()):
for domain in cds.hsps:
b_domains.append((domain, cds_idx))

# get the common domains
common_domains = set([a[0] for a in a_domains]).intersection(
set([b[0] for b in b_domains])
)

# TODO: this is obviously repetitive, but beyond extracting a function I'm
# not sure how to make it better
# for now I'd prefer this to be obvious and clear over looking good

# a forward
score = 0
max_score = 0
for i in range(pair.comparable_region.lcs_domain_a_stop, len(a_domains)):
domain = a_domains[i][0]

if domain not in common_domains:
score += gap
else:
score += match

if score > max_score:
cds_idx = a_domains[i][1]
max_score = score
pair.comparable_region.a_stop = cds_idx + 1
pair.comparable_region.domain_a_stop = i + 1

# a reverse
score = 0
max_score = 0
for i in range(pair.comparable_region.lcs_domain_a_start - 1, -1, -1):
domain = a_domains[i][0]

if domain not in common_domains:
score += gap
else:
score += match

if score > max_score:
cds_idx = a_domains[i][1]
max_score = score
pair.comparable_region.a_start = cds_idx
pair.comparable_region.domain_a_start = i

# b forward
score = 0
max_score = 0
for i in range(pair.comparable_region.lcs_domain_b_stop, len(b_domains)):
domain = b_domains[i][0]

if domain not in common_domains:
score += gap
else:
score += match

if score > max_score:
cds_idx = b_domains[i][1]
max_score = score
pair.comparable_region.b_stop = cds_idx + 1
pair.comparable_region.domain_b_stop = i + 1

# b reverse
score = 0
max_score = 0
for i in range(pair.comparable_region.lcs_domain_b_start - 1, -1, -1):
domain = b_domains[i][0]

if domain not in common_domains:
score += gap
else:
score += match

if score > max_score:
cds_idx = b_domains[i][1]
max_score = score
pair.comparable_region.b_start = cds_idx
pair.comparable_region.domain_b_start = i
8 changes: 8 additions & 0 deletions big_scape/comparison/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from .extend import (
extend,
extend_greedy,
extend_simple_match,
reset,
len_check,
biosynthetic_check,
Expand Down Expand Up @@ -320,6 +321,13 @@ def expand_pair(
if extend_strategy == bs_enums.EXTEND_STRATEGY.GREEDY:
extend_greedy(pair)

if extend_strategy == bs_enums.EXTEND_STRATEGY.SIMPLE_MATCH:
extend_simple_match(
pair,
BigscapeConfig.EXPAND_MATCH_SCORE,
BigscapeConfig.EXPAND_GAP_SCORE,
)

# after local expansion, additionally expand shortest arms in glocal/auto
if alignment_mode != bs_enums.ALIGNMENT_MODE.LOCAL:
expand_glocal(pair)
Expand Down
1 change: 1 addition & 0 deletions big_scape/enums/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class ALIGNMENT_MODE(Enum):
class EXTEND_STRATEGY(Enum):
LEGACY = "legacy"
GREEDY = "greedy"
SIMPLE_MATCH = "simple_match"


class COMPARISON_MODE(Enum):
Expand Down
92 changes: 92 additions & 0 deletions test/comparison/test_extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,3 +882,95 @@ def test_expand_greedy(self):
]

self.assertTrue(all(conditions))

def test_match_extend(self):
"""Tests the new match extend implementation
This implmentation will be more lenient for domain matches which are
present on the other side of the lcs. Altogether this should be a more relaxed
approach to extension and will result in larger comparable regions
The following example:
A: XXXXXCXXABXXXX
B: XXXXABCXXX
LCS: ^^
Would fail to extend to C in the legacy extend implementation, but should extend
in the new implementation given a match score of 5 and mismatch of -2
"""

a_cds, b_cds = generate_mock_cds_lists(14, 10, [6, 9, 10], [5, 6, 7], False)
record_a = generate_mock_region(a_cds)
record_b = generate_mock_region(b_cds)
pair = big_scape.comparison.record_pair.RecordPair(record_a, record_b)

# emulate LCS
pair.comparable_region = bs_comp.ComparableRegion(
9, 11, 6, 8, 9, 11, 6, 8, False
)

bs_comp.extend.extend_simple_match(pair, 5, -2)

expected_comparable_region = bs_comp.ComparableRegion(
6, 11, 5, 8, 6, 11, 5, 8, False
)

self.assertEqual(pair.comparable_region, expected_comparable_region)
self.assertEqual(
pair.comparable_region.domain_a_start,
expected_comparable_region.domain_a_start,
)
self.assertEqual(
pair.comparable_region.domain_b_start,
expected_comparable_region.domain_b_start,
)
self.assertEqual(
pair.comparable_region.domain_a_stop,
expected_comparable_region.domain_a_stop,
)
self.assertEqual(
pair.comparable_region.domain_b_stop,
expected_comparable_region.domain_b_stop,
)

def test_expand_simple_match_multi_domain(self):
"""Tests glocal expand with multi domain cdss"""
# brackets indicate a cds with multiple domains
#
# A: [XX]BX[A BC]DEX XXXX
# B: XXXXXXXXXX XX A[BC]EDX[XXXX]
#
a_cds, b_cds = generate_mock_cds_lists(
10, 17, [3, 3, 3, 4, 5], [12, 13, 13, 15, 14], False
)
a_cds[0].hsps.append(a_cds[0].hsps[0])
a_cds[1].hsps = [a_cds[3].hsps[1]]
b_cds[-1].hsps.extend([b_cds[-1].hsps[0]] * 3)
record_a = generate_mock_region(a_cds)
record_b = generate_mock_region(b_cds)
pair = big_scape.comparison.record_pair.RecordPair(record_a, record_b)
pair.comparable_region = bs_comp.ComparableRegion(
3, 4, 12, 14, 4, 7, 12, 15, False
)
bs_comp.extend.extend_simple_match(pair, 5, -2)
expected_comparable_region = bs_comp.ComparableRegion(
1, 6, 12, 16, 2, 9, 12, 17, False
)
self.assertEqual(pair.comparable_region, expected_comparable_region)
self.assertEqual(
pair.comparable_region.domain_a_start,
expected_comparable_region.domain_a_start,
)
self.assertEqual(
pair.comparable_region.domain_b_start,
expected_comparable_region.domain_b_start,
)
self.assertEqual(
pair.comparable_region.domain_a_stop,
expected_comparable_region.domain_a_stop,
)
self.assertEqual(
pair.comparable_region.domain_b_stop,
expected_comparable_region.domain_b_stop,
)

0 comments on commit 6a07618

Please sign in to comment.