Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement simple match extend #183

Merged
merged 5 commits into from
Oct 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions big_scape/cli/cli_common_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,11 +309,11 @@ def common_cluster_query(fn):
),
click.option(
"--extend_strategy",
type=click.Choice(["legacy", "greedy"]),
type=click.Choice(["legacy", "greedy", "simple_match"]),
default="legacy",
callback=validate_extend_strategy,
help="Strategy to extend BGCs. 'legacy' will use the original BiG-SCAPE extension strategy, "
"while 'greedy' will use a new greedy extension strategy. For an in depth description,"
"while 'greedy' or 'simple_match' will use new extension strategies. For an in depth description,"
" see the wiki. (default: legacy).",
),
# networking parameters
Expand Down
104 changes: 104 additions & 0 deletions big_scape/comparison/extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -592,3 +592,107 @@ def extend_greedy(pair: RecordPair) -> None:

logging.debug("after greedy extend:")
logging.debug(pair.comparable_region)


def extend_simple_match(pair: RecordPair, match, gap):
"""Performs extension by first creating a simple match matrix, then
performing a match/gap extentsion similar to legacy expansion

This method expects LCS to have been performed on the pair, and will
do all four directions at once

Args:
pair: The record pair to extend
match: The score for a match
gap: The score for a gap
"""

# I *think* the simplest way to do this is to create a list of domains with their source cds. for reasons.
a_domains = []
b_domains = []

# so we'll do a loop through cds and through domains to keep track of everything
for cds_idx, cds in enumerate(pair.record_a.get_cds_with_domains()):
for domain in cds.hsps:
a_domains.append((domain, cds_idx))

for cds_idx, cds in enumerate(pair.record_b.get_cds_with_domains()):
for domain in cds.hsps:
b_domains.append((domain, cds_idx))

# get the common domains
common_domains = set([a[0] for a in a_domains]).intersection(
set([b[0] for b in b_domains])
)

# TODO: this is obviously repetitive, but beyond extracting a function I'm
# not sure how to make it better
# for now I'd prefer this to be obvious and clear over looking good

# a forward
score = 0
max_score = 0
for i in range(pair.comparable_region.lcs_domain_a_stop, len(a_domains)):
domain = a_domains[i][0]

if domain not in common_domains:
score += gap
else:
score += match

if score > max_score:
cds_idx = a_domains[i][1]
max_score = score
pair.comparable_region.a_stop = cds_idx + 1
pair.comparable_region.domain_a_stop = i + 1

# a reverse
score = 0
max_score = 0
for i in range(pair.comparable_region.lcs_domain_a_start - 1, -1, -1):
domain = a_domains[i][0]

if domain not in common_domains:
score += gap
else:
score += match

if score > max_score:
cds_idx = a_domains[i][1]
max_score = score
pair.comparable_region.a_start = cds_idx
pair.comparable_region.domain_a_start = i

# b forward
score = 0
max_score = 0
for i in range(pair.comparable_region.lcs_domain_b_stop, len(b_domains)):
domain = b_domains[i][0]

if domain not in common_domains:
score += gap
else:
score += match

if score > max_score:
cds_idx = b_domains[i][1]
max_score = score
pair.comparable_region.b_stop = cds_idx + 1
pair.comparable_region.domain_b_stop = i + 1

# b reverse
score = 0
max_score = 0
for i in range(pair.comparable_region.lcs_domain_b_start - 1, -1, -1):
domain = b_domains[i][0]

if domain not in common_domains:
score += gap
else:
score += match

if score > max_score:
cds_idx = b_domains[i][1]
max_score = score
pair.comparable_region.b_start = cds_idx
pair.comparable_region.domain_b_start = i
8 changes: 8 additions & 0 deletions big_scape/comparison/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
from .extend import (
extend,
extend_greedy,
extend_simple_match,
reset,
len_check,
biosynthetic_check,
Expand Down Expand Up @@ -320,6 +321,13 @@ def expand_pair(
if extend_strategy == bs_enums.EXTEND_STRATEGY.GREEDY:
extend_greedy(pair)

if extend_strategy == bs_enums.EXTEND_STRATEGY.SIMPLE_MATCH:
extend_simple_match(
pair,
BigscapeConfig.EXPAND_MATCH_SCORE,
BigscapeConfig.EXPAND_GAP_SCORE,
)

# after local expansion, additionally expand shortest arms in glocal/auto
if alignment_mode != bs_enums.ALIGNMENT_MODE.LOCAL:
expand_glocal(pair)
Expand Down
1 change: 1 addition & 0 deletions big_scape/enums/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ class ALIGNMENT_MODE(Enum):
class EXTEND_STRATEGY(Enum):
LEGACY = "legacy"
GREEDY = "greedy"
SIMPLE_MATCH = "simple_match"


class COMPARISON_MODE(Enum):
Expand Down
92 changes: 92 additions & 0 deletions test/comparison/test_extend.py
Original file line number Diff line number Diff line change
Expand Up @@ -882,3 +882,95 @@ def test_expand_greedy(self):
]

self.assertTrue(all(conditions))

def test_match_extend(self):
"""Tests the new match extend implementation

This implmentation will be more lenient for domain matches which are
present on the other side of the lcs. Altogether this should be a more relaxed
approach to extension and will result in larger comparable regions

The following example:

A: XXXXXCXXABXXXX
B: XXXXABCXXX
LCS: ^^

Would fail to extend to C in the legacy extend implementation, but should extend
in the new implementation given a match score of 5 and mismatch of -2
"""

a_cds, b_cds = generate_mock_cds_lists(14, 10, [6, 9, 10], [5, 6, 7], False)
record_a = generate_mock_region(a_cds)
record_b = generate_mock_region(b_cds)
pair = big_scape.comparison.record_pair.RecordPair(record_a, record_b)

# emulate LCS
pair.comparable_region = bs_comp.ComparableRegion(
9, 11, 6, 8, 9, 11, 6, 8, False
)

bs_comp.extend.extend_simple_match(pair, 5, -2)

expected_comparable_region = bs_comp.ComparableRegion(
6, 11, 5, 8, 6, 11, 5, 8, False
)

self.assertEqual(pair.comparable_region, expected_comparable_region)
self.assertEqual(
pair.comparable_region.domain_a_start,
expected_comparable_region.domain_a_start,
)
self.assertEqual(
pair.comparable_region.domain_b_start,
expected_comparable_region.domain_b_start,
)
self.assertEqual(
pair.comparable_region.domain_a_stop,
expected_comparable_region.domain_a_stop,
)
self.assertEqual(
pair.comparable_region.domain_b_stop,
expected_comparable_region.domain_b_stop,
)
nlouwen marked this conversation as resolved.
Show resolved Hide resolved

def test_expand_simple_match_multi_domain(self):
"""Tests glocal expand with multi domain cdss"""
# brackets indicate a cds with multiple domains
#
# A: [XX]BX[A BC]DEX XXXX
# B: XXXXXXXXXX XX A[BC]EDX[XXXX]
#
a_cds, b_cds = generate_mock_cds_lists(
10, 17, [3, 3, 3, 4, 5], [12, 13, 13, 15, 14], False
)
a_cds[0].hsps.append(a_cds[0].hsps[0])
a_cds[1].hsps = [a_cds[3].hsps[1]]
b_cds[-1].hsps.extend([b_cds[-1].hsps[0]] * 3)
record_a = generate_mock_region(a_cds)
record_b = generate_mock_region(b_cds)
pair = big_scape.comparison.record_pair.RecordPair(record_a, record_b)
pair.comparable_region = bs_comp.ComparableRegion(
3, 4, 12, 14, 4, 7, 12, 15, False
)
bs_comp.extend.extend_simple_match(pair, 5, -2)
expected_comparable_region = bs_comp.ComparableRegion(
1, 6, 12, 16, 2, 9, 12, 17, False
)
self.assertEqual(pair.comparable_region, expected_comparable_region)
self.assertEqual(
pair.comparable_region.domain_a_start,
expected_comparable_region.domain_a_start,
)
self.assertEqual(
pair.comparable_region.domain_b_start,
expected_comparable_region.domain_b_start,
)
self.assertEqual(
pair.comparable_region.domain_a_stop,
expected_comparable_region.domain_a_stop,
)
self.assertEqual(
pair.comparable_region.domain_b_stop,
expected_comparable_region.domain_b_stop,
)
Loading