diff --git a/big_scape/cli/cli_common_options.py b/big_scape/cli/cli_common_options.py index 0969b1cf..eae1d843 100644 --- a/big_scape/cli/cli_common_options.py +++ b/big_scape/cli/cli_common_options.py @@ -309,11 +309,11 @@ def common_cluster_query(fn): ), click.option( "--extend_strategy", - type=click.Choice(["legacy", "greedy"]), + type=click.Choice(["legacy", "greedy", "simple_match"]), default="legacy", callback=validate_extend_strategy, help="Strategy to extend BGCs. 'legacy' will use the original BiG-SCAPE extension strategy, " - "while 'greedy' will use a new greedy extension strategy. For an in depth description," + "while 'greedy' or 'simple_match' will use new extension strategies. For an in depth description," " see the wiki. (default: legacy).", ), # networking parameters diff --git a/big_scape/comparison/extend.py b/big_scape/comparison/extend.py index c57a5230..6f4be6cc 100644 --- a/big_scape/comparison/extend.py +++ b/big_scape/comparison/extend.py @@ -592,3 +592,107 @@ def extend_greedy(pair: RecordPair) -> None: logging.debug("after greedy extend:") logging.debug(pair.comparable_region) + + +def extend_simple_match(pair: RecordPair, match, gap): + """Performs extension by first creating a simple match matrix, then + performing a match/gap extentsion similar to legacy expansion + + This method expects LCS to have been performed on the pair, and will + do all four directions at once + + Args: + pair: The record pair to extend + match: The score for a match + gap: The score for a gap + """ + + # I *think* the simplest way to do this is to create a list of domains with their source cds. for reasons. + a_domains = [] + b_domains = [] + + # so we'll do a loop through cds and through domains to keep track of everything + for cds_idx, cds in enumerate(pair.record_a.get_cds_with_domains()): + for domain in cds.hsps: + a_domains.append((domain, cds_idx)) + + for cds_idx, cds in enumerate(pair.record_b.get_cds_with_domains()): + for domain in cds.hsps: + b_domains.append((domain, cds_idx)) + + # get the common domains + common_domains = set([a[0] for a in a_domains]).intersection( + set([b[0] for b in b_domains]) + ) + + # TODO: this is obviously repetitive, but beyond extracting a function I'm + # not sure how to make it better + # for now I'd prefer this to be obvious and clear over looking good + + # a forward + score = 0 + max_score = 0 + for i in range(pair.comparable_region.lcs_domain_a_stop, len(a_domains)): + domain = a_domains[i][0] + + if domain not in common_domains: + score += gap + else: + score += match + + if score > max_score: + cds_idx = a_domains[i][1] + max_score = score + pair.comparable_region.a_stop = cds_idx + 1 + pair.comparable_region.domain_a_stop = i + 1 + + # a reverse + score = 0 + max_score = 0 + for i in range(pair.comparable_region.lcs_domain_a_start - 1, -1, -1): + domain = a_domains[i][0] + + if domain not in common_domains: + score += gap + else: + score += match + + if score > max_score: + cds_idx = a_domains[i][1] + max_score = score + pair.comparable_region.a_start = cds_idx + pair.comparable_region.domain_a_start = i + + # b forward + score = 0 + max_score = 0 + for i in range(pair.comparable_region.lcs_domain_b_stop, len(b_domains)): + domain = b_domains[i][0] + + if domain not in common_domains: + score += gap + else: + score += match + + if score > max_score: + cds_idx = b_domains[i][1] + max_score = score + pair.comparable_region.b_stop = cds_idx + 1 + pair.comparable_region.domain_b_stop = i + 1 + + # b reverse + score = 0 + max_score = 0 + for i in range(pair.comparable_region.lcs_domain_b_start - 1, -1, -1): + domain = b_domains[i][0] + + if domain not in common_domains: + score += gap + else: + score += match + + if score > max_score: + cds_idx = b_domains[i][1] + max_score = score + pair.comparable_region.b_start = cds_idx + pair.comparable_region.domain_b_start = i diff --git a/big_scape/comparison/workflow.py b/big_scape/comparison/workflow.py index c0f378b2..89c97bcc 100644 --- a/big_scape/comparison/workflow.py +++ b/big_scape/comparison/workflow.py @@ -47,6 +47,7 @@ from .extend import ( extend, extend_greedy, + extend_simple_match, reset, len_check, biosynthetic_check, @@ -320,6 +321,13 @@ def expand_pair( if extend_strategy == bs_enums.EXTEND_STRATEGY.GREEDY: extend_greedy(pair) + if extend_strategy == bs_enums.EXTEND_STRATEGY.SIMPLE_MATCH: + extend_simple_match( + pair, + BigscapeConfig.EXPAND_MATCH_SCORE, + BigscapeConfig.EXPAND_GAP_SCORE, + ) + # after local expansion, additionally expand shortest arms in glocal/auto if alignment_mode != bs_enums.ALIGNMENT_MODE.LOCAL: expand_glocal(pair) diff --git a/big_scape/enums/comparison.py b/big_scape/enums/comparison.py index 1747ae46..52dae89f 100644 --- a/big_scape/enums/comparison.py +++ b/big_scape/enums/comparison.py @@ -14,6 +14,7 @@ class ALIGNMENT_MODE(Enum): class EXTEND_STRATEGY(Enum): LEGACY = "legacy" GREEDY = "greedy" + SIMPLE_MATCH = "simple_match" class COMPARISON_MODE(Enum): diff --git a/test/comparison/test_extend.py b/test/comparison/test_extend.py index 556a873f..ba5f152a 100644 --- a/test/comparison/test_extend.py +++ b/test/comparison/test_extend.py @@ -882,3 +882,95 @@ def test_expand_greedy(self): ] self.assertTrue(all(conditions)) + + def test_match_extend(self): + """Tests the new match extend implementation + + This implmentation will be more lenient for domain matches which are + present on the other side of the lcs. Altogether this should be a more relaxed + approach to extension and will result in larger comparable regions + + The following example: + + A: XXXXXCXXABXXXX + B: XXXXABCXXX + LCS: ^^ + + Would fail to extend to C in the legacy extend implementation, but should extend + in the new implementation given a match score of 5 and mismatch of -2 + """ + + a_cds, b_cds = generate_mock_cds_lists(14, 10, [6, 9, 10], [5, 6, 7], False) + record_a = generate_mock_region(a_cds) + record_b = generate_mock_region(b_cds) + pair = big_scape.comparison.record_pair.RecordPair(record_a, record_b) + + # emulate LCS + pair.comparable_region = bs_comp.ComparableRegion( + 9, 11, 6, 8, 9, 11, 6, 8, False + ) + + bs_comp.extend.extend_simple_match(pair, 5, -2) + + expected_comparable_region = bs_comp.ComparableRegion( + 6, 11, 5, 8, 6, 11, 5, 8, False + ) + + self.assertEqual(pair.comparable_region, expected_comparable_region) + self.assertEqual( + pair.comparable_region.domain_a_start, + expected_comparable_region.domain_a_start, + ) + self.assertEqual( + pair.comparable_region.domain_b_start, + expected_comparable_region.domain_b_start, + ) + self.assertEqual( + pair.comparable_region.domain_a_stop, + expected_comparable_region.domain_a_stop, + ) + self.assertEqual( + pair.comparable_region.domain_b_stop, + expected_comparable_region.domain_b_stop, + ) + + def test_expand_simple_match_multi_domain(self): + """Tests glocal expand with multi domain cdss""" + # brackets indicate a cds with multiple domains + # + # A: [XX]BX[A BC]DEX XXXX + # B: XXXXXXXXXX XX A[BC]EDX[XXXX] + # + a_cds, b_cds = generate_mock_cds_lists( + 10, 17, [3, 3, 3, 4, 5], [12, 13, 13, 15, 14], False + ) + a_cds[0].hsps.append(a_cds[0].hsps[0]) + a_cds[1].hsps = [a_cds[3].hsps[1]] + b_cds[-1].hsps.extend([b_cds[-1].hsps[0]] * 3) + record_a = generate_mock_region(a_cds) + record_b = generate_mock_region(b_cds) + pair = big_scape.comparison.record_pair.RecordPair(record_a, record_b) + pair.comparable_region = bs_comp.ComparableRegion( + 3, 4, 12, 14, 4, 7, 12, 15, False + ) + bs_comp.extend.extend_simple_match(pair, 5, -2) + expected_comparable_region = bs_comp.ComparableRegion( + 1, 6, 12, 16, 2, 9, 12, 17, False + ) + self.assertEqual(pair.comparable_region, expected_comparable_region) + self.assertEqual( + pair.comparable_region.domain_a_start, + expected_comparable_region.domain_a_start, + ) + self.assertEqual( + pair.comparable_region.domain_b_start, + expected_comparable_region.domain_b_start, + ) + self.assertEqual( + pair.comparable_region.domain_a_stop, + expected_comparable_region.domain_a_stop, + ) + self.assertEqual( + pair.comparable_region.domain_b_stop, + expected_comparable_region.domain_b_stop, + )