Merge pull request #208 from medema-group/hotfix/error-msg
Hotfix/error msg
nlouwen authored Nov 14, 2024
2 parents f31e784 + d37f3cd commit a4e549a
Showing 13 changed files with 122 additions and 58 deletions.
37 changes: 21 additions & 16 deletions big_scape/comparison/lcs.py
@@ -24,6 +24,7 @@
import big_scape.genbank as bs_genbank
import big_scape.comparison.record_pair as bs_comparison
import big_scape.hmm as bs_hmm
from big_scape.diagnostics import DisableLogger
from big_scape.data import DB


@@ -223,11 +224,9 @@ def find_domain_lcs_region(

# quickly check if we didn't find an LCS
if fwd_match_len == 0 and rev_match_len == 0:
logging.error(
"No match found in LCS. This should not happen after first jaccard"
)
logging.error("a domains: %s", a_domains)
logging.error("b domains: %s", b_domains)
logging.error("%s: No match found in LCS", pair)
logging.debug("a domains: %s", a_domains)
logging.debug("b domains: %s", b_domains)
raise RuntimeError("No match found in LCS.")

# now we need to do something silly. we want to assemble a list of these matching
@@ -472,11 +471,9 @@ def find_domain_lcs_protocluster(

# quickly check if we didn't find an LCS
if fwd_match_len == 0 and rev_match_len == 0:
logging.error(
"No match found in LCS. This should not happen after first jaccard"
)
logging.error("a domains: %s", a_domains)
logging.error("b domains: %s", b_domains)
logging.error("%s: No match found in LCS", pair)
logging.debug("a domains: %s", a_domains)
logging.debug("b domains: %s", b_domains)
raise RuntimeError("No match found in LCS.")

# now we need to do something silly. we want to assemble a list of these matching
@@ -693,12 +690,20 @@ def construct_missing_global_lcs(records: list[bs_genbank.BGCRecord], exemplar:
pair = bs_comparison.RecordPair(
record_db_dict[rec_a_id], record_db_dict[rec_b_id]
)
if isinstance(pair.record_a, bs_genbank.ProtoCluster) and isinstance(
pair.record_b, bs_genbank.ProtoCluster
):
lcs_data = find_domain_lcs_protocluster(pair)
else:
lcs_data = find_domain_lcs_region(pair)

# try to find an lcs. if no lcs is present this record has no domain overlap
# with the exemplar, meaning it will be removed from the tree anyways.
try:
# we don't really care about error logs at this point
with DisableLogger():
if isinstance(pair.record_a, bs_genbank.ProtoCluster) and isinstance(
pair.record_b, bs_genbank.ProtoCluster
):
lcs_data = find_domain_lcs_protocluster(pair)
else:
lcs_data = find_domain_lcs_region(pair)
except RuntimeError:
continue

DB.execute(
update(distance_table)
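
A note on the lcs.py hunks above: the error log now names the offending record pair, the full domain dumps are demoted to debug level, and construct_missing_global_lcs tolerates pairs that have no LCS at all. A minimal sketch of that caller-side pattern (pairs_missing_lcs is a hypothetical iterable of RecordPair objects):

    for pair in pairs_missing_lcs:  # hypothetical iterable of record pairs
        try:
            # failures are expected here, so mute the error logs while probing
            with DisableLogger():
                lcs_data = find_domain_lcs_region(pair)
        except RuntimeError:
            # no LCS means no domain overlap with the exemplar; the record will
            # be dropped from the tree anyway, so the pair is simply skipped
            continue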
10 changes: 5 additions & 5 deletions big_scape/data/sqlite.py
@@ -245,11 +245,6 @@ def load_from_disk(db_path: Path) -> None:
+ str(type(raw_memory_connection))
)

# backup only writes those tables that have data, it seems
DB.create_tables()

DB.reflect()

page_count = raw_file_connection.execute("PRAGMA page_count;")
page_count = page_count.fetchone()[0]

@@ -260,6 +255,11 @@ def progress(status, remaining, total):

raw_file_connection.backup(raw_memory_connection, progress=progress)

# backup only writes those tables that have data, it seems
DB.create_tables()

DB.reflect()

@staticmethod
def close_db() -> None:
"""Closes the database connection. This does not save the database to disk"""
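
The sqlite.py change is purely about ordering: as the in-line comment notes, the backup appears to write only tables that contain data, so creating the remaining (empty) tables and reflecting the schema has to wait until the copy into the in-memory connection has finished. A rough sketch of the underlying stdlib mechanism (file paths are illustrative):

    import sqlite3

    file_conn = sqlite3.connect("output/data_sqlite.db")  # illustrative path
    mem_conn = sqlite3.connect(":memory:")

    def progress(status, remaining, total):
        # invoked by sqlite3 between batches of copied pages
        print(f"copied {total - remaining}/{total} pages")

    # copy the on-disk database into memory page by page
    file_conn.backup(mem_conn, progress=progress)

    # only after the backup: create any still-missing tables and reflect the
    # schema (DB.create_tables() / DB.reflect() in the diff above)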
10 changes: 8 additions & 2 deletions big_scape/diagnostics/__init__.py
@@ -1,6 +1,12 @@
"""Contains modules for diagnostics"""

from .logger import init_logger, init_logger_file
from .logger import init_logger, init_logger_file, DisableLogger
from .profiler import Profiler, calc_cpu_percent

__all__ = ["init_logger", "init_logger_file", "Profiler", "calc_cpu_percent"]
__all__ = [
"init_logger",
"init_logger_file",
"DisableLogger",
"Profiler",
"calc_cpu_percent",
]
11 changes: 11 additions & 0 deletions big_scape/diagnostics/logger.py
@@ -1,6 +1,7 @@
"""Module containing code to handle logging"""

# from python
from contextlib import AbstractContextManager
import logging
import sys

@@ -38,3 +39,13 @@ def init_logger_file(run) -> None: # pragma: no cover
file_handler.setFormatter(log_formatter)
root_logger.addHandler(file_handler)
logging.info(" ".join(sys.argv))


class DisableLogger(AbstractContextManager):
"""Context manager to temporarily disable logs"""

def __enter__(self) -> None:
logging.disable(logging.CRITICAL)

def __exit__(self, exc_type, exc_value, traceback) -> None:
logging.disable(logging.NOTSET)
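
DisableLogger is a small context manager around logging.disable(): on entry everything up to and including CRITICAL is suppressed, on exit the threshold is reset to NOTSET. A minimal usage sketch:

    import logging

    logging.error("this message is emitted")

    with DisableLogger():
        logging.error("this one is suppressed")
        logging.critical("and so is this")

    logging.error("normal logging resumes here")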
2 changes: 1 addition & 1 deletion big_scape/file_input/load_files.py
@@ -280,7 +280,7 @@ def load_gbk(
"""

if not path.is_file():
logging.error("GBK path does not point to a file!")
logging.error("%s: GBK path does not point to a file!", path)
raise IsADirectoryError()

return GBK.parse(path, source_type, run, cds_overlap_cutoff)
19 changes: 14 additions & 5 deletions big_scape/genbank/candidate_cluster.py
@@ -102,7 +102,7 @@ def save_all(self, parent_id: int) -> None:
def parse(
cls, feature: SeqFeature, parent_gbk: Optional[GBK] = None
) -> CandidateCluster:
"""_summary_Creates a cand_cluster object from a region feature in a GBK file
"""Creates a cand_cluster object from a region feature in a GBK file
Args:
feature (SeqFeature): cand_cluster GBK feature
@@ -113,29 +113,38 @@
Returns:
CandidateCluster: Candidate cluster object
"""
err_path = parent_gbk.path if parent_gbk else ""

if feature.type != "cand_cluster":
logging.error(
"Feature is not of correct type! (expected: cand_cluster, was: %s)",
"%s: feature is not of correct type! (expected: cand_cluster, was: %s)",
err_path,
feature.type,
)
raise InvalidGBKError()

if "candidate_cluster_number" not in feature.qualifiers:
logging.error(
"candidate_cluster_number qualifier not found in cand_cluster feature!"
"%s: candidate_cluster_number qualifier not found in cand_cluster feature!",
err_path,
)
raise InvalidGBKError()

cand_cluster_number = int(feature.qualifiers["candidate_cluster_number"][0])

if "kind" not in feature.qualifiers:
logging.error("kind qualifier not found in cand_cluster feature!")
logging.error(
"%s: kind qualifier not found in cand_cluster feature!", err_path
)
raise InvalidGBKError()

cand_cluster_kind = feature.qualifiers["kind"][0]

if "protoclusters" not in feature.qualifiers:
logging.error("protoclusters qualifier not found in region feature!")
logging.error(
"%s: protoclusters qualifier not found in cand_cluster feature!",
err_path,
)
raise InvalidGBKError()

proto_clusters: dict[int, Optional[ProtoCluster]] = {}
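
The candidate_cluster.py hunks show the pattern repeated across all of the feature parsers in this commit: resolve a best-effort path up front and prefix every error message with it. Sketched in isolation (parse_feature is a hypothetical stand-in for the various parse classmethods):

    def parse_feature(feature, parent_gbk=None):
        # fall back to an empty string when the feature has no parent GBK
        err_path = parent_gbk.path if parent_gbk else ""

        if feature.type != "cand_cluster":
            logging.error(
                "%s: feature is not of correct type! (expected: cand_cluster, was: %s)",
                err_path,
                feature.type,
            )
            raise InvalidGBKError()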
34 changes: 21 additions & 13 deletions big_scape/genbank/gbk.py
@@ -354,7 +354,14 @@ def parse(
gbk = cls(path, hash, source_type)

# get record. should only ever be one for Antismash GBK
record: SeqRecord = next(SeqIO.parse(path, "genbank"))
records = list(SeqIO.parse(path, "genbank"))
if len(records) > 1:
logging.warning(
"%s: GBK contains multiple sequence records! "
"Only the first will be considered.",
path,
)
record: SeqRecord = records.pop(0)
gbk.nt_seq = record.seq

gbk.metadata["description"] = record.description
@@ -470,14 +477,15 @@ def parse_as4(
# If no cluster feature was found and force-gbk is false, GBK is invalid
if not force_gbk:
logging.error(
"%s: GBK file does not contain an antiSMASH cluster feature", self.path
"%s: GBK file does not contain an antiSMASH cluster or region feature. "
"Consider using --force-gbk to include this GBK anyways.",
self.path,
)
raise InvalidGBKError()

# at this point we need to make a region object from the whole GBK
logging.warning(
"%s: GBK file does not contain an antiSMASH region feature. "
"Using --force_gbk, assuming AS4",
"%s: non-antiSMASH GBK file detected, forcing artificial region feature.",
self.path,
)

@@ -627,15 +635,15 @@ def collapse_hybrids_in_cand_clusters(
for number in cand_cluster.proto_clusters.keys()
]
merged_protocluster = MergedProtoCluster.merge(protoclusters)
merged_tmp_proto_clusters[merged_protocluster.number] = (
merged_protocluster
)
merged_tmp_proto_clusters[
merged_protocluster.number
] = merged_protocluster

# update the protocluster old:new ids for the merged protoclusters of this cand_cluster
for proto_cluster_num in cand_cluster.proto_clusters.keys():
merged_protocluster_ids[proto_cluster_num] = (
merged_protocluster.number
)
merged_protocluster_ids[
proto_cluster_num
] = merged_protocluster.number

# now we build a new version of the tmp_proto_clusters dict that contains the merged protoclusters
# as well as protoclusters which did not need merging, with updated unique IDs/numbers
@@ -649,9 +657,9 @@
# this protocluster has been merged, so we need to add it to
# the dict with its new protocluster number
new_proto_cluster_num = merged_protocluster_ids[proto_cluster_num]
updated_tmp_proto_clusters[new_proto_cluster_num] = (
merged_tmp_proto_clusters[new_proto_cluster_num]
)
updated_tmp_proto_clusters[
new_proto_cluster_num
] = merged_tmp_proto_clusters[new_proto_cluster_num]
updated_proto_cluster_dict[new_proto_cluster_num] = None
else:
# protoclusters which have not been merged are added to the dict as is
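
For the gbk.py record-handling hunk, the relevant detail is that Biopython's SeqIO.parse yields every sequence record in the file. The old code silently took the first with next(); the new code materialises the list, warns when there is more than one, and still keeps only the first. A rough standalone equivalent (the file name is hypothetical):

    import logging
    from Bio import SeqIO

    path = "example_region.gbk"  # hypothetical antiSMASH GBK

    records = list(SeqIO.parse(path, "genbank"))
    if len(records) > 1:
        logging.warning(
            "%s: GBK contains multiple sequence records! "
            "Only the first will be considered.",
            path,
        )
    record = records.pop(0)  # keep only the first record, as before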
8 changes: 6 additions & 2 deletions big_scape/genbank/proto_cluster.py
@@ -136,16 +136,20 @@ def parse(
Returns:
ProtoCluster: protocluster object
"""
err_path = parent_gbk.path if parent_gbk else ""

if feature.type != "protocluster":
logging.error(
"Feature is not of correct type! (expected: protocluster, was: %s)",
"%s: feature is not of correct type! (expected: protocluster, was: %s)",
err_path,
feature.type,
)
raise InvalidGBKError()

if "protocluster_number" not in feature.qualifiers:
logging.error(
"protocluster_number qualifier not found in protocluster feature!"
"%s: protocluster_number qualifier not found in protocluster feature!",
err_path,
)
raise InvalidGBKError()

8 changes: 6 additions & 2 deletions big_scape/genbank/proto_core.py
@@ -78,16 +78,20 @@ def parse(cls, feature: SeqFeature, parent_gbk: Optional[GBK] = None) -> ProtoCo
Returns:
ProtoCore: protocore object
"""
err_path = parent_gbk.path if parent_gbk else ""

if feature.type != "proto_core":
logging.error(
"Feature is not of correct type! (expected: proto_core, was: %s)",
"%s: feature is not of correct type! (expected: proto_core, was: %s)",
err_path,
feature.type,
)
raise InvalidGBKError()

if "protocluster_number" not in feature.qualifiers:
logging.error(
"protocluster_number qualifier not found in proto_core feature!"
"%s: protocluster_number qualifier not found in proto_core feature!",
err_path,
)
raise InvalidGBKError()

28 changes: 20 additions & 8 deletions big_scape/genbank/region.py
@@ -109,9 +109,12 @@ def parse_as5(cls, feature: SeqFeature, parent_gbk: Optional[GBK] = None) -> Reg
Returns:
Region: region object
"""
err_path = parent_gbk.path if parent_gbk else ""

if feature.type != "region":
logging.error(
"Feature is not of correct type! (expected: region, was: %s)",
"%s: feature is not of correct type! (expected: region, was: %s)",
err_path,
feature.type,
)
raise InvalidGBKError()
@@ -120,7 +123,10 @@
# children classes (protocluster, protocore)

if "region_number" not in feature.qualifiers:
logging.error("region number qualifier not found in region feature!")
logging.error(
"%s: region number qualifier not found in region feature!",
err_path,
)
raise InvalidGBKError()

region_number = int(feature.qualifiers["region_number"][0])
@@ -147,8 +153,9 @@
return region

logging.error(
"candidate_cluster_numbers qualifier not found in region feature! "
"Consider checking whether there is something special about this gbk"
"%s: candidate_cluster_numbers qualifier not found in region feature! "
"Consider checking whether there is something special about this gbk",
err_path,
)
raise InvalidGBKError()

@@ -172,9 +179,12 @@ def parse_as4(cls, feature: SeqFeature, parent_gbk: Optional[GBK] = None) -> Reg
Returns:
Region: region object
"""
err_path = parent_gbk.path if parent_gbk else ""

if feature.type != "cluster":
logging.error(
"Feature is not of correct type! (expected: cluster, was: %s)",
"%s: feature is not of correct type! (expected: cluster, was: %s)",
err_path,
feature.type,
)
raise InvalidGBKError()
@@ -186,7 +196,9 @@
"note" not in feature.qualifiers
or "Cluster number" not in feature.qualifiers["note"][0]
):
logging.error("cluster number qualifier not found in cluster feature!")
logging.error(
"%s: Cluster number qualifier not found in cluster feature!", err_path
)
raise InvalidGBKError()

cluster_note_number = feature.qualifiers["note"][0]
@@ -221,8 +233,8 @@ def parse_full_region(cls, record: SeqRecord, parent_gbk: GBK) -> Region:
Returns:
Region: region object
"""
nt_start = 1
nt_stop = len(record.seq) // 3
nt_start = 0
nt_stop = len(record.seq)
contig_edge = False

# record may have multiple products. handle them here
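
The parse_full_region coordinate fix is easy to miss: the old values (start 1, length divided by three) look like a leftover amino-acid-length calculation, while an artificial region covering the whole record should simply span the nucleotide sequence. In short:

    # full-length artificial region in nucleotide coordinates
    nt_start = 0
    nt_stop = len(record.seq)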
5 changes: 4 additions & 1 deletion test/comparison/test_lcs.py
@@ -4,9 +4,10 @@
import unittest
import random
import string
import big_scape.comparison.record_pair
from pathlib import Path

# from other modules
import big_scape.comparison.record_pair
import big_scape.genbank as bs_genbank
import big_scape.hmm as bs_hmmer
import big_scape.comparison as bs_comparison
@@ -145,7 +146,9 @@ def test_lcs_domains_empty(self):
cds_a, cds_b = generate_mock_cds_lists(0, 0, [], [], False)

pc_a = generate_mock_protocluster(cds_a)
pc_a.parent_gbk = bs_genbank.GBK(Path("test1"), "", "")
pc_b = generate_mock_protocluster(cds_b)
pc_b.parent_gbk = bs_genbank.GBK(Path("test2"), "", "")
pair = big_scape.comparison.record_pair.RecordPair(pc_a, pc_b)

with self.assertRaises(RuntimeError):