From 16c8ccc894797cac8d225aff6236806b1cc001d0 Mon Sep 17 00:00:00 2001
From: Arjan Draisma
Date: Mon, 7 Oct 2024 15:25:00 +0200
Subject: [PATCH 1/3] re-implement (?) disk-only database

---
 big_scape/cli/cli_common_options.py |  9 +++++++++
 big_scape/data/sqlite.py            | 30 +++++++++++++++++++++++++++++
 big_scape/run_bigscape.py           |  8 +++++++-
 3 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/big_scape/cli/cli_common_options.py b/big_scape/cli/cli_common_options.py
index 5bbaf4db..40e1a755 100644
--- a/big_scape/cli/cli_common_options.py
+++ b/big_scape/cli/cli_common_options.py
@@ -103,6 +103,15 @@ def common_all(fn):
             "but in case of a crashed run no info will be stored and you'll have to "
             "re-start the run from scratch",
         ),
+        click.option(
+            "--disk-only",
+            type=bool,
+            is_flag=True,
+            default=False,
+            help="Do not store any results in memory, only on disk. This is almost certainly "
+            "slower than the default behaviour, but can be useful for very large runs or "
+            "runs with limited memory.",
+        ),
         click.option(
             "--no-interactive",
             type=bool,
diff --git a/big_scape/data/sqlite.py b/big_scape/data/sqlite.py
index 24163b63..4958d2c5 100644
--- a/big_scape/data/sqlite.py
+++ b/big_scape/data/sqlite.py
@@ -96,6 +96,26 @@ def open_memory_connection() -> None:
         )
         DB.connection = DB.engine.connect()
 
+    def open_disk_connection(db_path: Path) -> None:
+        if DB.opened():
+            raise DBAlreadyOpenError()
+
+        DB.engine = create_engine(
+            "sqlite:///" + str(db_path),
+            connect_args={"check_same_thread": False},
+            poolclass=StaticPool,
+        )
+        DB.connection = DB.engine.connect()
+
+    @staticmethod
+    def create_on_disk(db_path: Path) -> None:
+        """Open a connection to a database file"""
+        DB.open_disk_connection(db_path)
+
+        DB.create_tables()
+
+        DB.reflect()
+
     @staticmethod
     def create_in_mem() -> None:
         """Create a new database in-memory"""
@@ -118,6 +138,10 @@ def save_to_disk(db_path: Path) -> None:
         if click_context and click_context.obj["no_db_dump"]:
             return
 
+        # skip this if we are using disk-only mode
+        if click_context and click_context.obj["disk_only"]:
+            return
+
         if not DB.opened():
             raise DBClosedError()
 
@@ -175,6 +199,12 @@ def load_from_disk(db_path: Path) -> None:
         if not db_path.exists():
            raise FileNotFoundError()
 
+        # disk only means we don't have to dump to memory
+        click_context = click.get_current_context(silent=True)
+        if click_context and click_context.obj["disk_only"]:
+            DB.create_on_disk(db_path)
+            return
+
         file_engine = create_engine("sqlite:///" + str(db_path))
         file_engine.connect()
 
diff --git a/big_scape/run_bigscape.py b/big_scape/run_bigscape.py
index 77483fa2..f67de1ad 100644
--- a/big_scape/run_bigscape.py
+++ b/big_scape/run_bigscape.py
@@ -111,8 +111,14 @@ def signal_handler(sig, frame):
 
     # INPUT - create an in memory bs_data.DB or load from disk
     if not run["db_path"].exists():
-        bs_data.DB.create_in_mem()
+        if run["disk_only"]:
+            logging.info("Creating on disk database")
+            bs_data.DB.create_on_disk(run["db_path"])
+        else:
+            logging.info("Creating in memory database")
+            bs_data.DB.create_in_mem()
     else:
+        logging.info("Loading database from disk")
         bs_data.DB.load_from_disk(run["db_path"])
 
     bs_data.DB.check_config_hash()

From 510713eb4e5b6acd56cd6e753a8580045f40e8ec Mon Sep 17 00:00:00 2001
From: Arjan Draisma
Date: Mon, 7 Oct 2024 15:25:00 +0200
Subject: [PATCH 2/3] re-implement (?) disk-only database

---
 big_scape/cli/cli_common_options.py |  9 +++++++++
 big_scape/data/sqlite.py            | 30 +++++++++++++++++++++++++++++
 big_scape/run_bigscape.py           |  8 +++++++-
 3 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/big_scape/cli/cli_common_options.py b/big_scape/cli/cli_common_options.py
index 5bbaf4db..40e1a755 100644
--- a/big_scape/cli/cli_common_options.py
+++ b/big_scape/cli/cli_common_options.py
@@ -103,6 +103,15 @@ def common_all(fn):
             "but in case of a crashed run no info will be stored and you'll have to "
             "re-start the run from scratch",
         ),
+        click.option(
+            "--disk-only",
+            type=bool,
+            is_flag=True,
+            default=False,
+            help="Do not store any results in memory, only on disk. This is almost certainly "
+            "slower than the default behaviour, but can be useful for very large runs or "
+            "runs with limited memory.",
+        ),
         click.option(
             "--no-interactive",
             type=bool,
diff --git a/big_scape/data/sqlite.py b/big_scape/data/sqlite.py
index efb192f4..38fdca2b 100644
--- a/big_scape/data/sqlite.py
+++ b/big_scape/data/sqlite.py
@@ -96,6 +96,26 @@ def open_memory_connection() -> None:
         )
         DB.connection = DB.engine.connect()
 
+    def open_disk_connection(db_path: Path) -> None:
+        if DB.opened():
+            raise DBAlreadyOpenError()
+
+        DB.engine = create_engine(
+            "sqlite:///" + str(db_path),
+            connect_args={"check_same_thread": False},
+            poolclass=StaticPool,
+        )
+        DB.connection = DB.engine.connect()
+
+    @staticmethod
+    def create_on_disk(db_path: Path) -> None:
+        """Open a connection to a database file"""
+        DB.open_disk_connection(db_path)
+
+        DB.create_tables()
+
+        DB.reflect()
+
     @staticmethod
     def create_in_mem() -> None:
         """Create a new database in-memory"""
@@ -124,6 +144,10 @@ def save_to_disk(db_path: Path, force=False) -> None:
             if not force:
                 return
 
+        # skip this if we are using disk-only mode
+        if click_context and click_context.obj["disk_only"]:
+            return
+
         if not DB.opened():
             raise DBClosedError()
 
@@ -181,6 +205,12 @@ def load_from_disk(db_path: Path) -> None:
         if not db_path.exists():
             raise FileNotFoundError()
 
+        # disk only means we don't have to dump to memory
+        click_context = click.get_current_context(silent=True)
+        if click_context and click_context.obj["disk_only"]:
+            DB.create_on_disk(db_path)
+            return
+
         file_engine = create_engine("sqlite:///" + str(db_path))
         file_engine.connect()
 
diff --git a/big_scape/run_bigscape.py b/big_scape/run_bigscape.py
index ddf78a3e..e6780623 100644
--- a/big_scape/run_bigscape.py
+++ b/big_scape/run_bigscape.py
@@ -111,8 +111,14 @@ def signal_handler(sig, frame):
 
     # INPUT - create an in memory bs_data.DB or load from disk
     if not run["db_path"].exists():
-        bs_data.DB.create_in_mem()
+        if run["disk_only"]:
+            logging.info("Creating on disk database")
+            bs_data.DB.create_on_disk(run["db_path"])
+        else:
+            logging.info("Creating in memory database")
+            bs_data.DB.create_in_mem()
     else:
+        logging.info("Loading database from disk")
         bs_data.DB.load_from_disk(run["db_path"])
 
     bs_data.DB.check_config_hash()

From b4a70b7fcccf5577412a38f834237219253eff63 Mon Sep 17 00:00:00 2001
From: Arjan Draisma
Date: Thu, 10 Oct 2024 16:48:19 +0200
Subject: [PATCH 3/3] add cli validation

---
 big_scape/cli/benchmark_cli.py      |  7 ++++++-
 big_scape/cli/cli_common_options.py |  1 +
 big_scape/cli/cli_validations.py    | 19 +++++++++++++++++++
 big_scape/cli/cluster_cli.py        |  2 ++
 big_scape/cli/query_cli.py          |  2 ++
 5 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/big_scape/cli/benchmark_cli.py b/big_scape/cli/benchmark_cli.py
index 30bd87be..b4d7500a 100644
--- a/big_scape/cli/benchmark_cli.py
+++ b/big_scape/cli/benchmark_cli.py
@@ -10,7 +10,11 @@
 
 # from this module
 from .cli_common_options import common_all
-from .cli_validations import set_start, validate_output_paths
+from .cli_validations import (
+    set_start,
+    validate_output_paths,
+    validate_disk_only,
+)
 
 
 # BiG-SCAPE benchmark mode
@@ -49,6 +53,7 @@ def benchmark(ctx, *args, **kwargs):
 
     # workflow validations
     validate_output_paths(ctx)
+    validate_disk_only(ctx)
 
     # set start time and label
     set_start(ctx.obj)
diff --git a/big_scape/cli/cli_common_options.py b/big_scape/cli/cli_common_options.py
index 40e1a755..c4f8bef8 100644
--- a/big_scape/cli/cli_common_options.py
+++ b/big_scape/cli/cli_common_options.py
@@ -345,6 +345,7 @@ def common_cluster_query(fn):
             help="Use a specific type of antiSMASH record for comparison. (default: region).",
         ),
     ]
+
     for opt in options[::-1]:
         fn = opt(fn)
     return fn
diff --git a/big_scape/cli/cli_validations.py b/big_scape/cli/cli_validations.py
index 796ce8bc..7c041c1c 100644
--- a/big_scape/cli/cli_validations.py
+++ b/big_scape/cli/cli_validations.py
@@ -128,6 +128,25 @@ def validate_output_paths(ctx) -> None:
     return None
 
 
+# db modes validations
+def validate_disk_only(ctx) -> None:
+    """Checks if the database storage/dumping modes that were set are compatible"""
+
+    if not ("no_db_dump" in ctx.obj and "disk_only" in ctx.obj):
+        raise RuntimeError(
+            "Something went wrong with the database storage/dumping mode parameters. "
+            "Please contact the developers."
+        )
+
+    if ctx.obj["no_db_dump"] and ctx.obj["disk_only"]:
+        logging.error(
+            "You have selected both --no-db-dump and --disk-only. Please select only one"
+        )
+        raise click.UsageError(
+            "You have selected both --no-db-dump and --disk-only. Please select only one"
+        )
+
+
 # comparison validations
 
 
diff --git a/big_scape/cli/cluster_cli.py b/big_scape/cli/cluster_cli.py
index 3da9b0c7..1ef0409e 100644
--- a/big_scape/cli/cluster_cli.py
+++ b/big_scape/cli/cluster_cli.py
@@ -11,6 +11,7 @@
 from .cli_common_options import common_all, common_cluster_query
 from .cli_validations import (
     validate_output_paths,
+    validate_disk_only,
     validate_binning_cluster_workflow,
     validate_pfam_path,
     validate_domain_include_list,
@@ -77,6 +78,7 @@ def cluster(ctx, *args, **kwargs):
     validate_pfam_path(ctx)
     validate_domain_include_list(ctx)
     validate_output_paths(ctx)
+    validate_disk_only(ctx)
 
     # set start time and run label
     set_start(ctx.obj)
diff --git a/big_scape/cli/query_cli.py b/big_scape/cli/query_cli.py
index 011d9901..b3c7e82c 100644
--- a/big_scape/cli/query_cli.py
+++ b/big_scape/cli/query_cli.py
@@ -12,6 +12,7 @@
 from .cli_common_options import common_all, common_cluster_query
 from .cli_validations import (
     validate_output_paths,
+    validate_disk_only,
     validate_query_bgc,
     validate_pfam_path,
     set_start,
@@ -81,6 +82,7 @@ def query(ctx, *args, **kwarg):
     validate_output_paths(ctx)
     validate_binning_query_workflow(ctx)
     validate_query_record(ctx)
+    validate_disk_only(ctx)
 
     # set start time and label
     set_start(ctx.obj)
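
Note (illustrative only, not part of the patch series): a minimal sketch of how the new
validate_disk_only check could be exercised once the series is applied. The test names and
the SimpleNamespace stand-in for a click context are assumptions made for this sketch; only
validate_disk_only, click.UsageError, and the "no_db_dump"/"disk_only" keys come from the
patches above, and a pytest environment is assumed to be available.

    # sketch only: exercises the mutual-exclusion check added in cli_validations.py
    from types import SimpleNamespace

    import click
    import pytest

    from big_scape.cli.cli_validations import validate_disk_only


    def test_disk_only_conflicts_with_no_db_dump():
        # both flags set: the validation should refuse the combination
        ctx = SimpleNamespace(obj={"no_db_dump": True, "disk_only": True})
        with pytest.raises(click.UsageError):
            validate_disk_only(ctx)


    def test_disk_only_alone_is_accepted():
        # only --disk-only set: the validation should pass silently
        ctx = SimpleNamespace(obj={"no_db_dump": False, "disk_only": True})
        validate_disk_only(ctx)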