diff --git a/big_scape/cli/benchmark_cli.py b/big_scape/cli/benchmark_cli.py index 30bd87be..b4d7500a 100644 --- a/big_scape/cli/benchmark_cli.py +++ b/big_scape/cli/benchmark_cli.py @@ -10,7 +10,11 @@ # from this module from .cli_common_options import common_all -from .cli_validations import set_start, validate_output_paths +from .cli_validations import ( + set_start, + validate_output_paths, + validate_disk_only, +) # BiG-SCAPE benchmark mode @@ -49,6 +53,7 @@ def benchmark(ctx, *args, **kwargs): # workflow validations validate_output_paths(ctx) + validate_disk_only(ctx) # set start time and label set_start(ctx.obj) diff --git a/big_scape/cli/cli_common_options.py b/big_scape/cli/cli_common_options.py index 5bbaf4db..c4f8bef8 100644 --- a/big_scape/cli/cli_common_options.py +++ b/big_scape/cli/cli_common_options.py @@ -103,6 +103,15 @@ def common_all(fn): "but in case of a crashed run no info will be stored and you'll have to " "re-start the run from scratch", ), + click.option( + "--disk-only", + type=bool, + is_flag=True, + default=False, + help="Do not store any results in memory, only on disk. This is almost certainly " + "slower than the default behaviour, but can be useful for very large runs or " + "runs with limited memory.", + ), click.option( "--no-interactive", type=bool, @@ -336,6 +345,7 @@ def common_cluster_query(fn): help="Use a specific type of antiSMASH record for comparison. 
(default: region).", ), ] + for opt in options[::-1]: fn = opt(fn) return fn diff --git a/big_scape/cli/cli_validations.py b/big_scape/cli/cli_validations.py index 796ce8bc..7c041c1c 100644 --- a/big_scape/cli/cli_validations.py +++ b/big_scape/cli/cli_validations.py @@ -128,6 +128,25 @@ def validate_output_paths(ctx) -> None: return None +# db modes validations +def validate_disk_only(ctx) -> None: + """Checks if the database storage/dumping modes that were set are compatible""" + + if not ("no_db_dump" in ctx.obj and "disk_only" in ctx.obj): + raise RuntimeError( + "Something went wrong with the database storage/dumping mode parameters. " + "Please contact the developers." + ) + + if ctx.obj["no_db_dump"] and ctx.obj["disk_only"]: + logging.error( + "You have selected both --no-db-dump and --disk-only. Please select only one" + ) + raise click.UsageError( + "You have selected both --no-db-dump and --disk-only. Please select only one" + ) + + # comparison validations diff --git a/big_scape/cli/cluster_cli.py b/big_scape/cli/cluster_cli.py index 3da9b0c7..1ef0409e 100644 --- a/big_scape/cli/cluster_cli.py +++ b/big_scape/cli/cluster_cli.py @@ -11,6 +11,7 @@ from .cli_common_options import common_all, common_cluster_query from .cli_validations import ( validate_output_paths, + validate_disk_only, validate_binning_cluster_workflow, validate_pfam_path, validate_domain_include_list, @@ -77,6 +78,7 @@ def cluster(ctx, *args, **kwargs): validate_pfam_path(ctx) validate_domain_include_list(ctx) validate_output_paths(ctx) + validate_disk_only(ctx) # set start time and run label set_start(ctx.obj) diff --git a/big_scape/cli/query_cli.py b/big_scape/cli/query_cli.py index 011d9901..b3c7e82c 100644 --- a/big_scape/cli/query_cli.py +++ b/big_scape/cli/query_cli.py @@ -12,6 +12,7 @@ from .cli_common_options import common_all, common_cluster_query from .cli_validations import ( validate_output_paths, + validate_disk_only, validate_query_bgc, validate_pfam_path, set_start, @@ 
-81,6 +82,7 @@ def query(ctx, *args, **kwarg): validate_output_paths(ctx) validate_binning_query_workflow(ctx) validate_query_record(ctx) + validate_disk_only(ctx) # set start time and label set_start(ctx.obj) diff --git a/big_scape/data/sqlite.py b/big_scape/data/sqlite.py index efb192f4..f2b08bdd 100644 --- a/big_scape/data/sqlite.py +++ b/big_scape/data/sqlite.py @@ -96,6 +96,27 @@ def open_memory_connection() -> None: ) DB.connection = DB.engine.connect() + @staticmethod + def open_disk_connection(db_path: Path) -> None: + if DB.opened(): + raise DBAlreadyOpenError() + + DB.engine = create_engine( + "sqlite:///" + str(db_path), + connect_args={"check_same_thread": False}, + poolclass=StaticPool, + ) + DB.connection = DB.engine.connect() + + @staticmethod + def create_on_disk(db_path: Path) -> None: + """Open a connection to a database file""" + DB.open_disk_connection(db_path) + + DB.create_tables() + + DB.reflect() + @staticmethod def create_in_mem() -> None: """Create a new database in-memory""" @@ -124,6 +145,10 @@ def save_to_disk(db_path: Path, force=False) -> None: if not force: return + # skip this if we are using disk-only mode + if click_context and click_context.obj["disk_only"]: + return + if not DB.opened(): raise DBClosedError() @@ -181,6 +206,12 @@ def load_from_disk(db_path: Path) -> None: if not db_path.exists(): raise FileNotFoundError() + # disk only means we don't have to dump to memory + click_context = click.get_current_context(silent=True) + if click_context and click_context.obj["disk_only"]: + DB.create_on_disk(db_path) + return + file_engine = create_engine("sqlite:///" + str(db_path)) file_engine.connect() diff --git a/big_scape/run_bigscape.py b/big_scape/run_bigscape.py index ddf78a3e..e6780623 100644 --- a/big_scape/run_bigscape.py +++ b/big_scape/run_bigscape.py @@ -111,8 +111,14 @@ def signal_handler(sig, frame): # INPUT - create an in
memory bs_data.DB or load from disk if not run["db_path"].exists(): - bs_data.DB.create_in_mem() + if run["disk_only"]: + logging.info("Creating on disk database") + bs_data.DB.create_on_disk(run["db_path"]) + else: + logging.info("Creating in memory database") + bs_data.DB.create_in_mem() else: + logging.info("Loading database from disk") bs_data.DB.load_from_disk(run["db_path"]) bs_data.DB.check_config_hash()