re-implement (?) disk-only database #191

Merged: 6 commits, Oct 14, 2024
7 changes: 6 additions & 1 deletion big_scape/cli/benchmark_cli.py
@@ -10,7 +10,11 @@

# from this module
from .cli_common_options import common_all
from .cli_validations import set_start, validate_output_paths
from .cli_validations import (
    set_start,
    validate_output_paths,
    validate_disk_only,
)


# BiG-SCAPE benchmark mode
@@ -49,6 +53,7 @@ def benchmark(ctx, *args, **kwargs):

    # workflow validations
    validate_output_paths(ctx)
    validate_disk_only(ctx)

    # set start time and label
    set_start(ctx.obj)
10 changes: 10 additions & 0 deletions big_scape/cli/cli_common_options.py
@@ -103,6 +103,15 @@ def common_all(fn):
"but in case of a crashed run no info will be stored and you'll have to "
"re-start the run from scratch",
),
click.option(
"--disk-only",
type=bool,
is_flag=True,
default=False,
help="Do not store any results in memory, only on disk. This is almost certainly "
"slower than the default behaviour, but can be useful for very large runs or "
"runs with limited memory.",
),
click.option(
"--no-interactive",
type=bool,
@@ -336,6 +345,7 @@ def common_cluster_query(fn):
help="Use a specific type of antiSMASH record for comparison. (default: region).",
),
]

for opt in options[::-1]:
fn = opt(fn)
return fn
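The `for opt in options[::-1]` loop above is the standard click decorator-stacking pattern: applying the options in reverse keeps them in declaration order in `--help`. A minimal standalone sketch of how the shared flags (including the new `--disk-only`) land on a command; the `demo` command and the reduced option list are illustrative, not BiG-SCAPE's actual CLI:

```python
import click

# Reduced stand-in for the options list built in common_all(); only the two
# flags relevant to this PR are shown.
_options = [
    click.option("--disk-only", is_flag=True, default=False,
                 help="Keep all results on disk instead of in memory."),
    click.option("--no-db-dump", is_flag=True, default=False,
                 help="Do not dump the in-memory database to disk."),
]


def common_flags(fn):
    # Apply in reverse so the options show up in declaration order in --help.
    for opt in _options[::-1]:
        fn = opt(fn)
    return fn


@click.command()
@common_flags
def demo(disk_only, no_db_dump):
    click.echo(f"disk_only={disk_only}, no_db_dump={no_db_dump}")


if __name__ == "__main__":
    demo()
```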
19 changes: 19 additions & 0 deletions big_scape/cli/cli_validations.py
@@ -128,6 +128,25 @@ def validate_output_paths(ctx) -> None:
    return None


# db modes validations
def validate_disk_only(ctx) -> None:
    """Checks if the database storage/dumping modes that were set are compatible"""

    if not ("no_db_dump" in ctx.obj and "disk_only" in ctx.obj):
        raise RuntimeError(
            "Something went wrong with the database storage/dumping mode parameters. "
            "Please contact the developers."
        )

    if ctx.obj["no_db_dump"] and ctx.obj["disk_only"]:
        logging.error(
            "You have selected both --no-db-dump and --disk-only. Please select only one"
        )
        raise click.UsageError(
            "You have selected both --no-db-dump and --disk-only. Please select only one"
        )


# comparison validations


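A quick sketch of what the new validation rejects; the `SimpleNamespace` stand-in for the click context is illustrative, and this assumes the `big_scape` package is importable:

```python
from types import SimpleNamespace

import click

from big_scape.cli.cli_validations import validate_disk_only

# Stand-in for the click context object; in a real run ctx.obj is filled in
# by the CLI callbacks for --no-db-dump and --disk-only.
ctx = SimpleNamespace(obj={"no_db_dump": True, "disk_only": True})

try:
    validate_disk_only(ctx)
except click.UsageError as err:
    # "You have selected both --no-db-dump and --disk-only. Please select only one"
    print(err.format_message())
```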
2 changes: 2 additions & 0 deletions big_scape/cli/cluster_cli.py
@@ -11,6 +11,7 @@
from .cli_common_options import common_all, common_cluster_query
from .cli_validations import (
    validate_output_paths,
    validate_disk_only,
    validate_binning_cluster_workflow,
    validate_pfam_path,
    validate_domain_include_list,
@@ -77,6 +78,7 @@ def cluster(ctx, *args, **kwargs):
    validate_pfam_path(ctx)
    validate_domain_include_list(ctx)
    validate_output_paths(ctx)
    validate_disk_only(ctx)

    # set start time and run label
    set_start(ctx.obj)
2 changes: 2 additions & 0 deletions big_scape/cli/query_cli.py
@@ -12,6 +12,7 @@
from .cli_common_options import common_all, common_cluster_query
from .cli_validations import (
    validate_output_paths,
    validate_disk_only,
    validate_query_bgc,
    validate_pfam_path,
    set_start,
@@ -81,6 +82,7 @@ def query(ctx, *args, **kwarg):
    validate_output_paths(ctx)
    validate_binning_query_workflow(ctx)
    validate_query_record(ctx)
    validate_disk_only(ctx)

    # set start time and label
    set_start(ctx.obj)
42 changes: 41 additions & 1 deletion big_scape/data/sqlite.py
@@ -96,6 +96,26 @@ def open_memory_connection() -> None:
        )
        DB.connection = DB.engine.connect()

    def open_disk_connection(db_path: Path) -> None:
        if DB.opened():
            raise DBAlreadyOpenError()

        DB.engine = create_engine(
            "sqlite:///" + str(db_path),
            connect_args={"check_same_thread": False},
            poolclass=StaticPool,
        )
        DB.connection = DB.engine.connect()

    @staticmethod
    def create_on_disk(db_path: Path) -> None:
        """Open a connection to a database file"""
        DB.open_disk_connection(db_path)

        DB.create_tables()

        DB.reflect()

    @staticmethod
    def create_in_mem() -> None:
        """Create a new database in-memory"""
@@ -106,7 +126,7 @@ def create_in_mem() -> None:
        DB.reflect()

    @staticmethod
    def save_to_disk(db_path: Path) -> None:
    def save_to_disk(db_path: Path, force=False) -> None:
        """Saves the in-memory database to a .db file. This overwrites any last database
        file in the same location
        """
@@ -116,6 +136,20 @@ def save_to_disk(db_path: Path) -> None:
        click_context = click.get_current_context(silent=True)

        if click_context and click_context.obj["no_db_dump"]:
            # added force to override this override. fun times.
            # this is used when you use the --no-db-dump flag so that we can force it
            # to save at the end anyway
            # TODO: this logic should be moved out, where the no_db_dump flag is checked
            # when it is appropriate to do so
            if not force:
                return

        # skip this if we are using disk-only mode
        if click_context and click_context.obj["disk_only"]:
            return

        # skip this if we are using disk-only mode
        if click_context and click_context.obj["disk_only"]:
            return

        if not DB.opened():
@@ -175,6 +209,12 @@ def load_from_disk(db_path: Path) -> None:
        if not db_path.exists():
            raise FileNotFoundError()

        # disk only means we don't have to dump to memory
        click_context = click.get_current_context(silent=True)
        if click_context and click_context.obj["disk_only"]:
            DB.create_on_disk(db_path)
            return

        file_engine = create_engine("sqlite:///" + str(db_path))
        file_engine.connect()

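For context, the only real difference between the two storage modes is the SQLAlchemy engine URL; here is a standalone sketch of the two configurations used above (the `example` table is purely illustrative, not BiG-SCAPE's schema):

```python
from pathlib import Path

from sqlalchemy import create_engine, text
from sqlalchemy.pool import StaticPool

db_path = Path("bs_data.db")

# disk-only mode: every statement is executed directly against the .db file
disk_engine = create_engine(
    "sqlite:///" + str(db_path),
    connect_args={"check_same_thread": False},
    poolclass=StaticPool,
)

# default mode: an in-memory database that save_to_disk() later dumps to a file
mem_engine = create_engine(
    "sqlite://",
    connect_args={"check_same_thread": False},
    poolclass=StaticPool,
)

with disk_engine.connect() as conn:
    conn.execute(text("CREATE TABLE IF NOT EXISTS example (id INTEGER PRIMARY KEY)"))
    conn.commit()
```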
10 changes: 8 additions & 2 deletions big_scape/run_bigscape.py
@@ -111,8 +111,14 @@ def signal_handler(sig, frame):

    # INPUT - create an in memory bs_data.DB or load from disk
    if not run["db_path"].exists():
        bs_data.DB.create_in_mem()
        if run["disk_only"]:
            logging.info("Creating on disk database")
            bs_data.DB.create_on_disk(run["db_path"])
        else:
            logging.info("Creating in memory database")
            bs_data.DB.create_in_mem()
    else:
        logging.info("Loading database from disk")
        bs_data.DB.load_from_disk(run["db_path"])
        bs_data.DB.check_config_hash()

@@ -352,6 +358,6 @@ def signal_handler(sig, frame):
    exec_time = end - start_time

    bs_data.DB.set_run_end(run["run_id"], start_time, end)
    bs_data.DB.save_to_disk(run["db_path"])
    bs_data.DB.save_to_disk(run["db_path"], True)

    logging.info("All tasks done at %f seconds", exec_time.total_seconds())