From c90b3f4e2f1d1de3a4ee70188ef8fe822105e376 Mon Sep 17 00:00:00 2001
From: Catarina Loureiro <catarina.salesesantosloureiro@wur.nl>
Date: Fri, 22 Nov 2024 10:24:58 +0100
Subject: [PATCH] small changes to cli arg help messages, and let benchmark
 account for labels in run names

---
 .../benchmarking/benchmark_data_loader.py     |   4 +
 big_scape/cli/cli_common_options.py           | 276 ++++++++++--------
 big_scape/cli/cli_validations.py              |   6 +-
 big_scape/cli/cluster_cli.py                  |  32 +-
 big_scape/cli/query_cli.py                    |  49 ++--
 big_scape/config.yml                          |  21 +-
 6 files changed, 227 insertions(+), 161 deletions(-)

diff --git a/big_scape/benchmarking/benchmark_data_loader.py b/big_scape/benchmarking/benchmark_data_loader.py
index efd18624..d4c98760 100644
--- a/big_scape/benchmarking/benchmark_data_loader.py
+++ b/big_scape/benchmarking/benchmark_data_loader.py
@@ -111,6 +111,10 @@ def load_computed_bs2_labels(self, data_path: Path) -> None:
         run_times = [
             p.stem.replace("_full", "") for p in data_path.glob("*_full.network")
         ]
+
+        # assume the date-time stamp (YYYY-MM-DD_HH-MM-SS) is the last 19 characters; strip any preceding label
+        run_times = [rt[-19:] for rt in run_times]
+
         if len(run_times) == 0:
             raise FileNotFoundError("No BiG-SCAPE 2 output found")
         elif len(run_times) == 1:
diff --git a/big_scape/cli/cli_common_options.py b/big_scape/cli/cli_common_options.py
index ec69cd1e..1b601c68 100644
--- a/big_scape/cli/cli_common_options.py
+++ b/big_scape/cli/cli_common_options.py
@@ -34,13 +34,15 @@ def common_all(fn):
     """
     options = [
         click.option(
-            "--config_file_path",
+            "--config-file-path",
             type=click.Path(
                 exists=True, dir_okay=False, file_okay=True, path_type=Path
             ),
             default=Path(bs_paths.DEFAULT_CONFIG_FILE),
-            help="Path to BiG-SCAPE config file, which stores values for a "
-            "series of advanced use parameters. (default: bundled big_scape/config.yml).",
+            help=(
+                "Path to BiG-SCAPE config.yml file, which stores values for a "
+                "series of advanced use parameters. (default: bundled big_scape/config.yml)."
+            ),
         ),
         # diagnostic parameters
         click.option(
@@ -64,9 +66,11 @@ def common_all(fn):
             "--label",
             default=None,
             type=str,
-            help="A run label to be added to the output results folder name, as well as "
-            "dropdown menu in the visualization page. "
-            "By default, BiG-SCAPE runs will have a name such as YYYY-MM-DD_HH-MM-SS_[label]",
+            help=(
+                "A run label to be added to the output results folder name, as well as "
+                "dropdown menu in the visualization page. "
+                "By default, BiG-SCAPE runs will have a name such as [label]_YYYY-MM-DD_HH-MM-SS."
+            ),
         ),
         click.option(
             "-c",
@@ -80,65 +84,19 @@ def common_all(fn):
         # output parameters
         click.option(
             "-o",
-            "--output_dir",
+            "--output-dir",
             type=click.Path(path_type=Path, dir_okay=True, file_okay=False),
             required=True,
             callback=validate_output_dir,
             help="Output directory for all BiG-SCAPE results files.",
         ),
         click.option(
-            "--log_path",
+            "--log-path",
             type=click.Path(
                 path_type=Path(exists=False), dir_okay=True, file_okay=False
             ),
             help="Path to output log file. (default: output_dir/timestamp.log).",
         ),
-        click.option(
-            "--no-db-dump",
-            type=bool,
-            is_flag=True,
-            default=False,
-            help="Do not dump the sqlite database to disk. This will speed up your run, "
-            "but in case of a crashed run no info will be stored and you'll have to "
-            "re-start the run from scratch",
-        ),
-        click.option(
-            "--disk-only",
-            type=bool,
-            is_flag=True,
-            default=False,
-            help="Do not store any results in memory, only on disk. This is almost certainly "
-            "slower than the default behaviour, but can be useful for very large runs or "
-            "runs with limited memory.",
-        ),
-        click.option(
-            "--db-only-output",
-            type=bool,
-            is_flag=True,
-            default=False,
-            help="Do not generate any output besides the data stored in the database. "
-            "Suitable for advanced users that wish to only make use of the results "
-            "stored in the SQLite database.",
-        ),
-        click.option(
-            "--no-trees",
-            type=bool,
-            is_flag=True,
-            default=False,
-            help="Do not generate any GCF newick trees. Suitable for users that do not "
-            "utilize our output visualization, but only make use of the output stored "
-            "in the tsv files and/or SQLite database.",
-        ),
-        click.option(
-            "--force-gbk",
-            type=bool,
-            is_flag=True,
-            default=False,
-            help="If GBK files are found without antiSMASH annotations, this adds a region covering "
-            "the full sequence, and sets its product to 'other'. Warning: BiG-SCAPE still "
-            "needs CDS features and a sequence feature to work with non-antiSMASH gbks. "
-            "Furthermore, this feature is still under development, use at own risk.",
-        ),
     ]
     for opt in options[::-1]:
         fn = opt(fn)
@@ -164,181 +122,267 @@ def common_cluster_query(fn):
         ),
         click.option(
             "-i",
-            "--input_dir",
-            "--gbk_dir",
+            "--input-dir",
+            "--gbk-dir",
             callback=validate_not_empty_dir,
             type=click.Path(
                 exists=True, file_okay=False, dir_okay=True, path_type=Path
             ),
             required=True,
-            help="Input directory containing gbk files to be used by BiG-SCAPE.",
+            help="Input directory containing .gbk files to be used by BiG-SCAPE. See the wiki for more details.",
         ),
         click.option(
-            "--input_mode",
+            "--input-mode",
             default="recursive",
             callback=validate_input_mode,
             type=click.Choice(["recursive", "flat"]),
             help=(
-                "Where to look for input GBK files. "
-                "recursive: search for gbk files recursively in input directory. "
-                "flat: search for gbk files in input directory only. "
+                "Tells BiG-SCAPE where to look for input GBK files. "
+                "recursive: search for .gbk files recursively in input directory. "
+                "flat: search for .gbk files in input directory only. "
                 "(default: recursive)."
             ),
         ),
         # TODO: adjust choices
         click.option(
             "-m",
-            "--mibig_version",
+            "--mibig-version",
             type=str,
             required=False,
-            help="MIBiG release number (from 3.1 onwards). If not provided, MIBiG will not be "
-            "included in the analysis. If required, BiG-SCAPE will download the "
-            "MIBiG database to ./big_scape/MIBiG/mibig_antismash_<version>_gbk. "
-            "(Advanced) Any custom MIBiG collection can be used as long as the expected "
-            "folder is present.",
+            help=(
+                "MIBiG release number (from 3.1 onwards). If not provided, MIBiG will not be "
+                "included in the analysis. If required, BiG-SCAPE will download the "
+                "MIBiG database to ./big_scape/MIBiG/mibig_antismash_<version>_gbk. "
+                "For advanced users: any custom (antiSMASH-processed) MIBiG collection"
+                " can be used as long as the expected folder is present, e.g. provide"
+                " -m mymibig with ./big_scape/MIBiG/mibig_antismash_mymibig_gbk. "
+                "For more information, see the wiki."
+            ),
         ),
         click.option(
             "-r",
-            "--reference_dir",
+            "--reference-dir",
             callback=validate_not_empty_dir,
             type=click.Path(
                 exists=True, file_okay=False, dir_okay=True, path_type=Path
             ),
-            help="Path to directory containing user defined, non-MIBiG, antiSMASH processed reference BGCs.",
+            help=(
+                "Path to directory containing user defined, non-MIBiG, antiSMASH processed reference BGCs. "
+                "For more information, see the wiki."
+            ),
         ),
         click.option(
-            "--include_gbk",
+            "--include-gbk",
             type=str,
             default="cluster,region",
             callback=validate_filter_gbk,
             help=(
-                "A comma separated list of strings. Only gbk files that have "
-                "the string(s) in their filename will be used for the analysis "
-                "(default: 'cluster,region'). Use an asterisk to accept every "
-                "file (overrides '--exclude_gbk_str')."
+                "A comma separated list of strings. Only .gbk files that have "
+                "the string(s) in their filename will be used for the analysis. "
+                "Use an asterisk to accept every file ('*' overrides '--exclude-gbk'). "
+                "(default: cluster, region)."
             ),
         ),
         click.option(
-            "--exclude_gbk",
+            "--exclude-gbk",
             type=str,
             default="final",
             callback=validate_filter_gbk,
             help=(
                 "A comma separated list of strings. "
-                "If any string in this list occurs in the gbk filename, this "
+                "If any string in this list occurs in the .gbk filename, this "
                 "file will not be used for the analysis (default: final)."
             ),
         ),
         click.option(
             "-p",
-            "--pfam_path",
+            "--pfam-path",
             type=click.Path(
                 exists=True, dir_okay=False, file_okay=True, path_type=Path
             ),
-            help="Path to Pfam database file.",
+            help=(
+                "Path to Pfam database `.hmm` file (e.g. `Pfam-A.hmm`)."
+                " If the `.hmm` file has already been pressed and the pressed files"
+                " are included in the same folder as the Pfam `.hmm` file, BiG-SCAPE "
+                "will also use these pressed files. If this is not the case, BiG-SCAPE"
+                " will run `hmmpress`. Note: the latter requires the user to have write "
+                "permissions to the given Pfam folder."
+            ),
         ),
         click.option(
             # TODO: implement
-            "--domain_includelist_all_path",
+            "--domain-includelist-all-path",
             type=click.Path(
                 exists=True, dir_okay=False, file_okay=True, path_type=Path
             ),
             callback=validate_includelist_all,
             help=(
-                "Path to txt file with Pfam accessions. Only BGCs containing all "
-                "the listed accessions will be analysed. In this file, each "
-                "line contains a single Pfam accession (with an optional comment,"
-                " separated by a tab). Lines starting with '#' are ignored. Pfam "
-                "accessions are case-sensitive."
+                "Path to .txt file with phmm domain accessions (commonly, Pfam accessions "
+                "(e.g. PF00501)). Only regions containing all the listed accessions will "
+                "be analyzed. In this file, each line contains a single phmm domain accession "
+                "(with an optional comment, separated by a tab). Lines starting with '#' "
+                "are ignored. Domain accessions are case-sensitive. Cannot be provided in "
+                "conjunction with --domain-includelist-any-path."
             ),
         ),
         click.option(
             # TODO: implement
-            "--domain_includelist_any_path",
+            "--domain-includelist-any-path",
             type=click.Path(
                 exists=True, dir_okay=False, file_okay=True, path_type=Path
             ),
             callback=validate_includelist_any,
             help=(
-                "Path to txt file with Pfam accessions. Only BGCs containing any of "
-                "the listed accessions will be analysed. In this file, each "
-                "line contains a single Pfam accession (with an optional comment,"
-                " separated by a tab). Lines starting with '#' are ignored. Pfam "
-                "accessions are case-sensitive."
+                "Path to .txt file with phmm domain accessions (commonly, Pfam accessions "
+                "(e.g. PF00501)). Only BGCs containing any of the listed accessions will "
+                "be analyzed. In this file, each line contains a single phmm domain accession "
+                "(with an optional comment, separated by a tab). Lines starting with '#' "
+                "are ignored. Domain accessions are case-sensitive. Cannot be provided in "
+                "conjunction with --domain-includelist-all-path."
             ),
         ),
         click.option(
-            "--legacy_weights",
+            "--legacy-weights",
             is_flag=True,
             help=(
                 "Use BiG-SCAPE v1 class-based weights in distance calculations. "
                 "If not selected, the distance metric will be based on the 'mix' "
-                "weights distribution. Warning: these legacy weights are not recommended "
-                "for use with the record types 'protocluster'/'protocore', as they have "
-                "been optimized and validated only for the 'region' record type."
+                "weights distribution. Warning: these weights have not been validated "
+                "for record types other than region (see option --record-type, and wiki)."
             ),
         ),
         click.option(
-            "--alignment_mode",
+            "--alignment-mode",
             type=click.Choice(["global", "glocal", "local", "auto"]),
             default="glocal",
             callback=validate_alignment_mode,
             help=(
-                "Alignment mode for each pair of gene clusters. 'global': the whole "
-                "list of domains of each BGC are compared; 'local': Longest Common "
-                "Subcluster mode. Redefine the subset of the domains used to "
-                "calculate distance by trying to find the longest slice of common "
-                "domain content per gene in both BGCs, then extend each slice. "
-                "'glocal': Similar to local, but extension assumes full extension "
-                "of the shortest upstream/downstream arms in a compared pair. "
-                "'auto': use glocal when at least one of the BGCs in each pair "
-                "has the 'contig_edge' annotation from antiSMASH v4+, otherwise "
-                "use global mode on that pair. For an in depth description, see the wiki."
+                "Alignment mode for each pair of gene clusters. global: the whole list of domains of "
+                "each BGC record is compared; local: Seeds the subset of the domains used to calculate "
+                "distance by trying to find the longest slice of common domain content (Longest Common "
+                "Subcluster, LCS) between both records, then extends each side (see --extend-strategy). "
+                "glocal: Starts with performing local, but domain selection is then extended to the "
+                "shortest upstream/downstream arms in a compared record pair. "
+                "auto: use glocal when at least one of the BGCs in each pair has the contig_edge annotation "
+                "from antiSMASH v4+, otherwise use global mode on that pair. For an in depth description, see the wiki."
                 " (default: glocal)."
             ),
         ),
         click.option(
-            "--extend_strategy",
+            "--extend-strategy",
             type=click.Choice(["legacy", "greedy", "simple_match"]),
             default="legacy",
             callback=validate_extend_strategy,
-            help="Strategy to extend BGCs. 'legacy' will use the original BiG-SCAPE extension strategy, "
-            "while 'greedy' or 'simple_match' will use new extension strategies. For an in depth description,"
-            " see the wiki. (default: legacy).",
+            help=(
+                "Strategy to extend the BGCs record pair comparable region. legacy will use the original "
+                "BiG-SCAPE extend strategy, while greedy and simple match are newly introduced in BiG-SCAPE 2. "
+                "Legacy and simple match both examine the domain architecture of the record pair in order to "
+                "find the most relevant extended borders. Greedy is a very simple method that takes the "
+                "coordinates of the outermost matching domains as the extended borders. "
+                "For more detail see the wiki. (default: legacy)."
+            ),
         ),
         # networking parameters
         click.option(
-            "--gcf_cutoffs",
+            "--gcf-cutoffs",
             type=str,
             default=0.3,
             callback=validate_gcf_cutoffs,
             help=(
                 "A comma separated list of floats. "
-                "Generate networks using multiple raw distance cutoff values. "
+                "Generate networks using multiple distance cutoff values. "
                 "Values should be in the range [0.0, 1.0]. Example: --gcf_cutoffs 0.1,"
-                "0.25,0.5,1.0. (default: 0.3)."
+                "0.25,0.5,1.0. For more detail see the wiki. (default: 0.3)."
             ),
         ),
         # output parameters
         click.option(
-            "--profile_path",
+            "--profile-path",
             type=click.Path(path_type=Path, dir_okay=False),
             help="Path to output profile file. (default: output_dir/).",
         ),
         click.option(
             "-db",
-            "--db_path",
+            "--db-path",
             type=click.Path(path_type=Path, dir_okay=False),
             help="Path to sqlite db output file. (default: output_dir/output_dir.db).",
         ),
         # TODO: implement cand_cluster here and LCS-ext
         click.option(
             # TODO: double check that cand_cluster is proper implemented
-            "--record_type",
+            "--record-type",
             type=click.Choice(["region", "cand_cluster", "protocluster", "proto_core"]),
             default="region",
             callback=validate_record_type,
-            help="Use a specific type of antiSMASH record for comparison. (default: region).",
+            help=(
+                "Use a specific type of antiSMASH record for comparison. For every .gbk, "
+                "BiG-SCAPE will try to extract the requested record type, if this is not present, "
+                "BiG-SCAPE will try to extract the next higher level record type, i.e. if a "
+                "proto_core feature is not present, BiG-SCAPE will look for a protocluster feature, "
+                "and so on and so forth. The record type hierarchy is: region>cand_cluster>protocluster>proto_core. "
+                "For more detail, see the wiki. (default: region)."
+            ),
+        ),
+        click.option(
+            "--no-db-dump",
+            type=bool,
+            is_flag=True,
+            default=False,
+            help=(
+                "Do not dump the sqlite database to disk. This will speed up your run, "
+                "but in case of a crashed run no info will be stored and you'll have to "
+                "re-start the run from scratch"
+            ),
+        ),
+        click.option(
+            "--disk-only",
+            type=bool,
+            is_flag=True,
+            default=False,
+            help=(
+                "Do not store any results in memory, only on disk. This is almost certainly "
+                "slower than the default behavior, but will save some memory and can therefore be "
+                "useful for very large runs or runs with limited memory"
+            ),
+        ),
+        click.option(
+            "--db-only-output",
+            type=bool,
+            is_flag=True,
+            default=False,
+            help=(
+                "Do not generate any output besides the data stored in the database. "
+                "Suitable for advanced users that wish to only make use of the results "
+                "stored in the SQLite database."
+            ),
+        ),
+        click.option(
+            "--no-trees",
+            type=bool,
+            is_flag=True,
+            default=False,
+            help=(
+                "Do not generate any GCF newick trees. Suitable for users that do not "
+                "utilize our output visualization, but only make use of the output stored "
+                "in the .tsv files (which include the network files) and/or SQLite database."
+            ),
+        ),
+        click.option(
+            "--force-gbk",
+            type=bool,
+            is_flag=True,
+            default=False,
+            help=(
+                "Recommended for advanced users only. Allows BiG-SCAPE to use non-antiSMASH "
+                "processed .gbk files. If GBK files are found without antiSMASH annotations "
+                "(specifically, BiG-SCAPE checks for the absence of an antiSMASH version feature), "
+                "BiG-SCAPE will still read and parse these files, and will create internal gbk "
+                "record objects, each of which will have a region feature covering the full sequence "
+                "length and a product feature `other`. Warning: BiG-SCAPE still needs CDS features and "
+                "a sequence feature to work with non-antiSMASH .gbks. Furthermore, --include-gbk and "
+                "--exclude-gbk parameters might need to be adjusted if .gbk file names also do not follow "
+                "antiSMASH format. Disclaimer: this feature is still under development, use at own risk."
+            ),
         ),
     ]
 
diff --git a/big_scape/cli/cli_validations.py b/big_scape/cli/cli_validations.py
index bc4ba3a3..40dd6941 100644
--- a/big_scape/cli/cli_validations.py
+++ b/big_scape/cli/cli_validations.py
@@ -430,11 +430,13 @@ def validate_query_record(ctx) -> None:
     ):
         logging.error(
             "Missing option '--query_record_number'."
-            "A query record number is required when running query mode with a given record type."
+            "A query record number is required when running query mode with "
+            "a record type other than 'region'."
         )
         raise click.UsageError(
             "Missing option '--query_record_number'."
-            "A query record number is required when running query mode with a given record type."
+            "A query record number is required when running query mode with "
+            "a record type other than 'region'."
         )
 
     return None
diff --git a/big_scape/cli/cluster_cli.py b/big_scape/cli/cluster_cli.py
index 31060088..96153d69 100644
--- a/big_scape/cli/cluster_cli.py
+++ b/big_scape/cli/cluster_cli.py
@@ -36,16 +36,17 @@
 (e.g. T2PKS) or categories (e.g. PKS) to run analyses on class/category-based
 bins, respectively.
 
-'--classify legacy' which is based on BiG-SCAPE v1 predefined groups:
+'--classify legacy' is based on BiG-SCAPE v1 predefined groups:
 PKS1, PKSOther, NRPS, NRPS-PKS-hybrid, RiPP, Saccharide, Terpene, Others,
-and will automatically use complementary '--legacy_weights'.
-This feature is available for backwards compatibility with input .gbks
+and will automatically use complementary '--legacy-weights'.
+'--classify legacy' is available for backwards compatibility with input .gbks
 generated with antiSMASH versions up to version 7. For higher antiSMASH
 versions, use at your own risk, as BGC classes may have changed. All antiSMASH
 classes that this legacy mode does not recognise will be grouped in 'others'.
+To update the antiSMASH classes list yourself, see the config.yml file.
 
 '--classify class' and '--classify category' can be used in combination with
---legacy_weights if input .gbks have been generated by antiSMASH version 6 or
+--legacy-weights if input .gbks have been generated by antiSMASH version 6 or
 higher. For older antiSMASH versions, either use --classify 'legacy' or do not
 select --legacy_weights, which will perform the weighted distance calculations
 based on the generic 'mix' weights. For more detail, see wiki.
@@ -57,17 +58,17 @@
     is_flag=True,
     help=(
         "Calculate distances using a 'mix' bin, wherein no classification is applied. "
-        "This will do an all-vs-all comparison, and is likely going to take a long time. "
+        "This will do an all-vs-all comparison of all input BGC records. "
         "This bin will use weights from the 'mix' weights distribution: "
-        "{JC: 0.2, AI: 0.05, DSS: 0.75, Anchor boost: 2.0}"
+        "{JC: 0.2, AI: 0.05, DSS: 0.75, Anchor boost: 2.0}. For more detail, see wiki."
     ),
 )
 # comparison parameters
 @click.option(
-    "--hybrids_off",
+    "--hybrids-off",
     is_flag=True,
     help=(
-        "Toggle to add BGCs with hybrid predicted classes/categories to each "
+        "Toggle to add BGC records with hybrid predicted classes/categories to each "
         "subclass instead of a hybrid class/network (e.g. a 'terpene-nrps' BGC "
         "would be added to both the terpene and NRPS classes/networks instead of "
         "the terpene.nrps network). "
@@ -75,7 +76,7 @@
     ),
 )
 @click.option(
-    "--exclude_categories",
+    "--exclude-categories",
     callback=validate_class_category_filter,
     help=(
         "A comma separated list of categories. BGCs that have at least one of the product "
@@ -85,7 +86,7 @@
     ),
 )
 @click.option(
-    "--include_categories",
+    "--include-categories",
     callback=validate_class_category_filter,
     help=(
         "A comma separated list of categories. Only BGCs that have at least one of the "
@@ -95,7 +96,7 @@
     ),
 )
 @click.option(
-    "--exclude_classes",
+    "--exclude-classes",
     callback=validate_class_category_filter,
     help=(
         "A comma separated list of classes. BGCs that have at least one of the product "
@@ -104,7 +105,7 @@
     ),
 )
 @click.option(
-    "--include_classes",
+    "--include-classes",
     callback=validate_class_category_filter,
     help=(
         "A comma separated list of classes. Only BGCs that have at least one of the "
@@ -114,9 +115,12 @@
 )
 # networking parameters
 @click.option(
-    "--include_singletons",
+    "--include-singletons",
     is_flag=True,
-    help=("Include singletons in the network."),
+    help=(
+        "Include singletons in the network and all respective output."
+        " Reference singletons will not be included even if this is toggled."
+    ),
 )
 @click.pass_context
 def cluster(ctx, *args, **kwargs):
diff --git a/big_scape/cli/query_cli.py b/big_scape/cli/query_cli.py
index 62268b05..ed571d3f 100644
--- a/big_scape/cli/query_cli.py
+++ b/big_scape/cli/query_cli.py
@@ -25,42 +25,29 @@
 @click.command()
 @common_all
 @common_cluster_query
-@click.option(
-    "--classify",
-    type=click.Choice(["none", "class", "category"]),
-    default="none",
-    callback=validate_classify,
-    help=(
-        "By default BiG-SCAPE will query against any other supplied BGCs regardless of "
-        "class/category. Instead, select 'class' or 'category' to run analyses on "
-        "class-based bins. Only gene clusters with the same class/category will be "
-        "compared. Can be used in combination with '--legacy_weights' for gbks "
-        "produced by antiSMASH version 6 or higher. For older antiSMASH versions, "
-        "deselect '--legacy_weights', leading to the use of a generic 'mix' weight: "
-        "{JC: 0.2, AI: 0.05, DSS: 0.75, Anchor boost: 2.0}. (default: none)"
-    ),
-)
 @click.option(
     "-q",
-    "--query_bgc_path",
+    "--query-bgc-path",
     type=click.Path(exists=True, dir_okay=False, file_okay=True, path_type=Path),
     required=True,
     callback=validate_query_bgc,
     help=(
-        "Path to query BGC file. BiG-SCAPE will compare "
-        "all BGCs in the input and reference folders to the query"
+        "Path to query BGC .gbk file. BiG-SCAPE will compare "
+        "all BGC records in the input and reference folders to the query"
         " in a one-vs-all mode."
     ),
 )
 @click.option(
     "-n",
-    "--query_record_number",
+    "--query-record-number",
     type=int,
     required=False,
     help=(
         "Query BGC record number. Used to select the specific record "
-        "from the query BGC gbk. Warning: if interleaved or chemical hybrid proto "
-        "cluster/cores are merged (see config), the relevant number is that of the "
+        "from the query BGC .gbk, and is only relevant when running "
+        "--record-type cand_cluster, protocluster or proto_core."
+        " Warning: if interleaved or chemical hybrid proto cluster/cores "
+        "are merged (see config.yml), the relevant number is that of the "
         "first record of the merged cluster (the one with the lowest number). "
         "e.g. if records 1 and 2 get merged, the relevant number is 1. "
     ),
@@ -70,9 +57,25 @@
     is_flag=True,
     help=(
         "By default, BiG-SCAPE will only generate edges between the query and reference"
-        " BGCs. With the propagate flag, BiG-SCAPE will go through multiple cycles of "
+        " BGC records. With the propagate flag, BiG-SCAPE will go through multiple cycles of "
         "edge generation until no new reference BGCs are connected to the query "
-        "connected component."
+        "connected component. For more details, see the Wiki."
+    ),
+)
+@click.option(
+    "--classify",
+    type=click.Choice(["none", "class", "category"]),
+    default="none",
+    callback=validate_classify,
+    help=(
+        "By default BiG-SCAPE will compare the query BGC record against any other "
+        "supplied reference BGC records regardless of antiSMASH product class/category. "
+        "Instead, select 'class' or 'category' to run analyses on one class-specific bin, "
+        "in which case only reference BGC records with the same class/category as the "
+        "query record will be compared. Can be used in combination with --legacy-weights "
+        "for .gbks produced by antiSMASH version 6 or higher. For older antiSMASH versions "
+        "or if --legacy-weights is not selected, BiG-SCAPE will use the generic 'mix' weights: "
+        "{JC: 0.2, AI: 0.05, DSS: 0.75, Anchor boost: 2.0}. (default: none)"
     ),
 )
 @click.pass_context
diff --git a/big_scape/config.yml b/big_scape/config.yml
index 3fe6bb00..4bc257a5 100644
--- a/big_scape/config.yml
+++ b/big_scape/config.yml
@@ -1,9 +1,12 @@
+# For more details on the config options, see the documentation at
+# the github wiki (https://github.com/medema-group/BiG-SCAPE/wiki).
+
 # PROFILER
 # Update interval in seconds when profiler functionality is active.
 PROFILER_UPDATE_INTERVAL: 0.5
 
 # INPUT
-# list of cand_cluster types where subrecords will be merged.
+# List of cand_cluster types where subrecords will be merged.
 MERGED_CAND_CLUSTER_TYPE:
   - chemical_hybrid
   - interleaved
@@ -12,7 +15,7 @@ MIN_BGC_LENGTH: 0
 MAX_BGC_LENGTH: 500000
 
 # CDS and DOMAIN
-# Specify at which overlap percentage (as a decimal) two CDS in a gbk
+# Specify at which overlap percentage (as a decimal) two CDS in a .gbk
 # are considered to overlap. This preserves longest overlapping CDS.
 CDS_OVERLAP_CUTOFF: 0.1
 # Specify at which overlap percentage (as a decimal) two domains
@@ -38,7 +41,9 @@ REGION_MIN_EXTEND_LEN: 0.3
 REGION_MIN_EXTEND_LEN_BIO: 0.2
 # - Protoclusters or Proto_cores with at least one biosynthetic domain in the extended slice
 PROTO_MIN_EXTEND_LEN: 0.2
-# List of product classes that do not require a minimum length.
+# List of product classes that do not require a minimum length. In practice, this
+# means that an LCS and/or Extended slice of at least 1 domain will be accepted,
+# so long as this is a core biosynthetic domain.
 NO_MIN_CLASSES:
   - terpene
 # Integer scoring metrics used in the LCS extension algorithm for match, mismatch and gap.
@@ -54,8 +59,9 @@ EXTEND_MAX_MATCH_PERC: 0.1
 # of families created. Higher preference will result in more families and vice versa.
 PREFERENCE: 0.0
 
-# TREE
-# The number of common domains used to generate GCF trees in top frequencies of occurrence.
+# GCF TREE
+# The number of common domains (present in the exemplar BGC record) used to
+# generate GCF trees in top frequencies of occurrence.
 TOP_FREQS: 3
 
 # ANCHOR DOMAINS
@@ -75,7 +81,10 @@ ANCHOR_DOMAINS:
   - PF05147  # Lanthionine synthetase C-like protein
 
 # LEGACY ANTISMASH CLASSES
-# These are the classes that are used in the legacy classify modes
+# List and groupings of the antiSMASH classes that are used in the
+# --classify legacy mode and for which --legacy-weights have been
+# optimized.  These have been updated up to antiSMASH version 7.0,
+# and will not be further maintained.
 LEGACY_ANTISMASH_CLASSES:
     pks1_products:
       - t1pks