From 9fab756f99fd288743d2fc3c621751d7c53b0dac Mon Sep 17 00:00:00 2001 From: cb-Hades <81743695+cb-Hades@users.noreply.github.com> Date: Mon, 26 Aug 2024 08:41:31 +0200 Subject: [PATCH] Update cmd #118 --- docs/source/cmd_desc.rst | 58 +++++++++++++++++++++++++++++------- src/refinegems/cmd_access.py | 42 +++++++++++++++++++++----- 2 files changed, 81 insertions(+), 19 deletions(-) diff --git a/docs/source/cmd_desc.rst b/docs/source/cmd_desc.rst index 27c9804..d0b44e6 100644 --- a/docs/source/cmd_desc.rst +++ b/docs/source/cmd_desc.rst @@ -170,7 +170,7 @@ Add or update tables for additional namespaces/databases into/of the in-build da Options: -- ``--chunksize/-c`` Size (in kB) of data to download per chunk, if a download is required. +- ``--chunksize/-c``: Size (in kB) of data to download per chunk, if a download is required. .. code:: bash @@ -184,21 +184,57 @@ refinegems gaps .. code:: bash - refinegems gaps find [MODELPATH] [GFF_FILE] [ORGANISMID] [GAPFILL_PARAMS] [FILENAME] + refinegems gaps find [ALGORITHM] [MODELPATH] [OPTIONS] -Find gaps in a model based on the genes/gene products of the underlying organism. +Find gaps in a model and optionally try to fill them. +The algorithm for the gap filling is chosen by the first argument. -.. code:: bash - - refinegems gaps fill [MODEL] [GAP_ANALYSIS_RESULTS] +General Options -Fill the gaps in a model based on a user-provided input file. +- ``-o``, ``--outdir``: Path to a directory to write the output to. [default: ``./``] +- ``-f``, ``--fill``: If True, tries to fill the gaps in the model. +- ``--fc``, ``--formula-check``: [none,existence,wildcard,strict] Set the filter for which metabolite formulas are valid to be added to the model. [default: existence] +- ``--no-dna``: Exclude DNA reactions (name-based) from being added to the model. +- ``--no-rna``: Exclude RNA reactions (name-based) from being added to the model. 
+- ``-p``, ``--idprefix``: Prefix for the random IDs, if an ID does not exist for the given namespace. [default: refineGEMs] +- ``-n``, ``--namespace``: [BiGG] Namespace used in the model. [default: BiGG] -.. code:: bash - - refinegems gaps autofill [MODELPATH] [GAFILL_PARAMS] [FILENAME] +| KEGG required parameters: [all required if ``ALG="KEGG"``] +| Parameters required when running the KEGG gap filling algorithm + +- ``--orgid``: KEGG organism ID + +| BioCyc required parameters: [all required if ``ALG="BioCyc"``] +| Parameters required when running the BioCyc gap filling algorithm + +- ``--gt``, ``--genetable``: Path to the BioCyc gene smart table. +- ``--rt``, ``--reactable``: Path to the BioCyc reaction smart table. +- ``--gff-bc``: Path to the GFF. + +| Gene required parameters: [all required if ``ALG="Gene"``] +| Parameters required when running the GeneGapFiller algorithm + +- ``--gff-g``: Path to the GFF. + +| Gene optional parameters: +| Optional / conditionally interdependent parameters for the gene gap filling algorithm + +- ``--prot-prefix``: Prefix for pseudo-protein IDs. [default: refineGEMs] +- ``--mail``: Mail address for NCBI requests. +- ``--ncbi``, ``--check-ncbi``: Enable searching protein IDs in NCBI. This increases the runtime significantly. +- ``--fasta``: Path to the protein FASTA of the model. +- ``--dmnd-db``: Path to the SwissProt DIAMOND database. +- ``--sp-map``, ``--swissprot-mapping``: Path to the SwissProt mapping file (ID against EC and BRENDA) +- ``-s``, ``--sensitivity``: [sensitive,more-sensitive,very-sensitive,ultra-sensitive] Sensitivity mode for running DIAMOND. [default: more-sensitive] +- ``--cov``: Coverage value (passed to DIAMOND) [default: 90.0] +- ``--pid``: Percentage identity threshold value for filtering DIAMOND results. +- ``-t``, ``--threads``: Number of threads to be used by DIAMOND. [default: 2] + +Constraints: -Automatically find and fill the gaps based on the genes/gene products. 
+- ``--mail`` is required if ``--check-ncbi`` is set +- if one of ``--fasta``, ``--dmnd-db``, ``--swissprot-mapping`` is set, all need to be set +- ``--fasta``, ``--dmnd-db``, ``--swissprot-mapping`` are all required if any of ``--sensitivity``, ``--cov``, ``--pid`` and ``--threads`` is set refinegems media diff --git a/src/refinegems/cmd_access.py b/src/refinegems/cmd_access.py index febdf65..acdbaab 100644 --- a/src/refinegems/cmd_access.py +++ b/src/refinegems/cmd_access.py @@ -192,12 +192,12 @@ def run(model,email,path,id_db,refseq_gff,protein_fasta,lab_strain,kegg_organism # Find and fill gaps in a model automatically/Fill gaps with manually created tables # ---------------------------------------------------------------------------------- # @TODO gaps group still for the old gapfill - rewrite or delete - +# @TEST help is displayed alright but untested @cli.group() def gaps(): """Find and fill gaps in a model.""" -@gaps.command() +@gaps.command(show_constraints=True) @cloup.argument('alg', type=click.Choice(['KEGG','BioCyc','Gene']), help='Type of automated gap filling algorithm, that shall be used.') @cloup.argument('modelpath', type=click.Path(exists=True, dir_okay=False), @@ -255,7 +255,7 @@ def gaps(): show_default = True, help='Prefix for pseudo-protein IDs.'), cloup.option('--mail', type=str, default=None, help='Mail address for NCBI requests.'), - cloup.option('--nbci','--check-ncbi', is_flag=True, default=False, + cloup.option('--ncbi','--check-ncbi', is_flag=True, default=False, help='Enable searching protein IDs in NCBI. 
This increases the runtime significantly.'), cloup.option('--fasta', type=click.Path(exists=True, dir_okay=False), default=None, help='Path to the protein FASTA of the model.'), @@ -274,8 +274,8 @@ def gaps(): help='Number of threads to be used by DIAMOND.'), ) @cloup.constraints.constraint(cloup.constraints.If(cloup.constraints.IsSet('ncbi'), then=cloup.constraints.require_all), ['mail']) -@cloup.constraints.constraint(cloup.constraints.If(cloup.constraints.AnySet('fasta','dmnd_db','sp_map'),then=cloup.constraints.AllSet), ['fasta','dmnd_db','sp_map']) -@cloup.constraints.constraint(cloup.constraints.If(cloup.constraints.AnySet('s','cov','pid','t'),then=cloup.constraints.AllSet), ['fasta','dmnd_db','sp_map']) +@cloup.constraints.constraint(cloup.constraints.If(cloup.constraints.AnySet('fasta','dmnd_db','sp_map'),then=cloup.constraints.require_all), ['fasta','dmnd_db','sp_map']) +@cloup.constraints.constraint(cloup.constraints.If(cloup.constraints.AnySet('sensitivity','cov','pid','threads'),then=cloup.constraints.require_all), ['fasta','dmnd_db','sp_map']) def automated_gapfill(alg,modelpath,outdir,fill, formula_check, no_dna, no_rna, idprefix, namespace, @@ -290,16 +290,42 @@ def automated_gapfill(alg,modelpath,outdir,fill, cmodel = rg.utility.io.load_model(modelpath, 'cobra') model = rg.utility.io.load_model(modelpath, 'libsbml') - # find gaps - gapfiller = rg.classes.gapfill.KEGGapFiller(orgid) + # set class instance + match alg: + case 'KEGG': + gapfiller = rg.classes.gapfill.KEGGapFiller(orgid) + # find gaps + gapfiller.missing_genes(model) + gapfiller.missing_reacs(cmodel) + case 'BioCyc': + gapfiller = rg.classes.gapfill.BioCycGapFiller(genetable, + reactable, + gff_bc) + # find gaps + gapfiller.missing_genes(model) + gapfiller.missing_reacs(cmodel) + case 'Gene': + gapfiller = rg.classes.gapfill.GeneGapFiller() + # find gaps + gapfiller.missing_genes(gff_g,model) + gapfiller.missing_reacs(cmodel, prot_prefix, mail, ncbi, fasta, dmnd_db, sp_map, + 
sensitivity, cov, pid, threads) + case _: + mes = f'Unknown option for algorthmn type: {alg}' + raise ValueError(mes) + # find gaps gapfiller.missing_genes(model) gapfiller.missing_reacs(cmodel) # fill gaps if fill: - model = gapfiller.fill_model(model) + model = gapfiller.fill_model(model, + formula_check=formula_check, + exclude_dnae=no_dna, exclude_rna=no_rna, + idprefix=idprefix, namespace=namespace) # save model write_model_to_file(model, Path(outdir, 'gapfilled_model.xml')) + # @TODO report stats # @TODO report manual curation