Update gapfill #126 #52

Added functions for adding reac / metabs per database id
draeger-lab · Aug 9, 2024 · 1bbfdd3 · 1bbfdd3
1 parent ad9f3e6
commit 1bbfdd3
Show file tree

Hide file tree

Showing 8 changed files with 1,813 additions and 1,150 deletions.
diff --git a/dev/gapfill-testing.ipynb b/dev/gapfill-testing.ipynb
diff --git a/src/refinegems/__init__.py b/src/refinegems/__init__.py
@@ -1,7 +1,8 @@
-__all__ = ['analysis','classes','curation','utility', 'cmd_access']
+__all__ = ['analysis','classes','curation','decorators','utility', 'cmd_access']
 
 from . import analysis
 from . import classes
 from . import curation
+from . import decorators
 from . import utility
 from . import cmd_access
diff --git a/src/refinegems/classes/gapfill.py b/src/refinegems/classes/gapfill.py
@@ -10,21 +10,22 @@
 from abc import ABC, abstractmethod
 
 import cobra
-import pandas as pd
+import io
 import numpy as np
+import pandas as pd
+import re
+import warnings
 
+from bioservices.kegg import KEGG
+from libsbml import Model as libModel
 from typing import Literal, Union
 
-from libsbml import Model as libModel
-from bioservices.kegg import KEGG
-import io
-import re
 from tqdm import tqdm
 tqdm.pandas()
 
 from ..curation.db_access.db import get_ec_from_ncbi, get_ec_via_swissprot
 from ..utility.io import load_a_table_from_database, parse_fasta_headers, parse_gff_for_cds
-from ..utility.entities import get_model_reacs_or_metabs
+from ..utility.entities import get_model_reacs_or_metabs, create_gp, create_gpr
 from ..curation.db_access.kegg import parse_KEGG_gene, parse_KEGG_ec
 
 ############################################################################
@@ -145,6 +146,9 @@ def __init__(self) -> None:
     #@abstractmethod
     #def load_gene_list(self):
     #    pass
+
+    # abstract methods
+    # ----------------
 
     @abstractmethod
     def get_missing_genes(self, model):
@@ -153,6 +157,9 @@ def get_missing_genes(self, model):
     @abstractmethod
     def get_missing_reacs(self,model):
         pass
+
+    # finding the gaps
+    # ----------------
 
     def _find_reac_in_model(self, model: cobra.Model, eccode:str, id:str, 
                idtype:Literal['MetaNetX','KEGG','BiGG', 'BioCyc'], 
@@ -190,12 +197,88 @@ def _find_reac_in_model(self, model: cobra.Model, eccode:str, id:str,
 
         return None
 
-    # @abstractmethod
-    # def run(self,model):
-    #     pass
+
+    # actual "Filling" part
+    # ---------------------
+
+    # @TODO logging? or remove self?
+    def add_genes_from_table(self,model:libModel, gene_table:pd.DataFrame) -> None:
+        """Create new GeneProduct for a table of genes in the format:
+        
+        | ncbiprotein | locus_tag | UniProt | ... |
+        
+        The dots symbolise additional columns, that can be passed to the function, 
+        but will not be used by it. The other columns, except UniProt, are required.
+
+        Args:
+            - model (libModel): 
+                _description_
+            - gene_table (pd.DataFrame): 
+                The table with the genes to add. At least needs the columns
+                '','' and 'ec-code'.
+        """
+
+        # ncbiprotein | locus_tag | ...
+        # work on a copy to ensure input stays the same
+        gene_table = gene_table.copy()
+        # gene_table.drop(columns=['ec-code'],inplace=True)
+
+        # create gps from the table and add them to the model
+        if 'UniProt' in gene_table.columns:
+            for idx,x in gene_table.iterrows():
+                create_gp(model, x['ncbiprotein'], 
+                        locus_tag=x['locus_tag'],
+                        uniprot=(x['UniProt'],True))
+        else:
+            for idx,x in gene_table.iterrows():
+                create_gp(model, x['ncbiprotein'], 
+                        locus_tag=x['locus_tag'])
+
+
+    # @TODO seems very ridgid, better ways to find the ids?
+    def add_gene_reac_associations_from_table(model:libModel,
+                                            reac_table:pd.DataFrame) -> None:
+        """Using a table with at least the columns 'ncbiprotein' 
+        (containing e.g. NCBI protein identifier (lists), should be gene IDs in the model)
+        and 'add_to_GPR' (containing reactions identifier (lists)), add the gene IDs to the 
+        GPRs of the corresponding reactions.
+
+        Args:
+            - model (libModel): 
+                The model loaded with libSBML.
+            - reac_table (pd.DataFrame): 
+                The table containing at least the columns 'ncbiprotein' (gene IDs) and
+                'add_to_GPR' (reaction IDs)
+        """
+
+        model_gene_ids = [_.getId() for _ in model.getPlugin(0).getListOfGeneProducts()]
+
+        # get each unique ncbiprotein vs reaction mapping
+        reac_table = reac_table[['ncbiprotein','add_to_GPR']]
+        reac_table = reac_table.explode('ncbiprotein').explode('add_to_GPR')
+        reac_table.drop_duplicates(inplace=True)
+
+        # add the genes to the corresponding GPRs
+        for idx,row in reac_table.iterrows():
+            # check, if G_+ncbiprotein in model
+            # if yes, add gpr
+            geneid = 'G_'+row['ncbiprotein'].replace('.','_')
+            reacid = 'R_'+row['add_to_GPR']
+            if geneid in model_gene_ids:
+                create_gpr(model.getReaction(reacid),geneid)
+            # else, print warning
+            else:
+                mes = f'Cannot find {geneid} in model. Should be added to {reacid}'
+                warnings.warn(mes,UserWarning)
+
+
+
 
     def fill_model(self, model):
         pass
+
+    # reporting
+    # ---------
 
     def calculate_stats(self):
         pass
@@ -582,7 +665,11 @@ def get_missing_reacs(self):
 # def gff_gene_comp():
 #     pass
 # 
-# 
+#
+
+# ---------------------------------
+# GapFilling with GFF and Swissprot
+# ---------------------------------
 class GeneGapFiller(GapFiller):
 
     GFF_COLS = {'locus_tag':'locus_tag', 

diff --git a/src/refinegems/curation/db_access/db.py b/src/refinegems/curation/db_access/db.py
@@ -11,6 +11,7 @@
 # requirements
 ################################################################################
 
+import cobra
 import numpy as np
 import pandas as pd
 import re
@@ -97,6 +98,9 @@ def compare_ids(id1: str, id2: str) -> bool:
     return similar_ids
 
 
+# BiGG
+# ----
+
 @sleep_and_retry
 @limits(calls=10, period=1)
 def get_reaction_compartment(bigg_id: str) -> str:
@@ -377,6 +381,9 @@ def get_reactants_and_products_dicts(reaction_id: str) -> list[dict]:
     return missing_reacs
 
 
+ # @TODO : name is an issue
+
+
 # NCBI parser
 # -----------
 
@@ -434,6 +441,7 @@ def filter_DIAMOND_blastp_results(blasttsv:str, pid_theshold:float=90.0):
     diamond_results = diamond_results[['query_ID','subject_ID']]
 
     return diamond_results
+
 # Uniprot
 # -------
 
@@ -502,4 +510,110 @@ def get_ec_via_swissprot(fasta:str, db:str, missing_genes:pd.DataFrame,
     #         EC numbers and locus tags
     mapped_res = mapped_res.groupby(['locus_tag','ec-code']).agg({'UniProt': lambda x: x.tolist()}).reset_index()
 
-    return mapped_res
+    return mapped_res
+
+
+# general
+# -------
+
+# @TODO: BioCyc missing
+def parse_reac_str(equation:str, 
+                   type:Literal['BiGG','BioCyc','MetaNetX','KEGG']='MetaNetX') -> tuple[dict,dict,list,bool]:
+    """Parse a reaction string.
+
+    Args:
+        - equation (str): 
+            The equation of a reaction as a string (as saved in the database).
+        - type (Literal['BiGG','BioCyc','MetaNetX','KEGG'], optional): 
+            The name of the database the equation was taken from. 
+            Can be 'BiGG','BioCyc','MetaNetX','KEGG'.
+            Defaults to 'MetaNetX'.
+
+    Returns:
+        tuple: 
+            
+            Tuple of (1) dict, (2) dict, (3) list & (4) bool:
+            
+            1. Dictionary with the reactant IDs and their stoichiometric factors.
+            2. Dictionary with the product IDs and their stoichiometric factors.
+            3. List of compartment IDs or None, if they cannot be extract from the equation.
+            4. True, if the reaction is reversible, else False.
+    """
+
+    products = {}
+    reactants = {}
+    compartments = list()
+    is_product = False
+    reversible = True
+
+    match type:
+        case 'MetaNetX':
+            for s in equation.split(' '):
+                # switch from reactants to products
+                if s == '=':
+                    is_product = True
+                # found stoichiometric factor
+                elif s.isnumeric():
+                    factor = float(s)
+                # skip
+                elif s == '+':
+                    continue
+                # found metabolite
+                else:
+                    # get information from MetaNetX
+                    metabolite, compartment = s.split('@')
+                    compartments.append(compartment)
+
+                    if is_product:
+                        products[metabolite] = factor
+                    else:
+                        reactants[metabolite] = factor
+
+        case 'BiGG':
+            factor = 1.0 # BiGG does not use factor 1 in the quations
+            for s in equation.split(' '):
+                # found factor
+                if s.replace('.','').isdigit():
+                    factor = float(s)
+                # switch from reactants to products
+                elif s == '-->' :
+                    is_product = True
+                    reversible = False
+                elif s == '<->':
+                    is_product = True
+                # skip
+                elif s == '+':
+                    continue
+                # found metabolite
+                else:
+                    compartments.append(s.rsplit('_',1)[1])
+                    if is_product:
+                        products[s] = factor
+                    else:
+                        reactants[s] = factor
+                    factor = 1.0
+
+        case 'KEGG':
+            compartments = None
+            factor = 1.0
+            for s in equation.split(' '):
+                if s.isnumeric():
+                    factor = float(s)
+                elif s == '+':
+                    continue
+                elif s == '<=>': # @TODO are there more options?
+                    is_product = True
+                else:
+                    if is_product:
+                        products[s] = factor
+                    else:
+                        reactants[s] = factor
+                    factor = 1.0
+
+        case 'BioCyc':
+            pass
+
+    return (reactants,products,compartments,reversible)
+
+
+
diff --git a/src/refinegems/curation/db_access/kegg.py b/src/refinegems/curation/db_access/kegg.py
@@ -10,7 +10,7 @@
 Due to the KEGG REST API this is relatively slow (model of size 1500 reactions - 20 min).
 """
 
-__author__ = "Famke Baeuerle"
+__author__ = "Famke Baeuerle and Carolin Brune"
 
 ############################################################################
 # requirements

diff --git a/src/refinegems/developement/__init__.py b/src/refinegems/developement/__init__.py
@@ -0,0 +1,3 @@
+__all__ = ['decorators']
+
+from . import decorators
diff --git a/src/refinegems/developement/decorators.py b/src/refinegems/developement/decorators.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python
+"""This module contains decorator functions.
+"""
+
+__author__ = "Carolin Brune"
+
+################################################################################
+# requirements
+################################################################################
+
+################################################################################
+# variables
+################################################################################
+
+################################################################################
+# functions
+################################################################################
+
+def template(func):
+    """A decorator for template functions.
+    
+    Template functions are not executable and are only for developers.
+    """
+    def wrapper():
+        raise RuntimeError('This function is a template for developers.\nIt cannot be executed.')
+    return wrapper
+
+
+def implement(func):
+    """A decorator for functions, that should be implemented.
+    
+    Used to give a hint to other developers, that this function is not yet implemented, 
+    but should be to make the program executable.
+    """
+    def wrapper():
+        raise NotImplementedError('The current function is just a placeholder and will be implement in the fucture.')
+    return wrapper
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		__all__ = ['decorators']

		from . import decorators