Merge pull request #135 from draeger-lab/hot-fixes-2.0-pre-release

Hot fixes for 2.0.0a0 pre-release
draeger-lab · Sep 1, 2024 · 47f5aa3 · 47f5aa3
2 parents dd50de8 + 25e6990
commit 47f5aa3
Show file tree

Hide file tree

Showing 8 changed files with 1,461 additions and 250 deletions.
diff --git a/dev/gapfill-testing.ipynb b/dev/gapfill-testing.ipynb
diff --git a/dev/tutorial.ipynb b/dev/tutorial.ipynb
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -28,7 +28,7 @@
 author = 'Famke Bäuerle, Gwendolyn O. Döbel and Carolin Brune'
 
 # The full version, including alpha/beta/rc tags
-release = '2.0.0-alpha.0'
+release = '2.0.0-alpha.1'
 
 
 # -- General configuration ---------------------------------------------------

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -2,13 +2,6 @@ Welcome to refineGEMs!
 ======================================
 ``refineGEMs`` is a Python-based toolbox for the curation and analysis of genome-scale metabolic models (GEMS).
 
-.. warning:: 
-    | *Will be deprecated from version 2.0.0 onwards.*
-    | Due to massive restructuring and extension, all functions have been moved into new modules.
-    | If you used any functions from refineGEMs until now and want to use version 2.0.0 in the future, make sure to check your code after the version change.
-    | For the main pipeline in main.py see `SPECIMEN <https://github.com/draeger-lab/SPECIMEN>`__ as it will be deprecated from version 2.0.0 onwards.
-    | Beware that SPECIMEN only runs with refineGEMs 2.0.0 (starting with developmental versions).
-
 Overview
 --------
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -5,7 +5,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "refineGEMs"
-version = "2.0.0-alpha.0"  
+version = "2.0.0-alpha.1"  
 requires-python = ">=3.10, <3.12"
 authors = [
     {name = "Famke Baeuerle", email = "[email protected]"},

diff --git a/src/refinegems/analysis/growth.py b/src/refinegems/analysis/growth.py
@@ -805,7 +805,8 @@ def test_auxotrophies(model:cobraModel, media_list:list[Medium], supplement_list
                 if len(entry) == 0:
                     warn_str = f'Amino acid {a} has no identifier for your chosen namespace {namespace}. Please contact support if you want to add one.'
                     warnings.warn(warn_str)
-                    auxotrophies[a] = m.optimize().objective_value
+                    growth_res = m.optimize()
+                    auxotrophies[a] = growth_res.objective_value if growth_res.status == 'optimal' else 0.0
                 else:
 
                     # create and check IDs for the chosen namespace
@@ -834,7 +835,8 @@ def test_auxotrophies(model:cobraModel, media_list:list[Medium], supplement_list
                             m.reactions.get_by_id(exchange_reac).lower_bound = 0.0
                             m.reactions.get_by_id(exchange_reac).upper_bound = 0.0
                     # and calculate the new objective
-                    auxotrophies[a] = m.optimize().objective_value
+                    growth_res = m.optimize()
+                    auxotrophies[a] = growth_res.objective_value if growth_res.status == 'optimal' else 0.0
 
             # add the current test results to the list of all results
             results[med.name] = auxotrophies

diff --git a/src/refinegems/classes/gapfill.py b/src/refinegems/classes/gapfill.py
@@ -213,7 +213,7 @@ def _map_to_mnx(biocyc_reacs: pd.DataFrame) -> Union[tuple[pd.DataFrame, pd.Data
         # Transform id column to only contain BioCyc/MetaCyc IDs
         mnx2biocyc_reacs['id'] = mnx2biocyc_reacs['id'].str.split(':', n=1).str[-1]
 
-        # Crop table to contain BiGG IDs set per BioCyc ID
+        # Crop table to contain MetaNetX IDs set per BioCyc ID
         mnx_as_list = mnx2biocyc_reacs.groupby('id')['MetaNetX'].apply(set).reset_index(name='MetaNetX')
         mnx2biocyc_reacs.drop('MetaNetX', axis=1, inplace=True)
         mnx2biocyc_reacs = mnx_as_list.merge(mnx2biocyc_reacs, on='id')
@@ -235,11 +235,24 @@ def _map_to_mnx(biocyc_reacs: pd.DataFrame) -> Union[tuple[pd.DataFrame, pd.Data
         # Step 2.1: Clean-up of table containing MetaNetX IDs
         # ---------------------------------------------------
         if not mnx_reacs.empty:
-            # Replace id with MetaNetX ID
-            mnx_reacs.rename(
-                {'id': 'metacyc.reaction', 'MetaNetX': 'id'}, 
-                axis=1, inplace=True
-                )
+            # Rename id to metacyc.reaction for simpler addition to column reference
+            mnx_reacs.rename({'id': 'metacyc.reaction'}, axis=1, inplace=True)
+
+            # Turn MetaNetX column in single value, rename column to id 
+            # & if multiple MetaNetX IDs exist add to column alias
+            mnx_reacs['id'] = mnx_reacs['MetaNetX'].map(list).str.get(0)
+            mnx_reacs['alias'] = mnx_reacs.apply(
+                lambda x: 
+                    x['MetaNetX'].remove(x['id']) if len(x['MetaNetX']) != 1 else None, 
+                    axis=1
+            )
+
+            # Specify origin of identified missing reactions & 
+            # Move BioCyc ID & origin to reference
+            mnx_reacs['origin'] = 'BioCyc'
+            mnx_reacs['reference'] = mnx_reacs[
+                ['metacyc.reaction', 'origin', 'alias']
+                ].to_dict('records')
 
             # Create list of EC codes in column ec-code_x, 
             # Join both ec-code columns into one & Create a set of ec-codes
@@ -249,15 +262,12 @@ def _map_to_mnx(biocyc_reacs: pd.DataFrame) -> Union[tuple[pd.DataFrame, pd.Data
                 ).map(set).map(list)
             mnx_reacs['ec-code'] = (mnx_reacs['ec-code_x'] + mnx_reacs['ec-code_y']).map(set).map(list)
 
-            # Specify origin of identified missing reactions & 
-            # Move BioCyc ID, origin & ec-code to reference
-            mnx_reacs['origin'] = 'BioCyc'
-            mnx_reacs['reference'] = mnx_reacs[['metacyc.reaction', 'origin']].to_dict('records')
-
             # Drop all unnecessary columns
             mnx_reacs.drop(
-                ['origin', 'metacyc.reaction', 'ec-code_x', 'ec-code_y', 'equation'], 
-                axis=1, inplace=True
+                [
+                    'origin', 'metacyc.reaction', 'MetaNetX', 'alias', 'ec-code_x',
+                    'ec-code_y', 'equation'
+                 ], axis=1, inplace=True
                 )
 
             # Replace equation with mnx_equation
@@ -796,8 +806,8 @@ def add_gene_reac_associations_from_table(self,model:libModel,
 
         # get each unique ncbiprotein vs reaction mapping
         reac_table = reac_table[['ncbiprotein','add_to_GPR']]
-        reac_table = reac_table.explode('ncbiprotein').explode('add_to_GPR')
-        reac_table.drop_duplicates(inplace=True)
+        reac_table = reac_table.explode('ncbiprotein')
+        reac_table.drop_duplicates(subset='ncbiprotein', inplace=True)
 
         # add the genes to the corresponding GPRs
         for idx,row in reac_table.iterrows():
@@ -985,6 +995,13 @@ def fill_model(self, model:Union[cobra.Model,libModel],
             libModel: 
                 The gap-filled model.
         """
+
+        # Filter out reactions without ncbiprotein
+        # @DISCUSSION
+        # @TODO Add also reactions without GPR to model
+        self.manual_curation['Reactions no GPR'] = self.missing_reactions[self.missing_reactions['ncbiprotein'].isnull()]
+        self.missing_reactions = self.missing_reactions[~self.missing_reactions['ncbiprotein'].isnull()]
+
         # filter out duplicates genes to avoid duplicates IDs in the model
         # @TODO: Better idea to add duplicate genes
         if len(self.missing_genes) != len(self.missing_genes['ncbiprotein'].unique()):
@@ -1014,6 +1031,7 @@ def fill_model(self, model:Union[cobra.Model,libModel],
         genes_with_reacs_in_model = self.missing_genes[self.missing_genes['ncbiprotein'].isin(ncbiprot_with_reacs_in_model)]
         self._statistics['genes']['added'] = self._statistics['genes']['added'] + len(genes_with_reacs_in_model)
         if len(genes_with_reacs_in_model) > 0:
+
             # add genes as gene products to model
             self.add_genes_from_table(model, genes_with_reacs_in_model)
 

diff --git a/src/refinegems/utility/entities.py b/src/refinegems/utility/entities.py
@@ -1025,6 +1025,9 @@ def build_reaction_mnx(model:cobra.Model, id:str,
                 list: 
                     List of matching reaction IDs (in model).
     """
+
+    # Get origin if not reaction was found missing with MetaNetX
+    origin = references.pop('origin') if 'origin' in references.keys() else None
 
     # ---------------------
     # check, if ID in model
@@ -1121,6 +1124,12 @@ def build_reaction_mnx(model:cobra.Model, id:str,
             # update reactions direction, if MetaCyc has better information
             if db == 'metacyc.reaction' and len(db_matches[db_matches['source'].str.contains('-->')]):
                 new_reac.bounds = (0.0,cobra.Configuration().upper_bound)
+    # get alias annotations for MetaNetX if available
+    if 'alias' in references.keys(): 
+        if references['alias'] != None:
+            references['metanetx.reaction'] = list(references['alias'])
+        del references['alias']
+
     # add additional references from the parameter
     _add_annotations_from_dict_cobra(references,new_reac)
 
@@ -1130,6 +1139,7 @@ def build_reaction_mnx(model:cobra.Model, id:str,
 
     # add notes
     # ---------
+    if origin: new_reac.notes['found with'] = f'refineGEMs GapFiller, {origin}'
     new_reac.notes['created with'] = 'refineGEMs GapFiller, MetaNetX'
 
     # match ID to namespace
@@ -1348,6 +1358,9 @@ def build_reaction_bigg(model:cobra.Model, id:str,
                 list: 
                     List of matching reaction IDs (in model).
     """
+
+    # Get origin if not reaction was found missing with MetaNetX
+    origin = references.pop('origin') if 'origin' in references.keys() else None
 
     # ---------------------
     # check, if ID in model
@@ -1413,11 +1426,17 @@ def build_reaction_bigg(model:cobra.Model, id:str,
     # add infos from BiGG
     new_reac.annotation['bigg.reaction'] = [id]
     _add_annotations_from_bigg_reac_row(bigg_reac_info, new_reac)
+    # get alias annotations for MetaNetX if available
+    if 'alias' in references.keys(): 
+        if references['alias'] != None:
+            references['metanetx.reaction'] = list(references['alias'])
+        del references['alias']
     # add additional references from the parameter
     _add_annotations_from_dict_cobra(references,new_reac)
 
     # add notes
     # ---------
+    if origin: new_reac.notes['found with'] = f'refineGEMs GapFiller, {origin}'
     new_reac.notes['created with'] = 'refineGEMs GapFiller, BiGG'
 
     # match ID to namespace
@@ -1596,6 +1615,7 @@ def old_create_gp(model: libModel, model_id: str, name: str, locus_tag: str, pro
 # @NEW - substitues the one above 
 # --> WARNING: Output is different now
 # @TODO generalise addition of references -> maybe kwargs
+# @TODO: Check if ncbiprotein leads to valid ID -> Otherwise, replace invalid chars with '_'
 def create_gp(model:libModel, protein_id:str, 
               model_id:str=None,
               name:str=None, locus_tag:str=None,
@@ -1634,6 +1654,7 @@ def create_gp(model:libModel, protein_id:str,
     gp.setSBOTerm('SBO:0000243')            # SBOterm
     gp.setMetaId(f'meta_G_{protein_id}')    # Meta ID
     # test for NCBI/RefSeq
+    id_db = None
     if re.fullmatch('^(((AC|AP|NC|NG|NM|NP|NR|NT|NW|WP|XM|XP|XR|YP|ZP)_\d+)|(NZ_[A-Z]{2,4}\d+))(\.\d+)?$', protein_id, re.IGNORECASE):
         id_db = 'REFSEQ'
     elif re.fullmatch('^(\w+\d+(\.\d+)?)|(NP_\d+)$', protein_id, re.IGNORECASE): id_db = 'NCBI'