Skip to content

Commit

Permalink
Update gapfill #126 #52
Browse files Browse the repository at this point in the history
Added functions for adding reac / metabs per database id
  • Loading branch information
cb-Hades committed Aug 9, 2024
1 parent ad9f3e6 commit 1bbfdd3
Show file tree
Hide file tree
Showing 8 changed files with 1,813 additions and 1,150 deletions.
1,408 changes: 296 additions & 1,112 deletions dev/gapfill-testing.ipynb

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion src/refinegems/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
__all__ = ['analysis','classes','curation','utility', 'cmd_access']
__all__ = ['analysis','classes','curation','decorators','utility', 'cmd_access']

from . import analysis
from . import classes
from . import curation
from . import decorators
from . import utility
from . import cmd_access
107 changes: 97 additions & 10 deletions src/refinegems/classes/gapfill.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,21 +10,22 @@
from abc import ABC, abstractmethod

import cobra
import pandas as pd
import io
import numpy as np
import pandas as pd
import re
import warnings

from bioservices.kegg import KEGG
from libsbml import Model as libModel
from typing import Literal, Union

from libsbml import Model as libModel
from bioservices.kegg import KEGG
import io
import re
from tqdm import tqdm
tqdm.pandas()

from ..curation.db_access.db import get_ec_from_ncbi, get_ec_via_swissprot
from ..utility.io import load_a_table_from_database, parse_fasta_headers, parse_gff_for_cds
from ..utility.entities import get_model_reacs_or_metabs
from ..utility.entities import get_model_reacs_or_metabs, create_gp, create_gpr
from ..curation.db_access.kegg import parse_KEGG_gene, parse_KEGG_ec

############################################################################
Expand Down Expand Up @@ -145,6 +146,9 @@ def __init__(self) -> None:
#@abstractmethod
#def load_gene_list(self):
# pass

# abstract methods
# ----------------

@abstractmethod
def get_missing_genes(self, model):
Expand All @@ -153,6 +157,9 @@ def get_missing_genes(self, model):
@abstractmethod
def get_missing_reacs(self,model):
pass

# finding the gaps
# ----------------

def _find_reac_in_model(self, model: cobra.Model, eccode:str, id:str,
idtype:Literal['MetaNetX','KEGG','BiGG', 'BioCyc'],
Expand Down Expand Up @@ -190,12 +197,88 @@ def _find_reac_in_model(self, model: cobra.Model, eccode:str, id:str,

return None

# @abstractmethod
# def run(self,model):
# pass

# actual "Filling" part
# ---------------------

# @TODO logging? or remove self?
def add_genes_from_table(self,model:libModel, gene_table:pd.DataFrame) -> None:
"""Create new GeneProduct for a table of genes in the format:
| ncbiprotein | locus_tag | UniProt | ... |
The dots symbolise additional columns, that can be passed to the function,
but will not be used by it. The other columns, except UniProt, are required.
Args:
- model (libModel):
_description_
- gene_table (pd.DataFrame):
The table with the genes to add. At least needs the columns
'','' and 'ec-code'.
"""

# ncbiprotein | locus_tag | ...
# work on a copy to ensure input stays the same
gene_table = gene_table.copy()
# gene_table.drop(columns=['ec-code'],inplace=True)

# create gps from the table and add them to the model
if 'UniProt' in gene_table.columns:
for idx,x in gene_table.iterrows():
create_gp(model, x['ncbiprotein'],
locus_tag=x['locus_tag'],
uniprot=(x['UniProt'],True))
else:
for idx,x in gene_table.iterrows():
create_gp(model, x['ncbiprotein'],
locus_tag=x['locus_tag'])


# @TODO seems very ridgid, better ways to find the ids?
def add_gene_reac_associations_from_table(model:libModel,
reac_table:pd.DataFrame) -> None:
"""Using a table with at least the columns 'ncbiprotein'
(containing e.g. NCBI protein identifier (lists), should be gene IDs in the model)
and 'add_to_GPR' (containing reactions identifier (lists)), add the gene IDs to the
GPRs of the corresponding reactions.
Args:
- model (libModel):
The model loaded with libSBML.
- reac_table (pd.DataFrame):
The table containing at least the columns 'ncbiprotein' (gene IDs) and
'add_to_GPR' (reaction IDs)
"""

model_gene_ids = [_.getId() for _ in model.getPlugin(0).getListOfGeneProducts()]

# get each unique ncbiprotein vs reaction mapping
reac_table = reac_table[['ncbiprotein','add_to_GPR']]
reac_table = reac_table.explode('ncbiprotein').explode('add_to_GPR')
reac_table.drop_duplicates(inplace=True)

# add the genes to the corresponding GPRs
for idx,row in reac_table.iterrows():
# check, if G_+ncbiprotein in model
# if yes, add gpr
geneid = 'G_'+row['ncbiprotein'].replace('.','_')
reacid = 'R_'+row['add_to_GPR']
if geneid in model_gene_ids:
create_gpr(model.getReaction(reacid),geneid)
# else, print warning
else:
mes = f'Cannot find {geneid} in model. Should be added to {reacid}'
warnings.warn(mes,UserWarning)




def fill_model(self, model):
pass

# reporting
# ---------

def calculate_stats(self):
pass
Expand Down Expand Up @@ -582,7 +665,11 @@ def get_missing_reacs(self):
# def gff_gene_comp():
# pass
#
#
#

# ---------------------------------
# GapFilling with GFF and Swissprot
# ---------------------------------
class GeneGapFiller(GapFiller):

GFF_COLS = {'locus_tag':'locus_tag',
Expand Down
116 changes: 115 additions & 1 deletion src/refinegems/curation/db_access/db.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
# requirements
################################################################################

import cobra
import numpy as np
import pandas as pd
import re
Expand Down Expand Up @@ -97,6 +98,9 @@ def compare_ids(id1: str, id2: str) -> bool:
return similar_ids


# BiGG
# ----

@sleep_and_retry
@limits(calls=10, period=1)
def get_reaction_compartment(bigg_id: str) -> str:
Expand Down Expand Up @@ -377,6 +381,9 @@ def get_reactants_and_products_dicts(reaction_id: str) -> list[dict]:
return missing_reacs


# @TODO : name is an issue


# NCBI parser
# -----------

Expand Down Expand Up @@ -434,6 +441,7 @@ def filter_DIAMOND_blastp_results(blasttsv:str, pid_theshold:float=90.0):
diamond_results = diamond_results[['query_ID','subject_ID']]

return diamond_results

# Uniprot
# -------

Expand Down Expand Up @@ -502,4 +510,110 @@ def get_ec_via_swissprot(fasta:str, db:str, missing_genes:pd.DataFrame,
# EC numbers and locus tags
mapped_res = mapped_res.groupby(['locus_tag','ec-code']).agg({'UniProt': lambda x: x.tolist()}).reset_index()

return mapped_res
return mapped_res


# general
# -------

# @TODO: BioCyc missing
def parse_reac_str(equation:str,
type:Literal['BiGG','BioCyc','MetaNetX','KEGG']='MetaNetX') -> tuple[dict,dict,list,bool]:
"""Parse a reaction string.
Args:
- equation (str):
The equation of a reaction as a string (as saved in the database).
- type (Literal['BiGG','BioCyc','MetaNetX','KEGG'], optional):
The name of the database the equation was taken from.
Can be 'BiGG','BioCyc','MetaNetX','KEGG'.
Defaults to 'MetaNetX'.
Returns:
tuple:
Tuple of (1) dict, (2) dict, (3) list & (4) bool:
1. Dictionary with the reactant IDs and their stoichiometric factors.
2. Dictionary with the product IDs and their stoichiometric factors.
3. List of compartment IDs or None, if they cannot be extract from the equation.
4. True, if the reaction is reversible, else False.
"""

products = {}
reactants = {}
compartments = list()
is_product = False
reversible = True

match type:
case 'MetaNetX':
for s in equation.split(' '):
# switch from reactants to products
if s == '=':
is_product = True
# found stoichiometric factor
elif s.isnumeric():
factor = float(s)
# skip
elif s == '+':
continue
# found metabolite
else:
# get information from MetaNetX
metabolite, compartment = s.split('@')
compartments.append(compartment)

if is_product:
products[metabolite] = factor
else:
reactants[metabolite] = factor

case 'BiGG':
factor = 1.0 # BiGG does not use factor 1 in the quations
for s in equation.split(' '):
# found factor
if s.replace('.','').isdigit():
factor = float(s)
# switch from reactants to products
elif s == '-->' :
is_product = True
reversible = False
elif s == '<->':
is_product = True
# skip
elif s == '+':
continue
# found metabolite
else:
compartments.append(s.rsplit('_',1)[1])
if is_product:
products[s] = factor
else:
reactants[s] = factor
factor = 1.0

case 'KEGG':
compartments = None
factor = 1.0
for s in equation.split(' '):
if s.isnumeric():
factor = float(s)
elif s == '+':
continue
elif s == '<=>': # @TODO are there more options?
is_product = True
else:
if is_product:
products[s] = factor
else:
reactants[s] = factor
factor = 1.0

case 'BioCyc':
pass

return (reactants,products,compartments,reversible)



2 changes: 1 addition & 1 deletion src/refinegems/curation/db_access/kegg.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
Due to the KEGG REST API this is relatively slow (model of size 1500 reactions - 20 min).
"""

__author__ = "Famke Baeuerle"
__author__ = "Famke Baeuerle and Carolin Brune"

############################################################################
# requirements
Expand Down
3 changes: 3 additions & 0 deletions src/refinegems/developement/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
__all__ = ['decorators']

from . import decorators
37 changes: 37 additions & 0 deletions src/refinegems/developement/decorators.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
#!/usr/bin/env python
"""This module contains decorator functions.
"""

__author__ = "Carolin Brune"

################################################################################
# requirements
################################################################################

################################################################################
# variables
################################################################################

################################################################################
# functions
################################################################################

def template(func):
"""A decorator for template functions.
Template functions are not executable and are only for developers.
"""
def wrapper():
raise RuntimeError('This function is a template for developers.\nIt cannot be executed.')
return wrapper


def implement(func):
"""A decorator for functions, that should be implemented.
Used to give a hint to other developers, that this function is not yet implemented,
but should be to make the program executable.
"""
def wrapper():
raise NotImplementedError('The current function is just a placeholder and will be implement in the fucture.')
return wrapper
Loading

0 comments on commit 1bbfdd3

Please sign in to comment.