diff --git a/dev/gapfill-testing.ipynb b/dev/gapfill-testing.ipynb index 6a4361f..9943ece 100644 --- a/dev/gapfill-testing.ipynb +++ b/dev/gapfill-testing.ipynb @@ -246,9 +246,19 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 1, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/doebel/miniconda3/envs/refineGEMs/lib/python3.10/site-packages/pydantic/_internal/_config.py:322: UserWarning: Valid config keys have changed in V2:\n", + "* 'underscore_attrs_are_private' has been removed\n", + " warnings.warn(message, UserWarning)\n" + ] + } + ], "source": [ "from refinegems.classes.gapfill import BioCycGapFiller\n", "from refinegems.utility.io import load_model\n", @@ -273,7 +283,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -297,120 +307,65 @@ " \n", " \n", " \n", - " id\n", - " ncbiprotein\n", - " equation\n", - " ec-code\n", - " via\n", " add_to_GPR\n", - " references\n", - " \n", - " \n", - " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - "Empty DataFrame\n", - "Columns: [id, ncbiprotein, equation, ec-code, via, add_to_GPR, references]\n", - "Index: []" - ] - }, - "execution_count": 20, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gfbc.missing_reacs" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", + " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", + " \n", " \n", " \n", " \n", @@ -423,115 +378,128 @@ " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
ec-codeequationidncbiproteinequationec-codereferenceviaadd_to_GPRreferences
01-ACYLGLYCEROL-3-P-ACYLTRANSFER-RXN[WP_011275497.1]acyl-[acyl-carrier protein] + a 2-lysophosphat...[EC-2.3.1.51]BioCyc[3HAD80][4.2.1.59](3R)-3-hydroxyoctanoyl-[acp] -> (2E)-oct-2-e...4.2.1.59-RXN[WP_011275247.1]None{'Reaction-Direction': 'LEFT-TO-RIGHT'}BioCyc
11.5.1.11-RXN[WP_011275081.1]D-octopine + NAD+ + H2O = L-arginine + pyruv...[EC-1.5.1.11]BioCyc[AMMQLT8][2.1.1.163]S-adenosyl-L-methionine + demethylmenaquinol-8...ADOMET-DMK-METHYLTRANSFER-RXN[WP_011275738.1]None{'Reaction-Direction': nan}BioCyc
21.8.1.4-RXN[WP_037558251.1]a [lipoyl-carrier protein]-N6-[(R)-dihydrolipo...[EC-1.8.1.4]BioCyc[NTP1, ATPM][3.6.1.3, 3.6.1.15]ATP + H2O -> ADP + phosphate + H+ATPASE-RXN[WP_016931380.1]None{'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'}BioCyc
32-HALOACID-DEHALOGENASE-RXN[WP_037559480.1, WP_011274778.1]an (S)-2-halocarboxylate + H2O -> a (2R)-2-h...[EC-3.8.1.10]BioCyc[LDH_D][1.1.1.28](R)-lactate + NAD+ <- pyruvate + NADH + H+DLACTDEHYDROGNAD-RXN[WP_011274818.1]None{'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'}BioCyc
42-HALOACID-DEHALOGENASE-RXN[WP_037559480.1, WP_011274778.1]an (S)-2-halocarboxylate + H2O -> a (2R)-2-h...[EC-3.8.1.10]BioCyc[DHNAOT4][2.5.1.74]all-trans-octaprenyl diphosphate + 1,4-dihydro...DMK-RXN[WP_016931372.1]None{'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'}BioCyc
......
272TRYPTOPHAN--TRNA-LIGASE-RXN[WP_011276231.1]L-tryptophan + tRNATrp + ATP -> AMP + L-tryp...[EC-6.1.1.2]BioCyc233None{'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'}NaN1 MNXM11@MNXD1 + 1 MNXM167418@MNXD1 + 1 MNXM72...{MNXR124075}[WP_011276231.1]{'metacyc.reaction': 'TRYPTOPHAN--TRNA-LIGASE-...MetaNetX
273TYROSINE--TRNA-LIGASE-RXN[WP_011275493.1]tRNATyr + L-tyrosine + ATP -> L-tyrosyl-[tRN...[EC-6.1.1.1]BioCyc234None{'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'}NaN1 MNXM11@MNXD1 + 1 MNXM16655@MNXD1 + 1 MNXM728...{MNXR124080}[WP_011275493.1]{'metacyc.reaction': 'TYROSINE--TRNA-LIGASE-RX...MetaNetX
274UDP-NACMURALGLDAPAALIG-RXN[WP_011275264.1]UDP-<I>N</I>-acetylmuramoyl-L-alanyl-D-glutamy...[EC-6.3.2.10]BioCyc235None{'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'}NaN1 MNXM1104679@MNXD1 + 1 MNXM1446@MNXD1 + 1 MNX...{MNXR146695}[WP_011275264.1]{'metacyc.reaction': 'UDP-NACMURALGLDAPAALIG-R...MetaNetX
275UDPGLCNACEPIM-RXN[WP_033079611.1]UDP-N-acetyl-alpha-D-glucosamine -> UDP-N-ac...[EC-5.1.3.14]BioCyc236None{'Reaction-Direction': 'LEFT-TO-RIGHT'}NaN1 MNXM1101258@MNXD1 = 1 MNXM1104529@MNXD1{MNXR151539}[WP_033079611.1]{'metacyc.reaction': 'UDPGLCNACEPIM-RXN', 'ori...MetaNetX
276VALINE--TRNA-LIGASE-RXN[WP_011275561.1]tRNAVal + L-valine + ATP -> L-valyl-[tRNAVal...[EC-6.1.1.9]BioCyc237None{'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'}NaN1 MNXM11@MNXD1 + 1 MNXM167419@MNXD1 + 1 MNXM72...{MNXR124096}[WP_011275561.1]{'metacyc.reaction': 'VALINE--TRNA-LIGASE-RXN'...MetaNetX
\n", - "

277 rows × 7 columns

\n", + "

238 rows × 7 columns

\n", "
" ], "text/plain": [ - " id ncbiprotein \\\n", - "0 1-ACYLGLYCEROL-3-P-ACYLTRANSFER-RXN [WP_011275497.1] \n", - "1 1.5.1.11-RXN [WP_011275081.1] \n", - "2 1.8.1.4-RXN [WP_037558251.1] \n", - "3 2-HALOACID-DEHALOGENASE-RXN [WP_037559480.1, WP_011274778.1] \n", - "4 2-HALOACID-DEHALOGENASE-RXN [WP_037559480.1, WP_011274778.1] \n", - ".. ... ... \n", - "272 TRYPTOPHAN--TRNA-LIGASE-RXN [WP_011276231.1] \n", - "273 TYROSINE--TRNA-LIGASE-RXN [WP_011275493.1] \n", - "274 UDP-NACMURALGLDAPAALIG-RXN [WP_011275264.1] \n", - "275 UDPGLCNACEPIM-RXN [WP_033079611.1] \n", - "276 VALINE--TRNA-LIGASE-RXN [WP_011275561.1] \n", + " add_to_GPR ec-code \\\n", + "0 [3HAD80] [4.2.1.59] \n", + "1 [AMMQLT8] [2.1.1.163] \n", + "2 [NTP1, ATPM] [3.6.1.3, 3.6.1.15] \n", + "3 [LDH_D] [1.1.1.28] \n", + "4 [DHNAOT4] [2.5.1.74] \n", + ".. ... ... \n", + "233 None NaN \n", + "234 None NaN \n", + "235 None NaN \n", + "236 None NaN \n", + "237 None NaN \n", + "\n", + " equation \\\n", + "0 (3R)-3-hydroxyoctanoyl-[acp] -> (2E)-oct-2-e... \n", + "1 S-adenosyl-L-methionine + demethylmenaquinol-8... \n", + "2 ATP + H2O -> ADP + phosphate + H+ \n", + "3 (R)-lactate + NAD+ <- pyruvate + NADH + H+ \n", + "4 all-trans-octaprenyl diphosphate + 1,4-dihydro... \n", + ".. ... \n", + "233 1 MNXM11@MNXD1 + 1 MNXM167418@MNXD1 + 1 MNXM72... \n", + "234 1 MNXM11@MNXD1 + 1 MNXM16655@MNXD1 + 1 MNXM728... \n", + "235 1 MNXM1104679@MNXD1 + 1 MNXM1446@MNXD1 + 1 MNX... \n", + "236 1 MNXM1101258@MNXD1 = 1 MNXM1104529@MNXD1 \n", + "237 1 MNXM11@MNXD1 + 1 MNXM167419@MNXD1 + 1 MNXM72... \n", "\n", - " equation ec-code via \\\n", - "0 acyl-[acyl-carrier protein] + a 2-lysophosphat... [EC-2.3.1.51] BioCyc \n", - "1 D-octopine + NAD+ + H2O = L-arginine + pyruv... [EC-1.5.1.11] BioCyc \n", - "2 a [lipoyl-carrier protein]-N6-[(R)-dihydrolipo... [EC-1.8.1.4] BioCyc \n", - "3 an (S)-2-halocarboxylate + H2O -> a (2R)-2-h... [EC-3.8.1.10] BioCyc \n", - "4 an (S)-2-halocarboxylate + H2O -> a (2R)-2-h... [EC-3.8.1.10] BioCyc \n", - ".. ... ... ... \n", - "272 L-tryptophan + tRNATrp + ATP -> AMP + L-tryp... [EC-6.1.1.2] BioCyc \n", - "273 tRNATyr + L-tyrosine + ATP -> L-tyrosyl-[tRN... [EC-6.1.1.1] BioCyc \n", - "274 UDP-N-acetylmuramoyl-L-alanyl-D-glutamy... [EC-6.3.2.10] BioCyc \n", - "275 UDP-N-acetyl-alpha-D-glucosamine -> UDP-N-ac... [EC-5.1.3.14] BioCyc \n", - "276 tRNAVal + L-valine + ATP -> L-valyl-[tRNAVal... [EC-6.1.1.9] BioCyc \n", + " id ncbiprotein \\\n", + "0 4.2.1.59-RXN [WP_011275247.1] \n", + "1 ADOMET-DMK-METHYLTRANSFER-RXN [WP_011275738.1] \n", + "2 ATPASE-RXN [WP_016931380.1] \n", + "3 DLACTDEHYDROGNAD-RXN [WP_011274818.1] \n", + "4 DMK-RXN [WP_016931372.1] \n", + ".. ... ... \n", + "233 {MNXR124075} [WP_011276231.1] \n", + "234 {MNXR124080} [WP_011275493.1] \n", + "235 {MNXR146695} [WP_011275264.1] \n", + "236 {MNXR151539} [WP_033079611.1] \n", + "237 {MNXR124096} [WP_011275561.1] \n", "\n", - " add_to_GPR references \n", - "0 None {'Reaction-Direction': 'LEFT-TO-RIGHT'} \n", - "1 None {'Reaction-Direction': nan} \n", - "2 None {'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'} \n", - "3 None {'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'} \n", - "4 None {'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'} \n", - ".. ... ... \n", - "272 None {'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'} \n", - "273 None {'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'} \n", - "274 None {'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'} \n", - "275 None {'Reaction-Direction': 'LEFT-TO-RIGHT'} \n", - "276 None {'Reaction-Direction': 'PHYSIOL-LEFT-TO-RIGHT'} \n", + " reference via \n", + "0 None BioCyc \n", + "1 None BioCyc \n", + "2 None BioCyc \n", + "3 None BioCyc \n", + "4 None BioCyc \n", + ".. ... ... \n", + "233 {'metacyc.reaction': 'TRYPTOPHAN--TRNA-LIGASE-... MetaNetX \n", + "234 {'metacyc.reaction': 'TYROSINE--TRNA-LIGASE-RX... MetaNetX \n", + "235 {'metacyc.reaction': 'UDP-NACMURALGLDAPAALIG-R... MetaNetX \n", + "236 {'metacyc.reaction': 'UDPGLCNACEPIM-RXN', 'ori... MetaNetX \n", + "237 {'metacyc.reaction': 'VALINE--TRNA-LIGASE-RXN'... MetaNetX \n", "\n", - "[277 rows x 7 columns]" + "[238 rows x 7 columns]" ] }, - "execution_count": 21, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "gfbc.manual_curation['BioCyc reactions unmappable']" + "gfbc.missing_reacs" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -555,204 +523,102 @@ " \n", " \n", " \n", - " locus_tag\n", + " add_to_GPR\n", + " ec-code\n", + " equation\n", " id\n", + " ncbiprotein\n", + " reference\n", + " via\n", " \n", " \n", " \n", + " \n", + " 238\n", + " None\n", + " NaN\n", + " (1,4-alpha-D-galacturonosyl)(n+m) -> (1,4-al...\n", + " 4.2.2.2-RXN\n", + " [WP_011275342.1]\n", + " {'ec-code': ['4.2.2.2']}\n", + " BioCyc\n", + " \n", + " \n", + " 239\n", + " None\n", + " NaN\n", + " a [protein]-L-proline (omega = 180) <--> a [...\n", + " PEPTIDYLPROLYL-ISOMERASE-RXN\n", + " [WP_016931161.1, WP_011275422.1, WP_016930839.1]\n", + " {'ec-code': ['5.2.1.8']}\n", + " BioCyc\n", + " \n", " \n", "\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [locus_tag, id]\n", - "Index: []" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "gfbc.manual_curation['BioCyc genes unmappable']" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "gfbc.manual_curation['BioCyc reactions unmappable'].to_csv('./test_files/current_output_missing_reacs.tsv', sep='\\t')" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "gfbc.missing_genes.to_csv('./test_files/missing_genes_gff.tsv', sep='\\t')" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Assemble code for BioCyc Mapping" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "from refinegems.classes.gapfill import map_biocyc_to_reac\n", - "old_result = gfbc.manual_curation['BioCyc reactions unmappable']" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Empty DataFrame\n", - "Columns: [id, BiGG, name, reaction_string]\n", - "Index: []\n" - ] - }, - { - "data": { - "text/plain": [ - "( id BiGG \\\n", - " 0 1-ACYLGLYCEROL-3-P-ACYLTRANSFER-RXN NaN \n", - " 1 1.5.1.11-RXN NaN \n", - " 2 1.8.1.4-RXN NaN \n", - " 3 2-HALOACID-DEHALOGENASE-RXN NaN \n", - " 4 2-OXOBUTYRATE-SYNTHASE-RXN NaN \n", - " .. ... ... \n", - " 223 TRYPTOPHAN--TRNA-LIGASE-RXN NaN \n", - " 224 TYROSINE--TRNA-LIGASE-RXN NaN \n", - " 225 UDP-NACMURALGLDAPAALIG-RXN NaN \n", - " 226 UDPGLCNACEPIM-RXN {UAG2E} \n", - " 227 VALINE--TRNA-LIGASE-RXN NaN \n", - " \n", - " name reaction_string \\\n", - " 0 NaN NaN \n", - " 1 NaN NaN \n", - " 2 NaN NaN \n", - " 3 NaN NaN \n", - " 4 NaN NaN \n", - " .. ... ... \n", - " 223 NaN NaN \n", - " 224 NaN NaN \n", - " 225 NaN NaN \n", - " 226 UDP-N-acetylglucosamine 2-epimerase uacgam_c <-> uacmam_c \n", - " 227 NaN NaN \n", - " \n", - " ncbiprotein \\\n", - " 0 [WP_011275497.1] \n", - " 1 [WP_011275081.1] \n", - " 2 [WP_037558251.1] \n", - " 3 [WP_037559480.1, WP_011274778.1] \n", - " 4 [WP_011275909.1] \n", - " .. ... \n", - " 223 [WP_011276231.1] \n", - " 224 [WP_011275493.1] \n", - " 225 [WP_011275264.1] \n", - " 226 [WP_033079611.1] \n", - " 227 [WP_011275561.1] \n", - " \n", - " equation ec-code via \\\n", - " 0 acyl-[acyl-carrier protein] + a 2-lysophosphat... [EC-2.3.1.51] BioCyc \n", - " 1 D-octopine + NAD+ + H2O = L-arginine + pyruv... [EC-1.5.1.11] BioCyc \n", - " 2 a [lipoyl-carrier protein]-N6-[(R)-dihydrolipo... [EC-1.8.1.4] BioCyc \n", - " 3 an (S)-2-halocarboxylate + H2O -> a (2R)-2-h... [EC-3.8.1.10] BioCyc \n", - " 4 2-oxobutanoate + 2 oxidized ferredoxin [iron-s... [EC-1.2.7.7] BioCyc \n", - " .. ... ... ... \n", - " 223 L-tryptophan + tRNATrp + ATP -> AMP + L-tryp... [EC-6.1.1.2] BioCyc \n", - " 224 tRNATyr + L-tyrosine + ATP -> L-tyrosyl-[tRN... [EC-6.1.1.1] BioCyc \n", - " 225 UDP-N-acetylmuramoyl-L-alanyl-D-glutamy... [EC-6.3.2.10] BioCyc \n", - " 226 UDP-N-acetyl-alpha-D-glucosamine -> UDP-N-ac... [EC-5.1.3.14] BioCyc \n", - " 227 tRNAVal + L-valine + ATP -> L-valyl-[tRNAVal... [EC-6.1.1.9] BioCyc \n", - " \n", - " add_to_GPR references \n", - " 0 None None \n", - " 1 None None \n", - " 2 None None \n", - " 3 None None \n", - " 4 None None \n", - " .. ... ... \n", - " 223 None None \n", - " 224 None None \n", - " 225 None None \n", - " 226 None None \n", - " 227 None None \n", - " \n", - " [228 rows x 10 columns],\n", - " {'mapped2MNX': 0, 'mapped2BiGG': 32, 'remaining_unmapped': 0})" + " add_to_GPR ec-code equation \\\n", + "238 None NaN (1,4-alpha-D-galacturonosyl)(n+m) -> (1,4-al... \n", + "239 None NaN a [protein]-L-proline (omega = 180) <--> a [... \n", + "\n", + " id \\\n", + "238 4.2.2.2-RXN \n", + "239 PEPTIDYLPROLYL-ISOMERASE-RXN \n", + "\n", + " ncbiprotein \\\n", + "238 [WP_011275342.1] \n", + "239 [WP_016931161.1, WP_011275422.1, WP_016930839.1] \n", + "\n", + " reference via \n", + "238 {'ec-code': ['4.2.2.2']} BioCyc \n", + "239 {'ec-code': ['5.2.1.8']} BioCyc " ] }, - "execution_count": 20, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "new_result = map_biocyc_to_reac(old_result)\n", - "new_result" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [], - "source": [ - "new_result[0].to_csv('./test_files/new_missing_reacs.tsv', sep='\\t', index=False)" + "gfbc.manual_curation['BioCyc reactions unmappable']" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Index(['id', 'BiGG', 'name', 'reaction_string', 'ncbiprotein', 'equation',\n", - " 'ec-code', 'via', 'add_to_GPR', 'references'],\n", - " dtype='object')" + "{'genes': {'missing (before)': 235,\n", + " 'duplicates': 0,\n", + " 'added': 0,\n", + " 'missing (after)': 0,\n", + " 'missing (unmappable)': 0},\n", + " 'reactions': {'added (total)': 0,\n", + " 'failed to build': 0,\n", + " 'missing (before)': 240,\n", + " 'add to GPR (BioCyc)': 11,\n", + " 'mapped2MNX': 227,\n", + " 'mapped2BiGG': 0,\n", + " 'remaining_unmapped': 2},\n", + " 'metabolites': {}}" ] }, - "execution_count": 25, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "new_result[0].columns" + "gfbc._statistics" ] }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "from refinegems.utility.io import load_a_table_from_database" - ] - }, - { - "cell_type": "code", - "execution_count": 30, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -776,146 +642,28 @@ " \n", " \n", " \n", - " source\n", + " locus_tag\n", " id\n", - " mnx_equation\n", - " reference\n", " \n", " \n", " \n", - " \n", - " 0\n", - " metacyc.reaction:+-BORNEOL-DEHYDROGENASE-RXN\n", - " MNXR147924\n", - " 1 MNXM10@MNXD1 + 1 MNXM1@MNXD1 + 1 MNXM732948@...\n", - " rheaR:17329\n", - " \n", - " \n", - " 1\n", - " metacyc.reaction:+-NEOMENTHOL-DEHYDROGENASE-RXN\n", - " MNXR107571\n", - " 1 MNXM1107651@MNXD1 + 1 MNXM1@MNXD1 + 1 MNXM73...\n", - " rheaR:23812\n", - " \n", - " \n", - " 2\n", - " metacyc.reaction:+-SABINOL-DEHYDROGENASE-RXN\n", - " MNXR108392\n", - " 1 MNXM10@MNXD1 + 1 MNXM1@MNXD1 + 1 MNXM3738@MN...\n", - " rheaR:18329\n", - " \n", - " \n", - " 3\n", - " metacyc.reaction:--BORNEOL-DEHYDROGENASE-RXN\n", - " MNXR147925\n", - " 1 MNXM10@MNXD1 + 1 MNXM1108054@MNXD1 + 1 MNXM1...\n", - " rheaR:22128\n", - " \n", - " \n", - " 4\n", - " metacyc.reaction:--LIMONENE-3-MONOOXYGENASE-RXN\n", - " MNXR147674\n", - " 1 MNXM2038@MNXD1 + 1 MNXM729100@MNXD1 + 1 WATE...\n", - " metacycR:--LIMONENE-3-MONOOXYGENASE-RXN\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 37135\n", - " metacycR:XYLONATE-DEHYDRATASE-RXN\n", - " MNXR147660\n", - " 1 MNXM1105052@MNXD1 = 1 MNXM727325@MNXD1 + 1 W...\n", - " rheaR:19157\n", - " \n", - " \n", - " 37136\n", - " metacycR:XYLULOKIN-RXN\n", - " MNXR146759\n", - " 1 MNXM186@MNXD1 + 1 MNXM1@MNXD1 + 1 MNXM40333@...\n", - " rheaR:10964\n", - " \n", - " \n", - " 37137\n", - " metacycR:YIAE1-RXN\n", - " MNXR97491\n", - " 1 MNXM1@MNXD1 + 1 MNXM738702@MNXD1 + 1 MNXM936...\n", - " rheaR:27634\n", - " \n", - " \n", - " 37138\n", - " metacycR:ZEATIN-O-BETA-D-XYLOSYLTRANSFERASE-RXN\n", - " MNXR141922\n", - " 1 MNXM1102128@MNXD1 + 1 MNXM1@MNXD1 + 1 MNXM72...\n", - " keggR:R02119\n", - " \n", - " \n", - " 37139\n", - " metacycR:ZEATIN-REDUCTASE-RXN\n", - " MNXR149281\n", - " 1 MNXM1@MNXD1 + 1 MNXM733189@MNXD1 + 1 MNXM738...\n", - " keggR:R05702\n", - " \n", " \n", "\n", - "

37140 rows × 4 columns

\n", "" ], "text/plain": [ - " source id \\\n", - "0 metacyc.reaction:+-BORNEOL-DEHYDROGENASE-RXN MNXR147924 \n", - "1 metacyc.reaction:+-NEOMENTHOL-DEHYDROGENASE-RXN MNXR107571 \n", - "2 metacyc.reaction:+-SABINOL-DEHYDROGENASE-RXN MNXR108392 \n", - "3 metacyc.reaction:--BORNEOL-DEHYDROGENASE-RXN MNXR147925 \n", - "4 metacyc.reaction:--LIMONENE-3-MONOOXYGENASE-RXN MNXR147674 \n", - "... ... ... \n", - "37135 metacycR:XYLONATE-DEHYDRATASE-RXN MNXR147660 \n", - "37136 metacycR:XYLULOKIN-RXN MNXR146759 \n", - "37137 metacycR:YIAE1-RXN MNXR97491 \n", - "37138 metacycR:ZEATIN-O-BETA-D-XYLOSYLTRANSFERASE-RXN MNXR141922 \n", - "37139 metacycR:ZEATIN-REDUCTASE-RXN MNXR149281 \n", - "\n", - " mnx_equation \\\n", - "0 1 MNXM10@MNXD1 + 1 MNXM1@MNXD1 + 1 MNXM732948@... \n", - "1 1 MNXM1107651@MNXD1 + 1 MNXM1@MNXD1 + 1 MNXM73... \n", - "2 1 MNXM10@MNXD1 + 1 MNXM1@MNXD1 + 1 MNXM3738@MN... \n", - "3 1 MNXM10@MNXD1 + 1 MNXM1108054@MNXD1 + 1 MNXM1... \n", - "4 1 MNXM2038@MNXD1 + 1 MNXM729100@MNXD1 + 1 WATE... \n", - "... ... \n", - "37135 1 MNXM1105052@MNXD1 = 1 MNXM727325@MNXD1 + 1 W... \n", - "37136 1 MNXM186@MNXD1 + 1 MNXM1@MNXD1 + 1 MNXM40333@... \n", - "37137 1 MNXM1@MNXD1 + 1 MNXM738702@MNXD1 + 1 MNXM936... \n", - "37138 1 MNXM1102128@MNXD1 + 1 MNXM1@MNXD1 + 1 MNXM72... \n", - "37139 1 MNXM1@MNXD1 + 1 MNXM733189@MNXD1 + 1 MNXM738... \n", - "\n", - " reference \n", - "0 rheaR:17329 \n", - "1 rheaR:23812 \n", - "2 rheaR:18329 \n", - "3 rheaR:22128 \n", - "4 metacycR:--LIMONENE-3-MONOOXYGENASE-RXN \n", - "... ... \n", - "37135 rheaR:19157 \n", - "37136 rheaR:10964 \n", - "37137 rheaR:27634 \n", - "37138 keggR:R02119 \n", - "37139 keggR:R05702 \n", - "\n", - "[37140 rows x 4 columns]" + "Empty DataFrame\n", + "Columns: [locus_tag, id]\n", + "Index: []" ] }, - "execution_count": 30, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "mnx_tbl = load_a_table_from_database('SELECT x.source, x.id, p.mnx_equation, p.reference FROM mnx_reac_xref x INNER JOIN mnx_reac_prop p USING(id) WHERE x.source LIKE \\'%metacyc%\\' OR x.source LIKE \\'%biocyc%\\'')\n", - "mnx_tbl" + "gfbc.manual_curation['BioCyc genes unmappable']" ] }, { diff --git a/src/refinegems/classes/gapfill.py b/src/refinegems/classes/gapfill.py index 322511e..97c8d6b 100644 --- a/src/refinegems/classes/gapfill.py +++ b/src/refinegems/classes/gapfill.py @@ -154,7 +154,7 @@ def map_biocyc_to_reac(biocyc_reacs: pd.DataFrame) -> tuple[pd.DataFrame, dict[s # Drop NaNs in relevant columns biocyc_reacs.dropna(subset=['id', 'ec-code'], inplace=True) - def _map_to_mnx(biocyc_reacs: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame]: + def _map_to_mnx(biocyc_reacs: pd.DataFrame) -> Union[tuple[pd.DataFrame, pd.DataFrame], pd.DataFrame]: """Maps Biocyc IDs in a table to the corresponding MetaNetX IDs & Adds information obtained via the MetaNetX IDs to the resulting table @@ -163,9 +163,17 @@ def _map_to_mnx(biocyc_reacs: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame] Table containing BioCyc IDs Returns: - pd.DataFrame: - Table with BioCyc IDs mapped to MetaNetX IDs & more information - obtained from MetaNetX + Case incomplete mapping: + + tuple: + pd.DataFrame (1) & pd.DataFrame (2) + (1) Table with BioCyc IDs mapped to MetaNetX IDs & more information obtained from MetaNetX + (2) Table with BioCyc IDs without mapping to MNX + + Case complete/no mapping: + + pd.DataFrame: + Table (1)/ Table (2) from case incomplete mapping """ # Step 1: Mapping & Obtain information @@ -175,7 +183,7 @@ def _map_to_mnx(biocyc_reacs: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame] # Get MetaNetX BioCyc mnx2biocyc_reacs = load_a_table_from_database( ''' - SELECT x.source, x.id, p.mnx_equation, p.reference, p.\'ec-code\' + SELECT x.source, x.id, p.mnx_equation, p.\'ec-code\' FROM mnx_reac_xref x INNER JOIN mnx_reac_prop p USING(id) WHERE x.source LIKE \'%metacyc%\' @@ -184,10 +192,13 @@ def _map_to_mnx(biocyc_reacs: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame] ) # Change column names to fit input table - mnx2biocyc_reacs.rename({'id': 'MetaNetX', 'source': 'id'}, axis=1, inplace=True) + mnx2biocyc_reacs.rename( + {'id': 'MetaNetX', 'source': 'id'}, + axis=1, inplace=True + ) # Transform id column to only contain BioCyc/MetaCyc IDs - mnx2biocyc_reacs['id'] = mnx2biocyc_reacs['id'].str.split(':', 1).str[-1] + mnx2biocyc_reacs['id'] = mnx2biocyc_reacs['id'].str.split(':', n=1).str[-1] # Crop table to contain BiGG IDs set per BioCyc ID mnx_as_list = mnx2biocyc_reacs.groupby('id')['MetaNetX'].apply(set).reset_index(name='MetaNetX') @@ -197,26 +208,72 @@ def _map_to_mnx(biocyc_reacs: pd.DataFrame) -> tuple[pd.DataFrame, pd.DataFrame] # Drop duplicates to get the unique BioCyc IDs mnx2biocyc_reacs.drop_duplicates(subset='id', inplace=True) - print(mnx2biocyc_reacs[mnx2biocyc_reacs['id'].duplicated()]) - # Merge mnx2biocyc_reacs with biocyc_reacs to get new information biocyc_reacs = mnx2biocyc_reacs.merge(biocyc_reacs, on='id', how='right') - # Get amount of missing reactions that have a MetaNetX ID - statistics['mapped2MNX'] = len(biocyc_reacs.loc[biocyc_reacs['MetaNetX'].notna(), 'id'].unique().tolist()) - - # @TODO # Split biocyc_reacs into MNX part & non-mapped part + mask = biocyc_reacs['MetaNetX'].notna() + mnx_reacs = biocyc_reacs[mask] + unmapped_reacs = biocyc_reacs[~mask] - # @TODO - # Step 2: Clean-up - # ---------------- - # Move BioCyc ID to references & replace id with MetaNetX ID - - # Replace equation with mnx_equation if BioCyc could be mapped to MetaNetX + # Get amount of missing reactions that have a MetaNetX ID + statistics['mapped2MNX'] = len(mnx_reacs['id'].unique().tolist()) + + # Step 2.1: Clean-up of table containing MetaNetX IDs + # --------------------------------------------------- + if not mnx_reacs.empty: + # Replace id with MetaNetX ID + mnx_reacs.rename( + {'id': 'metacyc.reaction', 'MetaNetX': 'id'}, + axis=1, inplace=True + ) + + # Create list of EC codes in column ec-code_x, + # Join both ec-code columns into one & Create a set of ec-codes + mnx_reacs['ec-code_x'] = mnx_reacs['ec-code_x'].str.split('\s*;\s*') + mnx_reacs['ec-code_x'] = mnx_reacs['ec-code_x'].fillna( + {i: [] for i in mnx_reacs.index} + ).map(set).map(list) + mnx_reacs['ec-code'] = (mnx_reacs['ec-code_x'] + mnx_reacs['ec-code_y']).map(set).map(list) + + # Specify origin of identified missing reactions & + # Move BioCyc ID, origin & ec-code to reference + mnx_reacs['origin'] = 'BioCyc' + mnx_reacs['reference'] = mnx_reacs[['metacyc.reaction', 'origin', 'ec-code']].to_dict('records') + + # Drop all unnecessary columns + mnx_reacs.drop( + ['origin', 'metacyc.reaction', 'ec-code_x', 'ec-code_y', 'ec-code', 'equation'], + axis=1, inplace=True + ) + + # Replace equation with mnx_equation + mnx_reacs.rename({'mnx_equation': 'equation'}, axis=1, inplace=True) + + # Replace BioCyc in via column with MetaNetX + mnx_reacs['via'] = 'MetaNetX' + + # @DISCUSSION: Map remaining via ec-code to MetaNetX before BiGG? + # Step 2.2: Clean-up of table containing unmapped IDs + # --------------------------------------------------- + if not unmapped_reacs.empty: + # Remove unnecessary columns + unmapped_reacs.dropna(axis=1, how='all', inplace=True) + + # Clean-up ec-code column + unmapped_reacs.rename({'ec-code_y': 'ec-code'}, axis=1, inplace=True) + unmapped_reacs['reference'] = unmapped_reacs[['ec-code']].to_dict('records') + unmapped_reacs.drop('ec-code', axis=1, inplace=True) + + # Step 3: Get result(s) + # --------------------- + if not mnx_reacs.empty and not unmapped_reacs.empty: + return (mnx_reacs, unmapped_reacs) + elif not mnx_reacs.empty and unmapped_reacs.empty: + return mnx_reacs + else: + return unmapped_reacs - return mnx_reacs, unmapped_reacs - def _map_to_bigg(biocyc_reacs: pd.DataFrame) -> pd.DataFrame: """Maps Biocyc IDs in a table to the corresponding BiGG IDs & Adds information obtained via the BiGG IDs to the resulting table @@ -226,9 +283,15 @@ def _map_to_bigg(biocyc_reacs: pd.DataFrame) -> pd.DataFrame: Table containing BioCyc IDs Returns: - pd.DataFrame: - Table with BioCyc IDs mapped to BiGG IDs & more information - obtained from BiGG + pd.DataFrame: + Case complete mapping: + Table with BioCyc IDs mapped to BiGG IDs & more information + obtained from BiGG + Case no mapping: + Table with unmapped BioCyc IDs + Case incomplete mapping: + Combined table containing BioCyc IDs mapped to BiGG IDs with + more information obtained from BiGG & unmapped BioCyc IDs """ # Step 1: Mapping & Obtain information @@ -252,30 +315,77 @@ def _map_to_bigg(biocyc_reacs: pd.DataFrame) -> pd.DataFrame: # Remove BiGG IDs for reactions in wrong compartments # Merge bigg2biocyc_reacs with biocyc_reacs to get new information - biocyc_reacs = bigg2biocyc_reacs.merge(biocyc_reacs, on='id') + biocyc_reacs = bigg2biocyc_reacs.merge(biocyc_reacs, on='id', how='right') + + # Split biocyc_reacs into MNX part & non-mapped part + mask = biocyc_reacs['BiGG'].notna() + bigg_reacs = biocyc_reacs[mask] + unmapped_reacs = biocyc_reacs[~mask] # Get amount of missing reactions that have a BiGG ID - statistics['mapped2BiGG'] = len(biocyc_reacs.loc[biocyc_reacs['BiGG'].notna(), 'id'].unique().tolist()) + statistics['mapped2BiGG'] = len(bigg_reacs['id'].unique().tolist()) - # @TODO - # Step 2: Clean-up - # ---------------- - # Move BioCyc ID to references & replace id with BiGG ID - - # Replace equation with reaction_string if BioCyc could be mapped to BiGG + # Step 2.1: Clean-up of table containing BiGG IDs + # ----------------------------------------------- + if not bigg_reacs.empty: + # Rename id to metacyc.reaction for simpler addition to column reference + bigg_reacs.rename({'id': 'metacyc.reaction'}, axis=1, inplace=True) + + # Turn BiGG column in single value, rename column to id + # & if multiple BiGG IDs exist add to column alias + bigg_reacs['id'] = bigg_reacs['BiGG'].map(list).str.get(0) + bigg_reacs['alias'] = bigg_reacs.apply( + lambda x: + x['BiGG'].remove(x['id']) if len(x['BiGG']) != 1 else None, + axis=1 + ) - # @DEPRECATE: Transfer this to model filling! - # Add reactants & products dictionary with stoichiometric values to the reactions table - # biocyc_reacs = add_stoichiometric_values_to_reacs_from_bigg(biocyc_reacs) + # Specify origin of identified missing reactions & + # Move BioCyc ID & origin to reference + bigg_reacs['origin'] = 'BioCyc' + bigg_reacs['reference'] = bigg_reacs[ + ['metacyc.reaction', 'origin', 'alias', 'ec-code'] + ].to_dict('records') + + # Drop all unnecessary columns + bigg_reacs.drop( + [ + 'origin', 'metacyc.reaction', 'equation', + 'BiGG', 'alias', 'ec-code' + ], axis=1, inplace=True + ) + + # Replace equation with reaction_string if BioCyc could be mapped to BiGG + bigg_reacs.rename({'reaction_string': 'equation'}, axis=1, inplace=True) + + # Replace BioCyc in via column with BiGG + bigg_reacs['via'] = 'BiGG' + + # Step 2.2: Clean-up of table containing unmapped IDs + # --------------------------------------------------- + if not unmapped_reacs.empty: + # Remove unnecessary columns + unmapped_reacs.dropna(axis=1, how='all', inplace=True) + + # Step 3: Get result(s) + # --------------------- + if not bigg_reacs.empty and not unmapped_reacs.empty: + return pd.concat([bigg_reacs, unmapped_reacs], sort=True, ignore_index=True) + elif not bigg_reacs.empty and unmapped_reacs.empty: + return bigg_reacs + else: + return unmapped_reacs - return biocyc_reacs + mapped2MNX, unmapped_reacs = _map_to_mnx(biocyc_reacs) + if not unmapped_reacs.empty: + mapped2BiGG = _map_to_bigg(unmapped_reacs) - mapped2MNX = _map_to_mnx(biocyc_reacs) - # mapped2BiGG = _map_to_bigg(mapped2MNX[mapped2MNX['via'] == 'BioCyc']) + missing_bc_reacs = ( + pd.concat([mapped2MNX, mapped2BiGG], sort=True, ignore_index=True) if not unmapped_reacs.empty + else mapped2MNX + ) - # @NOTE - # @TODO - missing_bc_reacs = mapped2MNX + statistics['remaining_unmapped'] = len(unmapped_reacs['id'].unique().tolist()) return missing_bc_reacs, statistics @@ -505,7 +615,7 @@ def find_missing_reacs(self,model): # finding the gaps # ---------------- - + # @TODO: Add handling of ec-code as list & ec-code as entry in reference dict def _find_reac_in_model(self, model: cobra.Model, eccode:str, id:str, idtype:Literal['MetaNetX','KEGG','BiGG', 'BioCyc'], include_ec_match:bool=False) -> Union[None, list]: @@ -1279,8 +1389,11 @@ def find_missing_reacs(self, model: cobra.Model): self.biocyc_rxn_tbl, on='id' ) - # Turn ec-code entries with '//' into lists - self.missing_reacs['ec-code'] = self.missing_reacs['ec-code'].str.split('\s*//\s*') + # Turn ec-code entries with '//' into lists, remove prefix 'EC-' & get unique ec=-odes + self.missing_reacs['ec-code'] = self.missing_reacs['ec-code'].str.replace('EC-', '').str.split('\s*//\s*') + self.missing_reacs['ec-code'] = self.missing_reacs['ec-code'].fillna( + {i: [] for i in self.missing_reacs.index} + ).map(set).map(list) # Add 'G_spontaneous' as gene product if marked as spontaneous & # drop is_spontaneous column @@ -1308,32 +1421,37 @@ def find_missing_reacs(self, model: cobra.Model): self._find_reac_in_model(model,x['ec-code'],x['id'],x['via']), axis=1 ) - # Add column 'references' - self.missing_reacs['references'] = None + # Add column 'reference' + self.missing_reacs['reference'] = None # @TODO # Step 4: Map missing reactions without entries in column 'add_to_GPR' # to other databases to get a parsable reaction equation # -------------------------------------------------------------------- - # Map to MetaNetX, then to BiGG - mapped_reacs = None # map_biocyc_to_reac(self.missing_reacs[self.missing_reacs['add_to_GPR'].isna()]) + # Map to MetaNetX, then to BiGG & Merge all results tables into one + mask = self.missing_reacs['add_to_GPR'].isna() + mapped_reacs, statistics = map_biocyc_to_reac(self.missing_reacs[mask]) + + # Get amount of missing_reacs with add_to_GPR + add_to_GPR_before_mapping = self.missing_reacs[~mask] + self._statistics['reactions']['add to GPR (BioCyc)'] = len(add_to_GPR_before_mapping['id'].unique().tolist()) + # Get statistics from mapping + self._statistics['reactions'].update(statistics) # Filter reacs for already in model - ''' mapped_reacs['add_to_GPR'] = mapped_reacs.apply( lambda x: - self._find_reac_in_model(model,x['ec-code'],x['id'],x['via']), axis=1 + self._find_reac_in_model(model,x['reference'],x['id'],x['via']), axis=1 ) - ''' - # Merge self.missing_reacs with the mapped_reacs - # pd.concat(self.missing_reacs[self.missing_reacs['add_to_GPR'].isna()], mapped_reacs) + # Merge self.missing_reacs with add_to_gpr with the mapped_reacs + self.missing_reacs = pd.concat([add_to_GPR_before_mapping, mapped_reacs], sort=True, ignore_index=True) # Step 5: Get results # ------------------- # Split missing reactios based on entries in 'via' & 'add_to_GPR' - mask = (self.missing_reacs['via'] == 'BioCyc') | (self.missing_reacs['add_to_GPR'].isna()) + mask = (self.missing_reacs['via'] == 'BioCyc') & (self.missing_reacs['add_to_GPR'].isna()) # DataFrame with unmappable BioCyc IDs & No entries in 'add_to_GPR' self.manual_curation['BioCyc reactions unmappable'] = self.missing_reacs[mask]