cleaned

PyEED · Apr 15, 2024 · bcea70a · bcea70a
1 parent a79abd4
commit bcea70a
Show file tree

Hide file tree

Showing 6 changed files with 346 additions and 92 deletions.
diff --git a/pyeed/core/alignment.py b/pyeed/core/alignment.py
@@ -1,8 +1,11 @@
 import sdRDM
+from rich.status import Status, Console
 from tqdm import tqdm
 from itertools import combinations
 from typing import List, Optional, Union, Tuple, TYPE_CHECKING
 from pydantic import Field, validator
+from IPython.display import clear_output
+
 from sdRDM.base.listplus import ListPlus
 from sdRDM.base.utils import forge_signature, IDGenerator
 from Bio.Align import Alignment as BioAlignment
@@ -367,8 +370,16 @@ def from_sequences(
             input_sequences=sequences,
         )
 
-        if aligner is not None:
-            return alignment.align(aligner, **kwargs)
+        # Bind `result` up front: without an aligner the check below would raise NameError.
+        result = None
+        if aligner is not None:
+            with Status("Running ClustalOmega...", console=Console(force_terminal=False)):
+                result = alignment.align(aligner, **kwargs)
+
+        clear_output()
+        if result:
+            print("✅ Alignment completed")
+            return result
 
         return alignment
 

diff --git a/pyeed/core/proteininfo.py b/pyeed/core/proteininfo.py
@@ -1,23 +1,26 @@
-import re
 import os
+import asyncio
 from typing import List, Optional
+from IPython.display import clear_output
+from rich.status import Status, Console
+from concurrent.futures import ThreadPoolExecutor
 import warnings
 from pydantic import Field
 from sdRDM.base.listplus import ListPlus
 from sdRDM.base.utils import forge_signature, IDGenerator
-from Bio.Blast import NCBIWWW, NCBIXML
-
-
-from .dnainfo import DNAInfo
-from .proteinregion import ProteinRegion
-from .abstractsequence import AbstractSequence
-from .site import Site
-from .citation import Citation
-from .span import Span
-from .proteinregiontype import ProteinRegionType
-from .substrate import Substrate
-from .dnaregion import DNARegion
-from .proteinsitetype import ProteinSiteType
+from Bio.Blast import NCBIXML
+
+
+from pyeed.core.dnainfo import DNAInfo
+from pyeed.core.proteinregion import ProteinRegion
+from pyeed.core.abstractsequence import AbstractSequence
+from pyeed.core.site import Site
+from pyeed.core.citation import Citation
+from pyeed.core.span import Span
+from pyeed.core.proteinregiontype import ProteinRegionType
+from pyeed.core.substrate import Substrate
+from pyeed.core.dnaregion import DNARegion
+from pyeed.core.proteinsitetype import ProteinSiteType
 from pyeed.container.abstract_container import Blastp
 
 
@@ -164,10 +167,13 @@ def add_to_substrates(
 
     @classmethod
     def get_id(cls, protein_id: str) -> "ProteinInfo":
-        from pyeed.fetch import NCBIProteinFetcher
+        from pyeed.fetch.proteinfetcher import ProteinFetcher
+        import nest_asyncio
+
+        nest_asyncio.apply()
 
         """
-        This method creates a 'ProteinInfo' object from a given NCBI ID.
+        This method creates a 'ProteinInfo' object from a given protein accession ID.
 
         Args:
             protein_id (str): ID of the protein in NCBI or UniProt database.
@@ -180,89 +186,146 @@ def get_id(cls, protein_id: str) -> "ProteinInfo":
             warnings.warn("For getting multiple sequences by ID use `get_ids` instead.")
             return cls.get_ids(protein_id)
 
-        return NCBIProteinFetcher(protein_id).fetch(cls)[0]
+        sequences = asyncio.run(ProteinFetcher(ids=[protein_id]).fetch(quiet=True))[0]
+        clear_output()
+        return sequences
 
     @classmethod
-    def get_ids(
-        cls, accession_ids: List[str], email: str = None, api_key: str = None
-    ) -> List["ProteinInfo"]:
-        from pyeed.fetch import NCBIProteinFetcher
+    def get_ids(cls, accession_ids: List[str]) -> List["ProteinInfo"]:
+        """Fetch the entries for `accession_ids` by running `ProteinFetcher.fetch`."""
+        from pyeed.fetch.proteinfetcher import ProteinFetcher
+        import nest_asyncio
+        nest_asyncio.apply()  # lets `asyncio.run` below work inside an already running loop
 
-        proteins = NCBIProteinFetcher(accession_ids, email, api_key).fetch(cls)
+        return asyncio.run(
+            ProteinFetcher(ids=accession_ids).fetch(force_terminal=False)
+        )
+
+    @classmethod
+    def from_sequence(
+        cls,
+        sequence: str,
+        exact_match: bool = True,
+        database: str = "nr",
+        matrix: str = "BLOSUM62",
+    ):
+        """
+        Creates a 'ProteinInfo' object from a given protein sequence by
+        performing a BLAST search on NCBI server.
+
+        Args:
+            sequence (str): The protein sequence to search for.
+            exact_match (bool, optional): If True, only exact matches will be considered.
+                If False, approximate matches will also be included. Defaults to True.
+            database (str, optional): The database to search against. Must be one of
+                the supported databases: 'nr', 'swissprot', 'pdb', 'refseq_protein'.
+                Defaults to 'nr'.
+            matrix (str, optional): The substitution matrix to use. Defaults to "BLOSUM62".
 
-        return proteins
+        Returns:
+            ProteinInfo: A 'ProteinInfo' object representing the protein sequence
+                found in the database, or None if no accession was extracted.
+
+        Raises:
+            AssertionError: If the specified database is not supported.
+        """
+        import nest_asyncio
+        from pyeed.fetch.blast import Blast, NCBIDataBase, BlastProgram
+        from pyeed.fetch.proteinfetcher import ProteinFetcher
+
+        nest_asyncio.apply()
+
+        assert (
+            database in NCBIDataBase
+        ), f"Database needs to be one of {NCBIDataBase.__members__.keys()}"
+
+        identity = 1 if exact_match else 0
+
+        blaster = Blast(
+            query=sequence,
+            n_hits=1,
+            identity=identity,
+            matrix=matrix,
+        )
 
-    def ncbi_blastp(
+        with Status("Running BLAST", console=Console(force_terminal=False)) as status:
+            result = asyncio.run(
+                blaster.async_run(
+                    database,  # honour the caller's choice instead of always using NR
+                    BlastProgram.BLASTP.value,
+                )
+            )
+            clear_output()
+
+            accession = blaster.extract_accession(result)
+
+            status.update("Fetching protein data")
+
+            if accession:
+                return asyncio.run(
+                    ProteinFetcher(ids=accession).fetch(force_terminal=False)
+                )[0]
+
+        return None
+
+    def ncbi_blast(
         self,
         n_hits: int,
         e_value: float = 10.0,
-        api_key: str = None,
         database: str = "nr",
+        matrix: str = "BLOSUM62",
+        identity: float = 0.0,
         **kwargs,
     ) -> List["ProteinInfo"]:
-        """Run protein blast for a `ProteinInfo`.
-        Additional keyword arguments can be pass according to the blast [specifications](https://biopython.org/docs/1.75/api/Bio.Blast.NCBIWWW.html).
+        """
+        Runs a BLAST search using the NCBI BLAST service to find similar protein sequences.
 
         Args:
-            n_hits (int): Number of hits to return.
-            e_value (float, optional): E-value threshold. Defaults to 10.0.
-            api_key (str, optional): NCBI API key for sequence retrieval. Defaults to None.
-            database (str, optional): Database to search. Defaults to "nr" (Non Redundant).
-
+            n_hits (int): The number of hits to retrieve.
+            e_value (float, optional): The maximum E-value threshold for reporting hits. Defaults to 10.0.
+            database (str, optional): The database to search against. Defaults to "nr".
+            matrix (str, optional): The substitution matrix to use. Defaults to "BLOSUM62".
+            identity (float, optional): The minimum sequence identity threshold for reporting hits. Defaults to 0.0.
+            **kwargs: Additional keyword arguments.
 
         Returns:
-            List[ProteinInfo]: List of 'ProteinInfo' objects that are the result of the blast search.
-        """
-        from pyeed.fetch import NCBIProteinFetcher
-
-        print("🏃🏼‍♀️ Running PBLAST")
-        print(f"╭── protein name: {self.name}")
-        print(f"├── accession: {self.source_id}")
-        print(f"├── organism: {self.organism.name}")
-        print(f"├── e-value: {e_value}")
-        print(f"╰── max hits: {n_hits}")
-
-        result_handle = NCBIWWW.qblast(
-            "blastp",
-            database,
-            self.sequence,
-            hitlist_size=n_hits,
-            expect=e_value,
-            **kwargs,
-        )
-        blast_record = NCBIXML.read(result_handle)
-
-        accessions = self._get_accessions(blast_record)
-        uniprot_accessions = self._filter_uniprot_accessions(accessions)
-        ncbi_accessions = list(set(accessions) - set(uniprot_accessions))
+            List[ProteinInfo]: A list of ProteinInfo objects representing the similar protein sequences found.
 
-        print(f"🔍 Found {len(ncbi_accessions)} NCBI accessions")
-        print(f"🔍 Found {len(uniprot_accessions)} UniProt accessions")
+        Raises:
+            AssertionError: If the specified database is not supported.
 
-        protein_infos = NCBIProteinFetcher(
-            foreign_id=ncbi_accessions, api_key=api_key
-        ).fetch(ProteinInfo)
-        protein_infos.insert(0, self)
+        Example:
+            protein_info = ProteinInfo()
+            similar_proteins = protein_info.ncbi_blast(n_hits=10, e_value=0.001, database="swissprot")
+        """
 
-        if uniprot_accessions:
-            from pyeed.fetch.uniprotmapper import UniprotFetcher
+        from pyeed.fetch.proteinfetcher import ProteinFetcher
+        from pyeed.fetch.blast import Blast, NCBIDataBase, BlastProgram
+        import nest_asyncio
 
-            uniprot_proteins = UniprotFetcher(foreign_id=uniprot_accessions).fetch()
-            protein_infos.extend(uniprot_proteins)
+        nest_asyncio.apply()
 
-        print("🎉 Done\n")
-        return protein_infos
+        assert database in NCBIDataBase
 
-    def _filter_uniprot_accessions(self, accessions: List[str]) -> List[str]:
-        uniprot_pattern = re.compile(
-            r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
+        program = BlastProgram.BLASTP.value
+        executor = ThreadPoolExecutor(max_workers=1)
+        blaster = Blast(
+            query=self.sequence,
+            n_hits=n_hits,
+            evalue=e_value,
+            matrix=matrix,
+            identity=identity,
         )
 
-        return [
-            uniprot_pattern.match(acc)[0]
-            for acc in accessions
-            if uniprot_pattern.match(acc)
-        ]
+        with Status(
+            "Running BLAST", console=Console(force_terminal=False, force_jupyter=True)
+        ), executor:  # entering the pool as a context manager shuts it down on exit
+            result = asyncio.run(blaster.async_run(database, program, executor))
+            clear_output()
+
+        accessions = blaster.extract_accession(result)
+
+        return asyncio.run(ProteinFetcher(ids=accessions).fetch(force_terminal=False))
 
     def blastp(
         self,
@@ -320,3 +383,10 @@ def from_ncbi(self):
 
     def from_accessions(self):
         raise DeprecationWarning("This method is deprecated. Use `get_ids` instead.")
+
+
+if __name__ == "__main__":
+    # Ad-hoc manual check: resolve one hard-coded sequence via `from_sequence` and print it.
+    seq_string = "MSDRNIRVEPVVGRAVEEQDVEIVERKGLGHPDSLCDGIAEHVSQALARAYIDRVGKVLHYNTDETQLVAGTAAPAFGGGEVVDPIYLLITGRATKEYEGTKIPAETIALRAAREYINETLPFLEFGTDVVVDVKLGEGSGDLQEVFGEDGKQVPMSNDTSFGVGHAPLTETERIVLEAERALNGDYSDDNPAVGQDIKVMGKREGDDIDVTVAVAMVDRYVDDLDGYEAAVAGVREFVADLATDYTDRNVSVHVNTADDYDEGAIYLTTTGTSAEQGDDGSVGRGNRSNGLITPNRSMSMEATSGKNPVNHIGKIYNLLSTEIARTVVDEVDGIREIRIRLLSQIGQPIDKPHVADANLVTEDGIEIADIEDEVEAIIDAELENVTSITERVIDGELTTF"
+    seq = ProteinInfo.from_sequence(seq_string)
+    print(seq)