Dna fetcher (#90)

* added dna fetcher * fixed testing * added ncbi blastn running dna results * latest update * added parallel test * max runnning error * all in working order * updated doc * added missing import --------- Co-authored-by: Niklas Abraham - INFlux <[email protected]> Co-authored-by: Max Häußler <[email protected]>
PyEED · Aug 14, 2024 · da02f48 · da02f48
1 parent 4610e89
commit da02f48
Show file tree

Hide file tree

Showing 10 changed files with 577 additions and 30 deletions.
diff --git a/docs/quick_start/basics.md b/docs/quick_start/basics.md
@@ -37,14 +37,14 @@ Besides adding sequence information manually, PyEED also allows searching for se
 === "DNA"
 
     ``` py
-    # Not implemented
+    dna_record = DNARecord.get_id('AF188200.1')
     ```
 
 ## ⬇️ Save a sequence
 
 ### To file
 
-The sequence can be stored in a `FASTA`, `JSON`, `YAML`, or `XML` file format. Therefore, the respective method can be used.
+The sequence can be stored in a `FASTA`, `JSON`, `YAML`, or `XML` file format. Therefore, the respective method can be used. The methods below are written for protein, but can be used the same for `dna_record`
 The file path is passed as an argument to the method.
 
 === "JSON"

diff --git a/docs/quick_start/blast.md b/docs/quick_start/blast.md
@@ -9,15 +9,36 @@ NCBI offers a web interface for blasting. With PyEED this can be programmaticall
 - matrix (str): The matrix to use for the search. The default is `BLOSUM62`.
 - identity (float): The minimum identity percentage for the search. The default is `0.0`.
 
-``` py
-from pyeed.core import ProteinRecord
 
-# Create a ProteinInfo object
-protein = ProteinRecord.get_id("UCS38941.1")
 
-# Perform a BLAST search
-blast_results = protein.ncbi_blast()
-```
-!!! info "NCBI BLAST performance"
+=== "Protein"
+
+    ``` py
+    from pyeed.core import ProteinRecord
+
+    # Create a ProteinInfo object
+    protein = ProteinRecord.get_id("UCS38941.1")
+
+    # Perform a BLAST search
+    blast_results = protein.ncbi_blast()
+    ```
+    !!! info "NCBI BLAST performance"
+
+        Due to server-side limitations of NCBI, BLAST searches might be slowed down or even be blocked, if multiple searches are performed in a short time.
+
+=== "DNA"
+
+    ``` py 
+    from pyeed.core import DNARecord
+
+    # Create a dna record object
+    dna_record = DNARecord.get_id('AF188200.1')
+
+    # Perform the blast search
+    similar_dna_records = dna_record.ncbi_blast(n_hits=100, db="nt")
+
+    ```
+    !!! info "NCBI BLAST performance"
+
+            Due to server-side limitations of NCBI, BLAST searches might be slowed down or even be blocked, if multiple searches are performed in a short time.
 
-    Due to server-side limitations of NCBI, BLAST searches might be slowed down or even be blocked, if multiple searches are performed in a short time.
diff --git a/pyeed/core/dnarecord.py b/pyeed/core/dnarecord.py
@@ -1,13 +1,23 @@
-from typing import Dict, Optional
+from concurrent.futures import ThreadPoolExecutor
+from typing import Dict, Optional, List
 from uuid import uuid4
 
 from lxml.etree import _Element
 from pydantic import PrivateAttr, model_validator
 from pydantic_xml import attr, element
+from rich.status import Status
+from rich.console import Console
 from sdRDM.base.listplus import ListPlus
+from IPython.display import clear_output
 from sdRDM.tools.utils import elem2dict
+import asyncio
+import warnings
 
-from .sequencerecord import SequenceRecord
+from pyeed.fetch.blast import Blast, BlastProgram
+from pyeed.fetch.dnafetcher import DNAFetcher
+
+
+from pyeed.core.sequencerecord import SequenceRecord
 
 
 class DNARecord(
@@ -48,3 +58,118 @@ def _parse_raw_xml_data(self):
                 self._raw_xml_data[attr] = elem2dict(value)
 
         return self
+
+
+    @classmethod
+    def get_id(cls, dna_id: str) -> "DNARecord":
+        """
+        This method creates a 'DNARecord' object from a given DNA accession ID.
+
+        Args:
+            dna_id (str): ID of the DNA in NCBI database.
+
+        Returns:
+            DNARecord: 'DNARecord' with information of the corresponding dna_id.
+        """
+
+        import nest_asyncio
+
+        nest_asyncio.apply()
+
+        if isinstance(dna_id, list) and all(isinstance(x, str) for x in dna_id):
+            warnings.warn("For getting multiple sequences by ID use `get_ids` instead.")
+            return cls.get_ids(dna_id)
+
+        sequences = asyncio.run(DNAFetcher(ids=[dna_id]).fetch(quiet=False))[0]
+        clear_output()
+        return sequences
+
+    @classmethod
+    def get_ids(cls, accession_ids: List[str]) -> List["DNARecord"]:
+        """Creates a list of 'DNARecord' objects from a list of dna accession IDs.
+
+        Returns:
+            List[DNARecord]: A list of 'DNARecord' objects representing the dna sequences found.
+        """
+
+        import nest_asyncio
+
+        nest_asyncio.apply()
+
+        return asyncio.run(
+            DNAFetcher(ids=accession_ids).fetch(force_terminal=False)
+        )
+
+    def ncbi_blast(
+        self,
+        n_hits: int,
+        e_value: float = 10.0,
+        db: str = "nt",
+        identity: float = 0.0,
+        **kwargs,
+    ) -> List["DNARecord"]:
+        """
+        Runs a BLAST search using the NCBI BLAST service to find similar dna sequences.
+
+        Args:
+            n_hits (int): The number of hits to retrieve.
+            e_value (float, optional): The maximum E-value threshold for reporting hits. Defaults to 10.0.
+            db (str, optional): The database to search against. Defaults to "nt".
+            match/mismatch (int, optional): Match/mismatch score. Defaults to 1/-2.
+            gap_cost (int, optional): Cost to open a gap. Default is 0 and means linear
+            identity (float, optional): The minimum sequence identity threshold for reporting hits. Defaults to 0.0.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            List[DNARecord]: A list of DNARecord objects representing the similar dna sequences found.
+
+        Raises:
+            AssertionError: If the specified database is not supported.
+
+        Example:
+            dna_info = DNARecord()
+            similar_proteins = dna_info.ncbi_blast(n_hits=10, e_value=0.001, database="nt")
+        """
+
+        import nest_asyncio
+
+        from pyeed.fetch.blast import NCBIDataBase
+
+        nest_asyncio.apply()
+
+        assert (
+            db in NCBIDataBase
+        ), f"Database needs to be one of {NCBIDataBase.__members__.keys()}"
+
+        program = BlastProgram.BLASTN.value
+        executor = ThreadPoolExecutor(max_workers=1)
+        blaster = Blast(
+            query=self.sequence,
+            n_hits=n_hits,
+            evalue=e_value,
+            identity=identity,
+        )
+
+
+        with Status(
+            "Running BLAST", console=Console(force_terminal=False, force_jupyter=False)
+        ):
+             result = asyncio.run(blaster.async_run(db, program, executor))
+             clear_output()
+
+        # result = blaster.run(program, db)
+
+
+        accessions = blaster.extract_accession(result)
+
+        return self.get_ids(accessions)
+
+
+if __name__ == "__main__":
+    dna_record = DNARecord.get_id('AF188200.1')
+
+    print(f"DNA Record: {dna_record}")
+
+    print('Running blast...')
+    similar_dna_records = dna_record.ncbi_blast(n_hits=100, db="nt")
+    print(similar_dna_records)
diff --git a/pyeed/core/proteinrecord.py b/pyeed/core/proteinrecord.py
@@ -335,10 +335,15 @@ def ncbi_blast(
     #     return protein_infos
 
     def get_dna(self):
-        if not self.coding_sequence_ref:
-            return
+        try:
+            if not self.coding_sequence:
+                return
+
+            return DNARecord.get_id(self.coding_sequence[0].id)
 
-        return DNARecord.from_ncbi(self.coding_sequence_ref.id)
+        except Exception as e:
+            print('The DNA sequence could not be retrieved. The error is: ', e)
+            return
 
     def _nblast(sequence: str, n_hits: int = None) -> List["ProteinRecord"]:
         # blast_record = NCBIXML.read(result_handle)

diff --git a/pyeed/fetch/blast.py b/pyeed/fetch/blast.py
@@ -38,6 +38,7 @@ class NCBIDataBase(BaseEnum):
     UNIPROTKB = "swissprot"
     PDB = "pdb"
     REFSEQ = "refseq_protein"
+    NT = "nt"
 
 
 class SubstitutionMatrix(BaseEnum):
@@ -84,14 +85,26 @@ def run(self, program: str, ncbi_db: str) -> io.StringIO:
             ncbi_db in NCBIDataBase
         ), f"Invalid database: {ncbi_db}, valid databases: {NCBIDataBase}"
 
-        return NCBIWWW.qblast(
-            program,
-            ncbi_db,
-            self.query,
-            expect=self.evalue,
-            matrix_name=self.matrix,
-            hitlist_size=self.n_hits,
-        )
+        if program == BlastProgram.BLASTP.value:
+
+            return NCBIWWW.qblast(
+                program,
+                ncbi_db,
+                self.query,
+                expect=self.evalue,
+                matrix_name=self.matrix,
+                hitlist_size=self.n_hits,
+            )
+
+        elif program == BlastProgram.BLASTN.value:
+
+            return NCBIWWW.qblast(
+                program=program,
+                database=ncbi_db,
+                sequence=self.query,
+                expect=self.evalue,
+                hitlist_size=self.n_hits,
+            )
 
     async def async_run(
         self,
@@ -146,12 +159,9 @@ async def main():
         with Status("Running BLAST"):
             result = await blast_task
 
-        parsed_result = blast.parse(result)
-        print(parsed_result)
-
         executor.shutdown()
 
-        return parsed_result
+        return result
 
     results = asyncio.run(main())
 

diff --git a/pyeed/fetch/dnafetcher.py b/pyeed/fetch/dnafetcher.py
@@ -0,0 +1,92 @@
+import asyncio
+import json
+import logging
+from typing import List
+
+import nest_asyncio
+from rich.console import Console
+from rich.progress import Progress
+
+from pyeed.fetch.requester import AsyncRequester, AsyncParamRequester
+
+from .ncbidnamapper import NCBIDNAMapper
+
+LOGGER = logging.getLogger(__name__)
+
+
+class DNAFetcher:
+    def __init__(self, ids: List[str]):
+        self.ids = ids
+        nest_asyncio.apply()
+
+    async def fetch(self, **console_kwargs):
+        """
+        Fetches DNA data from various databases based on the provided IDs.
+
+        Parameters:
+            force_terminal (bool): Whether to force the use of a terminal
+            for progress tracking.
+
+        Returns:
+            List[dnarecord]: A list of dnaRecord objects containing the fetched dna data.
+
+        Raises:
+            Exception: If there is an error during the fetching process.
+
+        """
+        # right now in the first batch version we just fetch from NCBI
+        param_requester = None
+
+        with Progress(
+            console=Console(**console_kwargs),
+        ) as progress:
+            requesters: List[AsyncRequester] = []
+
+            # 
+            task_id = progress.add_task(
+                f"Requesting sequences from NCBI...", total=len(self.ids)
+            )
+            requesters.append(
+                AsyncRequester(
+                    ids=self.ids,
+                    url="https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&retmode=text&rettype=genbank&id=",
+                    task_id=task_id,
+                    progress=progress,
+                    batch_size=10,
+                    rate_limit=2,
+                    n_concurrent=5,
+                )
+            )
+
+
+            responses = await asyncio.gather(
+                *[requester.make_request() for requester in requesters]
+            )
+
+
+            # in case of multiple databases, identify the source of the data
+            ncbi_responses, uniprot_response = self.identify_data_source(responses)
+
+            # map data to objects
+            ncbi_entries = NCBIDNAMapper().map(responses=ncbi_responses)
+
+            return ncbi_entries
+
+
+    def identify_data_source(self, responses: List[str]) -> tuple:
+        """
+        Identifies the source of the data based on the response content.
+        """
+        ncbi_responses = []
+        uniprot_response = []
+
+        for response in responses:
+            if response[0].startswith("LOCUS"):
+                ncbi_responses.append(response)
+            else:
+                uniprot_response.append(response)
+
+        return ncbi_responses, uniprot_response
+
+
+