Skip to content

Commit

Permalink
cleaned
Browse files Browse the repository at this point in the history
  • Loading branch information
haeussma committed Apr 15, 2024
1 parent a79abd4 commit bcea70a
Show file tree
Hide file tree
Showing 6 changed files with 346 additions and 92 deletions.
15 changes: 13 additions & 2 deletions pyeed/core/alignment.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
import sdRDM
from rich.status import Status, Console
from tqdm import tqdm
from itertools import combinations
from typing import List, Optional, Union, Tuple, TYPE_CHECKING
from pydantic import Field, validator
from IPython.display import clear_output

from sdRDM.base.listplus import ListPlus
from sdRDM.base.utils import forge_signature, IDGenerator
from Bio.Align import Alignment as BioAlignment
Expand Down Expand Up @@ -367,8 +370,16 @@ def from_sequences(
input_sequences=sequences,
)

if aligner is not None:
return alignment.align(aligner, **kwargs)
with Status("Running ClustalOmega...", console=Console(force_terminal=False)):

if aligner is not None:

result = alignment.align(aligner, **kwargs)

clear_output()
if result:
print("✅ Alignment completed")
return result

return alignment

Expand Down
222 changes: 146 additions & 76 deletions pyeed/core/proteininfo.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,26 @@
import re
import os
import asyncio
from typing import List, Optional
from IPython.display import clear_output
from rich.status import Status, Console
from concurrent.futures import ThreadPoolExecutor
import warnings
from pydantic import Field
from sdRDM.base.listplus import ListPlus
from sdRDM.base.utils import forge_signature, IDGenerator
from Bio.Blast import NCBIWWW, NCBIXML


from .dnainfo import DNAInfo
from .proteinregion import ProteinRegion
from .abstractsequence import AbstractSequence
from .site import Site
from .citation import Citation
from .span import Span
from .proteinregiontype import ProteinRegionType
from .substrate import Substrate
from .dnaregion import DNARegion
from .proteinsitetype import ProteinSiteType
from Bio.Blast import NCBIXML


from pyeed.core.dnainfo import DNAInfo
from pyeed.core.proteinregion import ProteinRegion
from pyeed.core.abstractsequence import AbstractSequence
from pyeed.core.site import Site
from pyeed.core.citation import Citation
from pyeed.core.span import Span
from pyeed.core.proteinregiontype import ProteinRegionType
from pyeed.core.substrate import Substrate
from pyeed.core.dnaregion import DNARegion
from pyeed.core.proteinsitetype import ProteinSiteType
from pyeed.container.abstract_container import Blastp


Expand Down Expand Up @@ -164,10 +167,13 @@ def add_to_substrates(

@classmethod
def get_id(cls, protein_id: str) -> "ProteinInfo":
from pyeed.fetch import NCBIProteinFetcher
from pyeed.fetch.proteinfetcher import ProteinFetcher
import nest_asyncio

nest_asyncio.apply()

"""
This method creates a 'ProteinInfo' object from a given NCBI ID.
This method creates a 'ProteinInfo' object from a given protein accession ID.
Args:
protein_id (str): ID of the protein in the NCBI or UniProt database.
Expand All @@ -180,89 +186,146 @@ def get_id(cls, protein_id: str) -> "ProteinInfo":
warnings.warn("For getting multiple sequences by ID use `get_ids` instead.")
return cls.get_ids(protein_id)

return NCBIProteinFetcher(protein_id).fetch(cls)[0]
sequences = asyncio.run(ProteinFetcher(ids=[protein_id]).fetch(quiet=True))[0]
clear_output()
return sequences

@classmethod
def get_ids(
cls, accession_ids: List[str], email: str = None, api_key: str = None
) -> List["ProteinInfo"]:
from pyeed.fetch import NCBIProteinFetcher
def get_ids(cls, accession_ids: List[str]) -> List["ProteinInfo"]:
from pyeed.fetch.proteinfetcher import ProteinFetcher
import nest_asyncio

nest_asyncio.apply()

proteins = NCBIProteinFetcher(accession_ids, email, api_key).fetch(cls)
return asyncio.run(
ProteinFetcher(ids=accession_ids).fetch(force_terminal=False)
)

@classmethod
def from_sequence(
cls,
sequence: str,
exact_match: bool = True,
database: str = "nr",
matrix: str = "BLOSUM62",
):
"""
Creates a 'ProteinInfo' object from a given protein sequence by
performing a BLAST search on the NCBI server.
Args:
sequence (str): The protein sequence to search for.
exact_match (bool, optional): If True, only exact matches will be considered.
If False, approximate matches will also be included. Defaults to True.
database (str, optional): The database to search against. Must be one of
the supported databases: 'nr', 'swissprot', 'pdb', 'refseq_protein'.
Defaults to 'nr'.
return proteins
Returns:
ProteinInfo: A 'ProteinInfo' object representing the protein sequence
found in the database.
Raises:
AssertionError: If the specified database is not supported.
"""

import nest_asyncio
from pyeed.fetch.blast import Blast, NCBIDataBase, BlastProgram
from pyeed.fetch.proteinfetcher import ProteinFetcher

nest_asyncio.apply()

assert (
database in NCBIDataBase
), f"Database needs to be one of {NCBIDataBase.__members__.keys()}"

identity = 1 if exact_match else 0

blaster = Blast(
query=sequence,
n_hits=1,
identity=identity,
matrix=matrix,
)

def ncbi_blastp(
with Status("Running BLAST", console=Console(force_terminal=False)) as status:
result = asyncio.run(
blaster.async_run(
NCBIDataBase.NR.value,
BlastProgram.BLASTP.value,
)
)
clear_output()

accession = blaster.extract_accession(result)

status.update("Fetching protein data")

if accession:
return asyncio.run(
ProteinFetcher(ids=accession).fetch(force_terminal=False)
)[0]

return

def ncbi_blast(
self,
n_hits: int,
e_value: float = 10.0,
api_key: str = None,
database: str = "nr",
matrix: str = "BLOSUM62",
identity: float = 0.0,
**kwargs,
) -> List["ProteinInfo"]:
"""Run protein blast for a `ProteinInfo`.
Additional keyword arguments can be passed according to the blast [specifications](https://biopython.org/docs/1.75/api/Bio.Blast.NCBIWWW.html).
"""
Runs a BLAST search using the NCBI BLAST service to find similar protein sequences.
Args:
n_hits (int): Number of hits to return.
e_value (float, optional): E-value threshold. Defaults to 10.0.
api_key (str, optional): NCBI API key for sequence retrieval. Defaults to None.
database (str, optional): Database to search. Defaults to "nr" (Non Redundant).
n_hits (int): The number of hits to retrieve.
e_value (float, optional): The maximum E-value threshold for reporting hits. Defaults to 10.0.
database (str, optional): The database to search against. Defaults to "nr".
matrix (str, optional): The substitution matrix to use. Defaults to "BLOSUM62".
identity (float, optional): The minimum sequence identity threshold for reporting hits. Defaults to 0.0.
**kwargs: Additional keyword arguments.
Returns:
List[ProteinInfo]: List of 'ProteinInfo' objects that are the result of the blast search.
"""
from pyeed.fetch import NCBIProteinFetcher

print("🏃🏼‍♀️ Running PBLAST")
print(f"╭── protein name: {self.name}")
print(f"├── accession: {self.source_id}")
print(f"├── organism: {self.organism.name}")
print(f"├── e-value: {e_value}")
print(f"╰── max hits: {n_hits}")

result_handle = NCBIWWW.qblast(
"blastp",
database,
self.sequence,
hitlist_size=n_hits,
expect=e_value,
**kwargs,
)
blast_record = NCBIXML.read(result_handle)

accessions = self._get_accessions(blast_record)
uniprot_accessions = self._filter_uniprot_accessions(accessions)
ncbi_accessions = list(set(accessions) - set(uniprot_accessions))
List[ProteinInfo]: A list of ProteinInfo objects representing the similar protein sequences found.
print(f"🔍 Found {len(ncbi_accessions)} NCBI accessions")
print(f"🔍 Found {len(uniprot_accessions)} UniProt accessions")
Raises:
AssertionError: If the specified database is not supported.
protein_infos = NCBIProteinFetcher(
foreign_id=ncbi_accessions, api_key=api_key
).fetch(ProteinInfo)
protein_infos.insert(0, self)
Example:
protein_info = ProteinInfo()
similar_proteins = protein_info.ncbi_blast(n_hits=10, e_value=0.001, database="swissprot")
"""

if uniprot_accessions:
from pyeed.fetch.uniprotmapper import UniprotFetcher
from pyeed.fetch.proteinfetcher import ProteinFetcher
from pyeed.fetch.blast import Blast, NCBIDataBase, BlastProgram
import nest_asyncio

uniprot_proteins = UniprotFetcher(foreign_id=uniprot_accessions).fetch()
protein_infos.extend(uniprot_proteins)
nest_asyncio.apply()

print("🎉 Done\n")
return protein_infos
assert database in NCBIDataBase

def _filter_uniprot_accessions(self, accessions: List[str]) -> List[str]:
uniprot_pattern = re.compile(
r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
program = BlastProgram.BLASTP.value
executor = ThreadPoolExecutor(max_workers=1)
blaster = Blast(
query=self.sequence,
n_hits=n_hits,
evalue=e_value,
matrix=matrix,
identity=identity,
)

return [
uniprot_pattern.match(acc)[0]
for acc in accessions
if uniprot_pattern.match(acc)
]
with Status(
"Running BLAST", console=Console(force_terminal=False, force_jupyter=True)
):
result = asyncio.run(blaster.async_run(database, program, executor))
clear_output()

accessions = blaster.extract_accession(result)

return asyncio.run(ProteinFetcher(ids=accessions).fetch(force_terminal=False))

def blastp(
self,
Expand Down Expand Up @@ -320,3 +383,10 @@ def from_ncbi(self):

def from_accessions(self):
raise DeprecationWarning("This method is deprecated. Use `get_ids` instead.")


if __name__ == "__main__":
seq_string = "MSDRNIRVEPVVGRAVEEQDVEIVERKGLGHPDSLCDGIAEHVSQALARAYIDRVGKVLHYNTDETQLVAGTAAPAFGGGEVVDPIYLLITGRATKEYEGTKIPAETIALRAAREYINETLPFLEFGTDVVVDVKLGEGSGDLQEVFGEDGKQVPMSNDTSFGVGHAPLTETERIVLEAERALNGDYSDDNPAVGQDIKVMGKREGDDIDVTVAVAMVDRYVDDLDGYEAAVAGVREFVADLATDYTDRNVSVHVNTADDYDEGAIYLTTTGTSAEQGDDGSVGRGNRSNGLITPNRSMSMEATSGKNPVNHIGKIYNLLSTEIARTVVDEVDGIREIRIRLLSQIGQPIDKPHVADANLVTEDGIEIADIEDEVEAIIDAELENVTSITERVIDGELTTF"

seq = ProteinInfo.from_sequence(seq_string)
print(seq)
Loading

0 comments on commit bcea70a

Please sign in to comment.