-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #21 from Daethyra/working
Working
- Loading branch information
Showing
16 changed files
with
394 additions
and
115 deletions.
There are no files selected for viewing
7 changes: 1 addition & 6 deletions
7
Basic-GPT-GUI/.env.template → .archive/Basic-GPT-GUI/.env.template
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,9 +1,4 @@ | ||
[OpenAI] | ||
OPENAI_API_KEY = sk- | ||
MODEL = gpt-4-32k # gpt-3.5-turbo # gpt-4 # gpt-4-32k | ||
TEMPERATURE = 0 | ||
|
||
[Pinecone] | ||
PINECONE_API_KEY = | ||
PINECONE_ENVIRONMENT = | ||
PINEDEX = | ||
TEMPERATURE = 0.5 |
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion
2
Basic-GPT-GUI/src/openai_chat.py → .archive/Basic-GPT-GUI/src/openai_chat.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
class OpenAI_Chat: | ||
def __init__(self, model=os.getenv('MODEL', 'gpt-3.5-turbo'), temperature=os.getenv('TEMPERATURE', 0)): | ||
def __init__(self, model=os.getenv('MODEL', 'gpt-3.5-turbo'), temperature=os.getenv('TEMPERATURE', .5)): | ||
self.model = model | ||
self.temperature = float(temperature) | ||
self.messages = [] |
File renamed without changes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
[OpenAI] | ||
OPENAI_API_KEY = sk-[...] | ||
MODEL = text-embedding-ada-002 | ||
|
||
[Pinecone] | ||
PINECONE_API_KEY = | ||
PINECONE_ENVIRONMENT = us-central1-gcp | ||
PINEDEX = default_name | ||
DROPCOLUMNS = metadata,sparse_values |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
# Streamline calls to OpenAI and Pinecone | Automate the OP stack | ||
|
||
## What's this? | ||
|
||
This single pluggable module named [pinembed.py](pinembed.py) provides a data-pipe using the OP stack. | ||
It automates the retrieval of vector embeddings from OpenAI's `text-embedding-ada-002` model as well as the uploading of said data to a Pinecone index. | ||
|
||
It does the following: | ||
|
||
- Ingests data | ||
- Sends data to 'Ada-002' at OpenAI to receive embeddings | ||
- Automatically [upserts](https://docs.pinecone.io/reference/upsert "Upsert documentation") received embedding data in real time | ||
|
||
## Why should I care? | ||
|
||
- Skip the programming! | ||
- Provides a modular multi-class structure for isolating and using specific functionality, like asynchronous embedding retrieval. | ||
- Eases the process of building Large Language Models | ||
- Enables semantic similarity searches | ||
- [Empowers](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings#:~:text=To%20see%20embeddings%20in%20action%2C%20check%20out%20our%20code%20samples "Reference Documentation"): | ||
- Classification | ||
- Topic clustering | ||
- Search | ||
- Recommendations | ||
|
||
### Requirements | ||
|
||
- OpenAI | ||
- Pinecone | ||
- Python-dotenv | ||
|
||
## Roadmap | ||
|
||
1) Create pseudocode for more functionality, namely further querying the Pinecone index | ||
2) Draft Python logic for ['similarity'](https://docs.pinecone.io/reference/query) queries | ||
3) Remove 0.3 data-stream cooldown. | This is literally an async pluggable module -- don't need that. | ||
4) Create LangChain class on top of `DataStreamHandler` with the goal of testing it as a Question/Answering service | ||
* LangChain `DirectoryLoader` | ||
5) Extend package to enable [Agents](https://www.pinecone.io/learn/series/langchain/langchain-agents/ "Agent Documentation") & [Memory](https://www.pinecone.io/learn/series/langchain/langchain-conversational-memory/ "Memory Documentation") for large language models | ||
|
||
#### Self-asked Dev-questions | ||
|
||
- How will someone easily query their index? | ||
- Automating 'similarity' queries is a good starting point | ||
- How can this module be even easier to side-load for *any* project? | ||
- Did I properly write docstrings that accurately reflect the expected data types for Pinecone specifically? I know I checked for Ada-002. | ||
- Is it worth having multiple data streams for different processes an end-user might have? Especially if they're an organization, with multiple keys running? | ||
- I'd also therefore need to make room for more keys, etc. I will use organizational ID management to help further differentiate where necessary. | ||
|
||
## Official Reference Documentation | ||
|
||
- [OpenAI Documentation](https://platform.openai.com/docs/guides/embeddings) | ||
- [Embeddings API Reference](https://platform.openai.com/docs/api-reference) | ||
- [Pinecone Example Projects](https://docs.pinecone.io/page/examples) | ||
- [Pinecone API Reference](https://docs.pinecone.io/reference) | ||
- [LangChain / Pinecone "Getting Started"](https://www.pinecone.io/learn/series/langchain/langchain-intro/) | ||
- [LangChain Agents](https://www.pinecone.io/learn/series/langchain/langchain-agents/) | ||
- [LangChain Conversational Memory](https://www.pinecone.io/learn/series/langchain/langchain-conversational-memory/) | ||
|
||
## [LICENSE](../LICENSE) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
"""Easily automate the retrieval from OpenAI and storage of embeddings in Pinecone.""" | ||
|
||
import os | ||
import logging | ||
import asyncio | ||
from dotenv import load_dotenv | ||
from datetime import datetime | ||
from typing import Dict, Union, List | ||
import openai | ||
import pinecone | ||
import backoff | ||
|
||
# Load environment variables from .env file
load_dotenv()

# Initialize logging
# NOTE: basicConfig configures the root logger; INFO and above are emitted.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
||
class EnvConfig:
    """Collects the API keys and settings this module reads from the environment."""

    def __init__(self) -> None:
        """Read each expected variable from the process environment (loaded via .env)."""
        self.openai_key: str = os.getenv("OPENAI_API_KEY")
        self.pinecone_key: str = os.getenv("PINECONE_API_KEY")
        self.pinecone_environment: str = os.getenv("PINECONE_ENVIRONMENT")
        self.pinecone_index: str = os.getenv("PINEDEX")

        # "DROPCOLUMNS" is a comma-separated list; discard blank entries
        # produced by an empty value or trailing commas.
        raw_columns = os.getenv("DROPCOLUMNS", "")
        self.drop_columns: List[str] = [
            name.strip() for name in raw_columns.split(",") if name.strip()
        ]
|
||
class OpenAIHandler:
    """Class for handling OpenAI operations."""

    def __init__(self, config: EnvConfig) -> None:
        """Initialize OpenAI API key."""
        openai.api_key = config.openai_key

    @backoff.on_exception(backoff.expo, Exception, max_tries=3)
    async def create_embedding(self, input_text: str) -> Dict[str, Union[int, List[float]]]:
        """
        Create an embedding using OpenAI.

        Parameters:
            input_text (str): The text to be embedded.

        Returns:
            Dict[str, Union[int, List[float]]]: The embedding response.
        """
        # BUG FIX: openai.Embedding.create is a blocking HTTP call; running
        # it directly inside this coroutine stalls the event loop. Execute
        # it in a worker thread instead.
        response = await asyncio.to_thread(
            openai.Embedding.create,
            model="text-embedding-ada-002",
            input=input_text,
            # Might be useful to add the user parameter
        )
        return response
|
||
class PineconeHandler:
    """Class for handling Pinecone operations."""

    def __init__(self, config: "EnvConfig") -> None:
        """
        Initialize Pinecone API key, environment, and index name.

        Args:
            config (EnvConfig): An instance of the EnvConfig class containing environment variables and API keys.
        """
        pinecone.init(api_key=config.pinecone_key, environment=config.pinecone_environment)
        self.index_name = config.pinecone_index
        self.drop_columns = config.drop_columns

    @backoff.on_exception(backoff.expo, Exception, max_tries=3)
    async def upload_embedding(self, embedding: Dict[str, Union[int, List[float]]]) -> None:
        """
        Asynchronously uploads an embedding to the Pinecone index specified during initialization.

        This method will retry up to 3 times in case of failure, using exponential back-off.

        Args:
            embedding (Dict): A dictionary containing the following keys:
                - 'id': A unique identifier for the embedding (str).
                - 'values': A list of numerical values for the embedding (List[float]).
                - 'metadata' (Optional): Any additional metadata as a dictionary (Dict).
                - 'sparse_values' (Optional): Sparse values of the embedding as a dictionary with 'indices' and 'values' (Dict).
        """
        # Initialize Pinecone index
        index = pinecone.Index(self.index_name)

        # Prepare the item for upsert.
        # BUG FIX: only attach the optional fields when they are actually
        # present — upserting empty metadata/sparse_values dicts can be
        # rejected by the Pinecone API (sparse_values requires 'indices'
        # and 'values').
        item = {
            'id': embedding['id'],
            'values': embedding['values'],
        }
        metadata = embedding.get('metadata')
        if metadata:
            item['metadata'] = metadata
        sparse_values = embedding.get('sparse_values')
        if sparse_values:
            item['sparse_values'] = sparse_values

        # BUG FIX: index.upsert is a blocking HTTP call; run it off the
        # event loop so other coroutines can make progress.
        await asyncio.to_thread(index.upsert, vectors=[item])
|
||
class DataStreamHandler:
    """Pipes text through OpenAI embedding retrieval into Pinecone storage."""

    def __init__(self, openai_handler: "OpenAIHandler", pinecone_handler: "PineconeHandler") -> None:
        """Store the two service handlers and start the rate-limit clock."""
        self.openai_handler = openai_handler
        self.pinecone_handler = pinecone_handler
        # Timestamp of the previous run; used to enforce the 0.3 s cooldown.
        self.last_run_time: datetime = datetime.now()

    async def process_data(self, data: str) -> None:
        """
        Process data to create and upload embeddings.

        Parameters:
            data (str): The data to be processed.

        Raises:
            ValueError: If `data` is not a string.
        """
        # BUG FIX: use isinstance() instead of comparing type() directly so
        # str subclasses are accepted as well.
        if not isinstance(data, str):
            raise ValueError("Invalid data type.")

        # Enforce a minimum 0.3 s gap between consecutive runs
        # (the README roadmap flags this cooldown for removal).
        elapsed_time = (datetime.now() - self.last_run_time).total_seconds()
        if elapsed_time < 0.3:
            await asyncio.sleep(0.3 - elapsed_time)

        self.last_run_time = datetime.now()
        embedding = await self.openai_handler.create_embedding(data)
        await self.pinecone_handler.upload_embedding(embedding)
|
||
if __name__ == "__main__":
    # Build the shared configuration and one handler per service, then
    # spin up three independent data streams that reuse the same handlers.
    config = EnvConfig()
    openai_handler = OpenAIHandler(config)
    pinecone_handler = PineconeHandler(config)
    data_streams = []
    for _ in range(3):
        data_streams.append(DataStreamHandler(openai_handler, pinecone_handler))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
openai | ||
pinecone-client | ||
python-dotenv | ||
langchain |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
import unittest | ||
from unittest.mock import patch | ||
import os | ||
import logging | ||
import asyncio | ||
from dotenv import load_dotenv | ||
from datetime import datetime | ||
from typing import Dict, Union, List | ||
import openai | ||
import pinecone | ||
import backoff | ||
|
||
# Load environment variables from .env file
load_dotenv()

# Initialize logging
# Module-level logger shared by the code under test.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
|
||
class EnvConfig:
    """Class for handling environment variables and API keys."""

    def __init__(self) -> None:
        """Initialize environment variables."""
        self.openai_key: str = os.getenv("OPENAI_API_KEY")
        self.pinecone_key: str = os.getenv("PINECONE_API_KEY")
        self.pinecone_environment: str = os.getenv("PINECONE_ENVIRONMENT")
        # BUG FIX: this line previously re-assigned `pinecone_environment`,
        # clobbering the environment value with the index name. Store the
        # index under its own attribute, matching pinembed.EnvConfig.
        self.pinecone_index: str = os.getenv("PINEDEX")
|
||
class OpenAIHandler:
    """Class for handling OpenAI operations."""

    def __init__(self, config: EnvConfig) -> None:
        """Initialize OpenAI API key."""
        openai.api_key = config.openai_key

    @backoff.on_exception(backoff.expo, Exception, max_tries=3)
    async def create_embedding(self, input_text: str) -> Dict[str, Union[int, List[float]]]:
        """
        Create an embedding using OpenAI.

        Parameters:
            input_text (str): The text to be embedded.

        Returns:
            Dict[str, Union[int, List[float]]]: The embedding response.
        """
        # BUG FIX: the Embedding API takes `input=`, not `text=`, and the
        # extra `engine="ada"` argument conflicted with the explicit
        # `model=` — this now matches pinembed.OpenAIHandler.
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=input_text,
        )
        return response
|
||
# Create test class
class TestOpenAIHandler(unittest.TestCase):
    """Unit tests for OpenAIHandler.create_embedding."""

    # Set up test environment
    def setUp(self):
        self.config = EnvConfig()
        self.openai_handler = OpenAIHandler(self.config)

    # Test create_embedding method
    @patch('openai.Embedding.create')
    def test_create_embedding(self, mock_create):
        input_text = 'This is a test'
        expected_response = {'id': 12345, 'embedding': [1.0, 2.0, 3.0]}
        mock_create.return_value = expected_response
        # BUG FIX: create_embedding is a coroutine function — calling it
        # without driving it returns a coroutine object, so the assertion
        # below could never pass. Run it to completion with asyncio.run.
        response = asyncio.run(self.openai_handler.create_embedding(input_text))
        self.assertEqual(response, expected_response)
|
||
if __name__ == "__main__":
    # Run the unittest runner when this file is executed directly.
    unittest.main()
Oops, something went wrong.