Skip to content

Commit

Permalink
Merge pull request #21 from Daethyra/working
Browse files Browse the repository at this point in the history
Working
  • Loading branch information
Daethyra authored Oct 4, 2023
2 parents 24a83fa + 74144d7 commit a4260e7
Show file tree
Hide file tree
Showing 16 changed files with 394 additions and 115 deletions.
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
[OpenAI]
OPENAI_API_KEY = sk-
MODEL = gpt-4-32k # gpt-3.5-turbo # gpt-4 # gpt-4-32k
TEMPERATURE = 0

[Pinecone]
PINECONE_API_KEY =
PINECONE_ENVIRONMENT =
PINEDEX =
TEMPERATURE = 0.5
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
class OpenAI_Chat:
def __init__(self, model=os.getenv('MODEL', 'gpt-3.5-turbo'), temperature=os.getenv('TEMPERATURE', 0)):
def __init__(self, model=os.getenv('MODEL', 'gpt-3.5-turbo'), temperature=os.getenv('TEMPERATURE', .5)):
self.model = model
self.temperature = float(temperature)
self.messages = []
File renamed without changes.
9 changes: 9 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[OpenAI]
OPENAI_API_KEY = sk-[...]
MODEL = text-embedding-ada-002

[Pinecone]
PINECONE_API_KEY =
PINECONE_ENVIRONMENT = us-central1-gcp
PINEDEX = default_name
DROPCOLUMNS = metadata,sparse_values
Binary file removed .github/mindmap.png
Binary file not shown.
60 changes: 60 additions & 0 deletions Auto-Embedder/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Streamline calls to OpenAI and Pinecone | Automate the OP stack

## What's this?

This single pluggable module named [pinembed.py](pinembed.py) provides a data-pipe using the OP stack.
It automates the retrieval of vector embeddings from OpenAI's `text-embedding-ada-002` model as well as the uploading of said data to a Pinecone index.

It does the following:

- Ingests data
- Sends data to 'Ada-002' at OpenAI to receive embeddings
- Automatically [upserts](https://docs.pinecone.io/reference/upsert "Upsert documentation") received embedding data in real time

## Why should I care?

- Skip the programming!
- Provides a modular multi-class structure for isolating and using specific functionality, like asynchronous embedding retrieval.
- Eases the process of building Large Language Models
- Enables semantic similarity searches
- [Empowers](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings#:~:text=To%20see%20embeddings%20in%20action%2C%20check%20out%20our%20code%20samples "Reference Documentation"):
- Classification
- Topic clustering
- Search
- Recommendations

### Requirements

- OpenAI
- Pinecone
- Python-dotenv

## Roadmap

1) Create pseudocode for more functionality, namely further querying the Pinecone index
2) Draft Python logic for ['similarity'](https://docs.pinecone.io/reference/query) queries
3) Remove 0.3 data-stream cooldown. | This is literally an async pluggable module -- don't need that.
4) Create LangChain class on top of `DataStreamHandler` with the goal of testing it as a Question/Answering service
* LangChain `DirectoryLoader`
5) Extend package to enable [Agents](https://www.pinecone.io/learn/series/langchain/langchain-agents/ "Agent Documentation") & [Memory](https://www.pinecone.io/learn/series/langchain/langchain-conversational-memory/ "Memory Documentation") for large language models

#### Self-asked Dev-questions

- How will someone easily query their index?
- Automating 'similarity' queries is a good starting point
- How can this module be even easier to side-load for *any* project?
- Did I properly write docstrings that accurately reflect the expected data types for Pinecone specifically? I know I checked for Ada-002.
- Is it worth having multiple data streams for different processes an end-user might have? Especially if they're an organization, with multiple keys running?
- I'd also therefore need to make room for more keys, etc. I will use organizational ID management to help further differentiate where necessary.

## Official Reference Documentation

- [OpenAI Documentation](https://platform.openai.com/docs/guides/embeddings)
- [Embeddings API Reference](https://platform.openai.com/docs/api-reference)
- [Pinecone Example Projects](https://docs.pinecone.io/page/examples)
- [Pinecone API Reference](https://docs.pinecone.io/reference)
- [LangChain / Pinecone "Getting Started"](https://www.pinecone.io/learn/series/langchain/langchain-intro/)
- [LangChain Agents](https://www.pinecone.io/learn/series/langchain/langchain-agents/)
- [LangChain Conversational Memory](https://www.pinecone.io/learn/series/langchain/langchain-conversational-memory/)

## [LICENSE](../LICENSE)
133 changes: 133 additions & 0 deletions Auto-Embedder/pinembed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""Easily automate the retrieval from OpenAI and storage of embeddings in Pinecone."""

import os
import logging
import asyncio
from dotenv import load_dotenv
from datetime import datetime
from typing import Dict, Union, List
import openai
import pinecone
import backoff

# Load environment variables from .env file
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class EnvConfig:
    """Class for handling environment variables and API keys."""

    def __init__(self) -> None:
        """Read API credentials and index settings from the environment."""
        self.openai_key: str = os.getenv("OPENAI_API_KEY")
        self.pinecone_key: str = os.getenv("PINECONE_API_KEY")
        self.pinecone_environment: str = os.getenv("PINECONE_ENVIRONMENT")
        self.pinecone_index: str = os.getenv("PINEDEX")
        # Parse the comma-separated DROPCOLUMNS value, discarding the blank
        # entries produced by an unset variable or trailing commas.
        raw_columns = os.getenv("DROPCOLUMNS", "")
        self.drop_columns: List[str] = [
            name.strip() for name in raw_columns.split(",") if name.strip()
        ]

class OpenAIHandler:
    """Class for handling OpenAI operations."""

    def __init__(self, config: EnvConfig) -> None:
        """Initialize OpenAI API key."""
        openai.api_key = config.openai_key

    @backoff.on_exception(backoff.expo, Exception, max_tries=3)
    async def create_embedding(self, input_text: str) -> Dict[str, Union[int, List[float]]]:
        """
        Create an embedding using OpenAI.

        Retries up to 3 times with exponential back-off on any exception.

        Parameters:
            input_text (str): The text to be embedded.

        Returns:
            Dict[str, Union[int, List[float]]]: The embedding response.
        """
        # Use the async client call so the HTTP request does not block the
        # event loop (the synchronous `create` would stall other tasks).
        response = await openai.Embedding.acreate(
            model="text-embedding-ada-002",
            input=input_text,
            # Might be useful to add the user parameter
        )
        return response

class PineconeHandler:
    """Class for handling Pinecone operations."""

    def __init__(self, config: "EnvConfig") -> None:
        """
        Initialize Pinecone API key, environment, and index name.

        Args:
            config (EnvConfig): An instance of the EnvConfig class containing environment variables and API keys.
        """
        pinecone.init(api_key=config.pinecone_key, environment=config.pinecone_environment)
        self.index_name = config.pinecone_index
        self.drop_columns = config.drop_columns

    @backoff.on_exception(backoff.expo, Exception, max_tries=3)
    async def upload_embedding(self, embedding: Dict[str, Union[int, List[float]]]) -> None:
        """
        Asynchronously uploads an embedding to the Pinecone index specified during initialization.

        This method will retry up to 3 times in case of failure, using exponential back-off.

        Args:
            embedding (Dict): A dictionary containing the following keys:
                - 'id': A unique identifier for the embedding (str).
                - 'values': A list of numerical values for the embedding (List[float]).
                - 'metadata' (Optional): Any additional metadata as a dictionary (Dict).
                - 'sparse_values' (Optional): Sparse values of the embedding as a dictionary with 'indices' and 'values' (Dict).

        Raises:
            KeyError: If the required 'id' or 'values' keys are missing from `embedding`.
        """
        # Initialize Pinecone index handle (a lightweight client-side object).
        index = pinecone.Index(self.index_name)

        # Prepare the item for upsert; optional fields default to empty dicts.
        item = {
            'id': embedding['id'],
            'values': embedding['values'],
            'metadata': embedding.get('metadata', {}),
            'sparse_values': embedding.get('sparse_values', {})
        }

        # The pinecone client is synchronous; run the upsert in a worker
        # thread so this coroutine does not block the event loop.
        await asyncio.to_thread(index.upsert, vectors=[item])

class DataStreamHandler:
    """Class for handling data streams."""

    def __init__(self, openai_handler: "OpenAIHandler", pinecone_handler: "PineconeHandler") -> None:
        """
        Initialize DataStreamHandler.

        Args:
            openai_handler (OpenAIHandler): Handler used to create embeddings.
            pinecone_handler (PineconeHandler): Handler used to upsert embeddings.
        """
        self.openai_handler = openai_handler
        self.pinecone_handler = pinecone_handler
        self.last_run_time: datetime = datetime.now()

    async def process_data(self, data: str) -> None:
        """
        Process data to create and upload embeddings.

        Enforces a minimum 0.3-second interval between runs by sleeping off
        the remainder of the cooldown before dispatching to the handlers.

        Parameters:
            data (str): The data to be processed.

        Raises:
            ValueError: If `data` is not a string.
        """
        # isinstance (rather than `type(data) != str`) so str subclasses
        # are accepted too.
        if not isinstance(data, str):
            raise ValueError("Invalid data type.")

        # Throttle: sleep off whatever remains of the 0.3 s cooldown.
        current_time = datetime.now()
        elapsed_time = (current_time - self.last_run_time).total_seconds()
        if elapsed_time < 0.3:
            await asyncio.sleep(0.3 - elapsed_time)

        self.last_run_time = datetime.now()
        embedding = await self.openai_handler.create_embedding(data)
        await self.pinecone_handler.upload_embedding(embedding)

if __name__ == "__main__":
config = EnvConfig()
openai_handler = OpenAIHandler(config)
pinecone_handler = PineconeHandler(config)
data_streams = [DataStreamHandler(openai_handler, pinecone_handler) for _ in range(3)]
4 changes: 4 additions & 0 deletions Auto-Embedder/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
openai
pinecone-client
python-dotenv
langchain
71 changes: 71 additions & 0 deletions Auto-Embedder/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import unittest
from unittest.mock import patch
import os
import logging
import asyncio
from dotenv import load_dotenv
from datetime import datetime
from typing import Dict, Union, List
import openai
import pinecone
import backoff

# Load environment variables from .env file
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class EnvConfig:
    """Class for handling environment variables and API keys."""

    def __init__(self) -> None:
        """Initialize environment variables."""
        self.openai_key: str = os.getenv("OPENAI_API_KEY")
        self.pinecone_key: str = os.getenv("PINECONE_API_KEY")
        self.pinecone_environment: str = os.getenv("PINECONE_ENVIRONMENT")
        # Bug fix: this line previously re-assigned `pinecone_environment`,
        # clobbering it with the index name and leaving `pinecone_index` unset.
        self.pinecone_index: str = os.getenv("PINEDEX")

class OpenAIHandler:
    """Class for handling OpenAI operations."""

    def __init__(self, config: "EnvConfig") -> None:
        """Initialize OpenAI API key."""
        openai.api_key = config.openai_key

    @backoff.on_exception(backoff.expo, Exception, max_tries=3)
    async def create_embedding(self, input_text: str) -> Dict[str, Union[int, List[float]]]:
        """
        Create an embedding using OpenAI.

        Parameters:
            input_text (str): The text to be embedded.

        Returns:
            Dict[str, Union[int, List[float]]]: The embedding response.
        """
        # The Embeddings API takes the text via `input` (there is no `text`
        # kwarg), and `engine` must not be passed alongside `model`.
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=input_text,
        )
        return response

# Create test class
class TestOpenAIHandler(unittest.TestCase):
    """Unit tests for OpenAIHandler.create_embedding."""

    # Set up test environment
    def setUp(self):
        self.config = EnvConfig()
        self.openai_handler = OpenAIHandler(self.config)

    # Test create_embedding method
    @patch('openai.Embedding.create')
    def test_create_embedding(self, mock_create):
        input_text = 'This is a test'
        expected_response = {'id': 12345, 'embedding': [1.0, 2.0, 3.0]}
        mock_create.return_value = expected_response
        # create_embedding is a coroutine: it must be driven by an event
        # loop. Calling it directly only returns a coroutine object, so the
        # original assertEqual could never pass.
        response = asyncio.run(self.openai_handler.create_embedding(input_text))
        self.assertEqual(response, expected_response)

if __name__ == "__main__":
unittest.main()
Loading

0 comments on commit a4260e7

Please sign in to comment.