From 3a785249462f235c881648696d98d0c4e2df7bb0 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Mon, 2 Oct 2023 22:04:28 -0700 Subject: [PATCH] Overhauled OP stack automation & archived the GUI modules renamed: Basic-GPT-GUI/.env.template -> .archive/Basic-GPT-GUI/.env.template renamed: Basic-GPT-GUI/.gitignore -> .archive/Basic-GPT-GUI/.gitignore renamed: Basic-GPT-GUI/main.py -> .archive/Basic-GPT-GUI/main.py renamed: Basic-GPT-GUI/requirements.txt -> .archive/Basic-GPT-GUI/requirements.txt renamed: Basic-GPT-GUI/src/gui.py -> .archive/Basic-GPT-GUI/src/gui.py renamed: Basic-GPT-GUI/src/openai_chat.py -> .archive/Basic-GPT-GUI/src/openai_chat.py renamed: GPT-Prompt-Examples/TLDR.md -> .archive/TLDR.md modified: Auto-Embedder/.env.template new file: Auto-Embedder/README.md modified: Auto-Embedder/pinembed.py new file: Auto-Embedder/requirements.txt --- .../Basic-GPT-GUI}/.env.template | 7 +-- .../Basic-GPT-GUI}/.gitignore | 0 .../Basic-GPT-GUI}/main.py | 0 .../Basic-GPT-GUI}/requirements.txt | 0 .../Basic-GPT-GUI}/src/gui.py | 0 .../Basic-GPT-GUI}/src/openai_chat.py | 2 +- {GPT-Prompt-Examples => .archive}/TLDR.md | 0 Auto-Embedder/.env.template | 6 +- Auto-Embedder/README.md | 60 +++++++++++++++++++ Auto-Embedder/pinembed.py | 50 ++++++++++++---- Auto-Embedder/requirements.txt | 4 ++ 11 files changed, 109 insertions(+), 20 deletions(-) rename {Basic-GPT-GUI => .archive/Basic-GPT-GUI}/.env.template (50%) rename {Basic-GPT-GUI => .archive/Basic-GPT-GUI}/.gitignore (100%) rename {Basic-GPT-GUI => .archive/Basic-GPT-GUI}/main.py (100%) rename {Basic-GPT-GUI => .archive/Basic-GPT-GUI}/requirements.txt (100%) rename {Basic-GPT-GUI => .archive/Basic-GPT-GUI}/src/gui.py (100%) rename {Basic-GPT-GUI => .archive/Basic-GPT-GUI}/src/openai_chat.py (80%) rename {GPT-Prompt-Examples => .archive}/TLDR.md (100%) create mode 100644 Auto-Embedder/README.md create mode 100644 Auto-Embedder/requirements.txt diff --git 
a/Basic-GPT-GUI/.env.template b/.archive/Basic-GPT-GUI/.env.template similarity index 50% rename from Basic-GPT-GUI/.env.template rename to .archive/Basic-GPT-GUI/.env.template index 4b00211..94cace8 100644 --- a/Basic-GPT-GUI/.env.template +++ b/.archive/Basic-GPT-GUI/.env.template @@ -1,9 +1,4 @@ [OpenAI] OPENAI_API_KEY = sk- MODEL = gpt-4-32k # gpt-3.5-turbo # gpt-4 # gpt-4-32k -TEMPERATURE = 0 - -[Pinecone] -PINECONE_API_KEY = -PINECONE_ENVIRONMENT = -PINEDEX = \ No newline at end of file +TEMPERATURE = 0.5 \ No newline at end of file diff --git a/Basic-GPT-GUI/.gitignore b/.archive/Basic-GPT-GUI/.gitignore similarity index 100% rename from Basic-GPT-GUI/.gitignore rename to .archive/Basic-GPT-GUI/.gitignore diff --git a/Basic-GPT-GUI/main.py b/.archive/Basic-GPT-GUI/main.py similarity index 100% rename from Basic-GPT-GUI/main.py rename to .archive/Basic-GPT-GUI/main.py diff --git a/Basic-GPT-GUI/requirements.txt b/.archive/Basic-GPT-GUI/requirements.txt similarity index 100% rename from Basic-GPT-GUI/requirements.txt rename to .archive/Basic-GPT-GUI/requirements.txt diff --git a/Basic-GPT-GUI/src/gui.py b/.archive/Basic-GPT-GUI/src/gui.py similarity index 100% rename from Basic-GPT-GUI/src/gui.py rename to .archive/Basic-GPT-GUI/src/gui.py diff --git a/Basic-GPT-GUI/src/openai_chat.py b/.archive/Basic-GPT-GUI/src/openai_chat.py similarity index 80% rename from Basic-GPT-GUI/src/openai_chat.py rename to .archive/Basic-GPT-GUI/src/openai_chat.py index d9d9ebe..b0b5851 100644 --- a/Basic-GPT-GUI/src/openai_chat.py +++ b/.archive/Basic-GPT-GUI/src/openai_chat.py @@ -1,5 +1,5 @@ class OpenAI_Chat: - def __init__(self, model=os.getenv('MODEL', 'gpt-3.5-turbo'), temperature=os.getenv('TEMPERATURE', 0)): + def __init__(self, model=os.getenv('MODEL', 'gpt-3.5-turbo'), temperature=os.getenv('TEMPERATURE', .5)): self.model = model self.temperature = float(temperature) self.messages = [] diff --git a/GPT-Prompt-Examples/TLDR.md b/.archive/TLDR.md similarity index 100% 
rename from GPT-Prompt-Examples/TLDR.md rename to .archive/TLDR.md diff --git a/Auto-Embedder/.env.template b/Auto-Embedder/.env.template index d1463c6..afb27fb 100644 --- a/Auto-Embedder/.env.template +++ b/Auto-Embedder/.env.template @@ -1,9 +1,9 @@ [OpenAI] OPENAI_API_KEY = sk-[...] -MODEL = text-embeddings-ada-002 # gpt-3.5-turbo # gpt-4 # gpt-4-32k -TEMPERATURE = 0 +MODEL = text-embedding-ada-002 [Pinecone] PINECONE_API_KEY = PINECONE_ENVIRONMENT = us-central1-gcp -PINEDEX = default_name \ No newline at end of file +PINEDEX = default_name +DROPCOLUMNS = metadata,sparse_values \ No newline at end of file diff --git a/Auto-Embedder/README.md b/Auto-Embedder/README.md new file mode 100644 index 0000000..f97547d --- /dev/null +++ b/Auto-Embedder/README.md @@ -0,0 +1,60 @@ +# Streamline calls to OpenAI and Pinecone | Automate the OP stack + +## What's this? + +This single pluggable module named [pinembed.py](pinembed.py) provides a data-pipe using the OP stack. +It automates the retrieval of vector embeddings from OpenAI's `text-embedding-ada-002` model as well as the uploading of said data to a Pinecone index. + +It does the following: + +- Ingests data +- Sends data to 'Ada-002' at OpenAI to receive embeddings +- Automatically [upserts](https://docs.pinecone.io/reference/upsert "Upsert documentation") received embedding data in real time + +## Why should I care? + +- Skip the programming! +- Provides a modular multi-class structure for isolating and using specific functionality, like asynchronous embedding retrieval. 
+- Eases the process of building Large Language Models +- Enables semantic similarity searches +- [Empowers](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings#:~:text=To%20see%20embeddings%20in%20action%2C%20check%20out%20our%20code%20samples "Reference Documentation"): + - Classification + - Topic clustering + - Search + - Recommendations + +### Requirements + +- OpenAI +- Pinecone +- Python-dotenv + +## Roadmap + +1) Create pseudocode for more functionality, namely further querying the Pinecone index +2) Draft Python logic for ['similarity'](https://docs.pinecone.io/reference/query) queries +3) Remove 0.3 data-stream cooldown. | This is literally an async pluggable module -- don't need that. +4) Create LangChain class on top of `DataStreamHandler` with the goal of testing it as a Question/Answering service + * LangChain `DirectoryLoader` +5) Extend package to enable [Agents](https://www.pinecone.io/learn/series/langchain/langchain-agents/ "Agent Documentation") & [Memory](https://www.pinecone.io/learn/series/langchain/langchain-conversational-memory/ "Memory Documentation") for large language models + +#### Self-asked Dev-questions + +- How will someone easily query their index? + - Automating 'similarity' queries is a good starting point +- How can this module be even easier to side-load for *any* project? +- Did I properly write docstrings that accurately reflect the expected data types for Pinecone specifically? I know I checked for Ada-002. +- Is it worth having multiple data streams for different processes an end-user might have? Especially if they're an organization, with multiple keys running? + - I'd also therefore need to make room for more keys, etc. I will use organizational ID management to help further differentiate where necessary. 
+ +## Official Reference Documentation + +- [OpenAI Documentation](https://platform.openai.com/docs/guides/embeddings) +- [Embeddings API Reference](https://platform.openai.com/docs/api-reference) +- [Pinecone Example Projects](https://docs.pinecone.io/page/examples) +- [Pinecone API Reference](https://docs.pinecone.io/reference) +- [LangChain / Pinecone "Getting Started"](https://www.pinecone.io/learn/series/langchain/langchain-intro/) +- [LangChain Agents](https://www.pinecone.io/learn/series/langchain/langchain-agents/) +- [LangChain Conversational Memory](https://www.pinecone.io/learn/series/langchain/langchain-conversational-memory/) + +## [LICENSE](../LICENSE) diff --git a/Auto-Embedder/pinembed.py b/Auto-Embedder/pinembed.py index 4534ffc..a1411bd 100644 --- a/Auto-Embedder/pinembed.py +++ b/Auto-Embedder/pinembed.py @@ -25,7 +25,11 @@ def __init__(self) -> None: self.openai_key: str = os.getenv("OPENAI_API_KEY") self.pinecone_key: str = os.getenv("PINECONE_API_KEY") self.pinecone_environment: str = os.getenv("PINECONE_ENVIRONMENT") - self.pinecone_environment: str = os.getenv("PINEDEX") + self.pinecone_index: str = os.getenv("PINEDEX") + self.drop_columns: List[str] = os.getenv("DROPCOLUMNS", "").split(",") + + # Remove any empty strings that may appear if "DROPCOLUMNS" is empty or has trailing commas + self.drop_columns = [col.strip() for col in self.drop_columns if col.strip()] class OpenAIHandler: """Class for handling OpenAI operations.""" @@ -47,26 +51,52 @@ async def create_embedding(self, input_text: str) -> Dict[str, Union[int, List[f """ response = openai.Embedding.create( model="text-embedding-ada-002", - input=input_text + input=input_text, + # Might be useful to add the user parameter ) return response class PineconeHandler: """Class for handling Pinecone operations.""" - def __init__(self, config: EnvConfig) -> None: - """Initialize Pinecone API key.""" - pinecone.init(api_key=config.pinecone_key) - + def __init__(self, config: "EnvConfig") -> 
None: + """ + Initialize Pinecone API key, environment, and index name. + + Args: + config (EnvConfig): An instance of the EnvConfig class containing environment variables and API keys. + """ + pinecone.init(api_key=config.pinecone_key, environment=config.pinecone_environment) + self.index_name = config.pinecone_index + self.drop_columns = config.drop_columns + @backoff.on_exception(backoff.expo, Exception, max_tries=3) async def upload_embedding(self, embedding: Dict[str, Union[int, List[float]]]) -> None: """ - Upload an embedding to Pinecone index. + Asynchronously uploads an embedding to the Pinecone index specified during initialization. - Parameters: - embedding (Dict): The embedding to be uploaded. + This method will retry up to 3 times in case of failure, using exponential back-off. + + Args: + embedding (Dict): A dictionary containing the following keys: + - 'id': A unique identifier for the embedding (str). + - 'values': A list of numerical values for the embedding (List[float]). + - 'metadata' (Optional): Any additional metadata as a dictionary (Dict). + - 'sparse_values' (Optional): Sparse values of the embedding as a dictionary with 'indices' and 'values' (Dict). """ - pinecone.upsert(index_name="your-index", items=embedding) + # Initialize Pinecone index + index = pinecone.Index(self.index_name) + + # Prepare the item for upsert + item = { + 'id': embedding['id'], + 'values': embedding['values'], + 'metadata': embedding.get('metadata', {}), + 'sparse_values': embedding.get('sparse_values', {}) + } + + # Perform the upsert operation + index.upsert(vectors=[item]) class DataStreamHandler: """Class for handling data streams.""" diff --git a/Auto-Embedder/requirements.txt b/Auto-Embedder/requirements.txt new file mode 100644 index 0000000..866309a --- /dev/null +++ b/Auto-Embedder/requirements.txt @@ -0,0 +1,4 @@ +openai +pinecone-client +python-dotenv +langchain \ No newline at end of file