From 3a785249462f235c881648696d98d0c4e2df7bb0 Mon Sep 17 00:00:00 2001 From: Daemon <109057945+Daethyra@users.noreply.github.com> Date: Mon, 2 Oct 2023 22:04:28 -0700 Subject: [PATCH] Overhauled OP stack automation & archived the GUI modules renamed: Basic-GPT-GUI/.env.template -> .archive/Basic-GPT-GUI/.env.template renamed: Basic-GPT-GUI/.gitignore -> .archive/Basic-GPT-GUI/.gitignore renamed: Basic-GPT-GUI/main.py -> .archive/Basic-GPT-GUI/main.py renamed: Basic-GPT-GUI/requirements.txt -> .archive/Basic-GPT-GUI/requirements.txt renamed: Basic-GPT-GUI/src/gui.py -> .archive/Basic-GPT-GUI/src/gui.py renamed: Basic-GPT-GUI/src/openai_chat.py -> .archive/Basic-GPT-GUI/src/openai_chat.py renamed: GPT-Prompt-Examples/TLDR.md -> .archive/TLDR.md modified: Auto-Embedder/.env.template new file: Auto-Embedder/README.md modified: Auto-Embedder/pinembed.py new file: Auto-Embedder/requirements.txt --- .../Basic-GPT-GUI}/.env.template | 7 +-- .../Basic-GPT-GUI}/.gitignore | 0 .../Basic-GPT-GUI}/main.py | 0 .../Basic-GPT-GUI}/requirements.txt | 0 .../Basic-GPT-GUI}/src/gui.py | 0 .../Basic-GPT-GUI}/src/openai_chat.py | 2 +- {GPT-Prompt-Examples => .archive}/TLDR.md | 0 Auto-Embedder/.env.template | 6 +- Auto-Embedder/README.md | 60 +++++++++++++++++++ Auto-Embedder/pinembed.py | 50 ++++++++++++---- Auto-Embedder/requirements.txt | 4 ++ 11 files changed, 109 insertions(+), 20 deletions(-) rename {Basic-GPT-GUI => .archive/Basic-GPT-GUI}/.env.template (50%) rename {Basic-GPT-GUI => .archive/Basic-GPT-GUI}/.gitignore (100%) rename {Basic-GPT-GUI => .archive/Basic-GPT-GUI}/main.py (100%) rename {Basic-GPT-GUI => .archive/Basic-GPT-GUI}/requirements.txt (100%) rename {Basic-GPT-GUI => .archive/Basic-GPT-GUI}/src/gui.py (100%) rename {Basic-GPT-GUI => .archive/Basic-GPT-GUI}/src/openai_chat.py (80%) rename {GPT-Prompt-Examples => .archive}/TLDR.md (100%) create mode 100644 Auto-Embedder/README.md create mode 100644 Auto-Embedder/requirements.txt diff --git 
a/Basic-GPT-GUI/.env.template b/.archive/Basic-GPT-GUI/.env.template similarity index 50% rename from Basic-GPT-GUI/.env.template rename to .archive/Basic-GPT-GUI/.env.template index 4b00211..94cace8 100644 --- a/Basic-GPT-GUI/.env.template +++ b/.archive/Basic-GPT-GUI/.env.template @@ -1,9 +1,4 @@ [OpenAI] OPENAI_API_KEY = sk- MODEL = gpt-4-32k # gpt-3.5-turbo # gpt-4 # gpt-4-32k -TEMPERATURE = 0 - -[Pinecone] -PINECONE_API_KEY = -PINECONE_ENVIRONMENT = -PINEDEX = \ No newline at end of file +TEMPERATURE = 0.5 \ No newline at end of file diff --git a/Basic-GPT-GUI/.gitignore b/.archive/Basic-GPT-GUI/.gitignore similarity index 100% rename from Basic-GPT-GUI/.gitignore rename to .archive/Basic-GPT-GUI/.gitignore diff --git a/Basic-GPT-GUI/main.py b/.archive/Basic-GPT-GUI/main.py similarity index 100% rename from Basic-GPT-GUI/main.py rename to .archive/Basic-GPT-GUI/main.py diff --git a/Basic-GPT-GUI/requirements.txt b/.archive/Basic-GPT-GUI/requirements.txt similarity index 100% rename from Basic-GPT-GUI/requirements.txt rename to .archive/Basic-GPT-GUI/requirements.txt diff --git a/Basic-GPT-GUI/src/gui.py b/.archive/Basic-GPT-GUI/src/gui.py similarity index 100% rename from Basic-GPT-GUI/src/gui.py rename to .archive/Basic-GPT-GUI/src/gui.py diff --git a/Basic-GPT-GUI/src/openai_chat.py b/.archive/Basic-GPT-GUI/src/openai_chat.py similarity index 80% rename from Basic-GPT-GUI/src/openai_chat.py rename to .archive/Basic-GPT-GUI/src/openai_chat.py index d9d9ebe..b0b5851 100644 --- a/Basic-GPT-GUI/src/openai_chat.py +++ b/.archive/Basic-GPT-GUI/src/openai_chat.py @@ -1,5 +1,5 @@ class OpenAI_Chat: - def __init__(self, model=os.getenv('MODEL', 'gpt-3.5-turbo'), temperature=os.getenv('TEMPERATURE', 0)): + def __init__(self, model=os.getenv('MODEL', 'gpt-3.5-turbo'), temperature=os.getenv('TEMPERATURE', .5)): self.model = model self.temperature = float(temperature) self.messages = [] diff --git a/GPT-Prompt-Examples/TLDR.md b/.archive/TLDR.md similarity index 100% 
rename from GPT-Prompt-Examples/TLDR.md rename to .archive/TLDR.md diff --git a/Auto-Embedder/.env.template b/Auto-Embedder/.env.template index d1463c6..afb27fb 100644 --- a/Auto-Embedder/.env.template +++ b/Auto-Embedder/.env.template @@ -1,9 +1,9 @@ [OpenAI] OPENAI_API_KEY = sk-[...] -MODEL = text-embeddings-ada-002 # gpt-3.5-turbo # gpt-4 # gpt-4-32k -TEMPERATURE = 0 +MODEL = text-embedding-ada-002 [Pinecone] PINECONE_API_KEY = PINECONE_ENVIRONMENT = us-central1-gcp -PINEDEX = default_name \ No newline at end of file +PINEDEX = default_name +DROPCOLUMNS = metadata,sparse_values \ No newline at end of file diff --git a/Auto-Embedder/README.md b/Auto-Embedder/README.md new file mode 100644 index 0000000..f97547d --- /dev/null +++ b/Auto-Embedder/README.md @@ -0,0 +1,60 @@ +# Streamline calls to OpenAI and Pinecone | Automate the OP stack + +## What's this? + +This single pluggable module named [pinembed.py](pinembed.py) provides a data-pipe using the OP stack. +It automates the retrieval of vector embeddings from OpenAI's `text-embedding-ada-002` model as well as the uploading of said data to a Pinecone index. + +It does the following: + +- Ingests data +- Sends data to 'Ada-002' at OpenAI to receive embeddings +- Automatically [upserts](https://docs.pinecone.io/reference/upsert "Upsert documentation") received embedding data in real time + +## Why should I care? + +- Skip the programming! +- Provides a modular multi-class structure for isolating and using specific functionality, like asynchronous embedding retrieval. 
+- Eases the process of building Large Language Models +- Enables semantic similarity searches +- [Empowers](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings#:~:text=To%20see%20embeddings%20in%20action%2C%20check%20out%20our%20code%20samples "Reference Documentation"): + - Classification + - Topic clustering + - Search + - Recommendations + +### Requirements + +- OpenAI +- Pinecone +- Python-dotenv + +## Roadmap + +1) Create pseudocode for more functionality, namely further querying the Pinecone index +2) Draft Python logic for ['similarity'](https://docs.pinecone.io/reference/query) queries +3) Remove 0.3 data-stream cooldown. | This is literally an async pluggable module -- don't need that. +4) Create LangChain class on top of `DataStreamHandler` with the goal of testing it as a Question/Answering service + * LangChain `DirectoryLoader` +5) Extend package to enable [Agents](https://www.pinecone.io/learn/series/langchain/langchain-agents/ "Agent Documentation") & [Memory](https://www.pinecone.io/learn/series/langchain/langchain-conversational-memory/ "Memory Documentation") for large language models + +#### Self-asked Dev-questions + +- How will someone easily query their index? + - Automating 'similarity' queries is a good starting point +- How can this module be even easier to side-load for *any* project? +- Did I properly write docstrings that accurately reflect the expected data types for Pinecone specifically? I know I checked for Ada-002. +- Is it worth having multiple data streams for different processes an end-user might have? Especially if they're an organization, with multiple keys running? + - I'd also therefore need to make room for more keys, etc. I will use organizational ID management to help further differentiate where necessary. 
+ +## Official Reference Documentation + +- [OpenAI Documentation](https://platform.openai.com/docs/guides/embeddings) +- [Embeddings API Reference](https://platform.openai.com/docs/api-reference) +- [Pinecone Example Projects](https://docs.pinecone.io/page/examples) +- [Pinecone API Reference](https://docs.pinecone.io/reference) +- [LangChain / Pinecone "Getting Started"](https://www.pinecone.io/learn/series/langchain/langchain-intro/) +- [LangChain Agents](https://www.pinecone.io/learn/series/langchain/langchain-agents/) +- [LangChain Conversational Memory](https://www.pinecone.io/learn/series/langchain/langchain-conversational-memory/) + +## [LICENSE](../LICENSE) diff --git a/Auto-Embedder/pinembed.py b/Auto-Embedder/pinembed.py index 4534ffc..a1411bd 100644 --- a/Auto-Embedder/pinembed.py +++ b/Auto-Embedder/pinembed.py @@ -25,7 +25,11 @@ def __init__(self) -> None: self.openai_key: str = os.getenv("OPENAI_API_KEY") self.pinecone_key: str = os.getenv("PINECONE_API_KEY") self.pinecone_environment: str = os.getenv("PINECONE_ENVIRONMENT") - self.pinecone_environment: str = os.getenv("PINEDEX") + self.pinecone_index: str = os.getenv("PINEDEX") + self.drop_columns: List[str] = os.getenv("DROPCOLUMNS", "").split(",") + + # Remove any empty strings that may appear if "DROPCOLUMNS" is empty or has trailing commas + self.drop_columns = [col.strip() for col in self.drop_columns if col.strip()] class OpenAIHandler: """Class for handling OpenAI operations.""" @@ -47,26 +51,52 @@ async def create_embedding(self, input_text: str) -> Dict[str, Union[int, List[f """ response = openai.Embedding.create( model="text-embedding-ada-002", - input=input_text + input=input_text, + # Might be useful to add the user parameter ) return response class PineconeHandler: """Class for handling Pinecone operations.""" - def __init__(self, config: EnvConfig) -> None: - """Initialize Pinecone API key.""" - pinecone.init(api_key=config.pinecone_key) - + def __init__(self, config: "EnvConfig") -> 
None: + """ + Initialize Pinecone API key, environment, and index name. + + Args: + config (EnvConfig): An instance of the EnvConfig class containing environment variables and API keys. + """ + pinecone.init(api_key=config.pinecone_key, environment=config.pinecone_environment) + self.index_name = config.pinecone_index + self.drop_columns = config.drop_columns + @backoff.on_exception(backoff.expo, Exception, max_tries=3) async def upload_embedding(self, embedding: Dict[str, Union[int, List[float]]]) -> None: """ - Upload an embedding to Pinecone index. + Asynchronously uploads an embedding to the Pinecone index specified during initialization. - Parameters: - embedding (Dict): The embedding to be uploaded. + This method will retry up to 3 times in case of failure, using exponential back-off. + + Args: + embedding (Dict): A dictionary containing the following keys: + - 'id': A unique identifier for the embedding (str). + - 'values': A list of numerical values for the embedding (List[float]). + - 'metadata' (Optional): Any additional metadata as a dictionary (Dict). + - 'sparse_values' (Optional): Sparse values of the embedding as a dictionary with 'indices' and 'values' (Dict). """ - pinecone.upsert(index_name="your-index", items=embedding) + # Initialize Pinecone index + index = pinecone.Index(self.index_name) + + # Prepare the item for upsert + item = { + 'id': embedding['id'], + 'values': embedding['values'], + 'metadata': embedding.get('metadata', {}), + 'sparse_values': embedding.get('sparse_values', {}) + } + + # Perform the upsert operation + index.upsert(vectors=[item]) class DataStreamHandler: """Class for handling data streams.""" diff --git a/Auto-Embedder/requirements.txt b/Auto-Embedder/requirements.txt new file mode 100644 index 0000000..866309a --- /dev/null +++ b/Auto-Embedder/requirements.txt @@ -0,0 +1,4 @@ +openai +pinecone-client +python-dotenv +langchain \ No newline at end of file