Skip to content

Commit

Permalink
Merge pull request #21 from Daethyra/working
Browse files Browse the repository at this point in the history
Working
  • Loading branch information
Daethyra authored Oct 4, 2023
2 parents 24a83fa + 74144d7 commit a4260e7
Show file tree
Hide file tree
Showing 16 changed files with 394 additions and 115 deletions.
Original file line number Diff line number Diff line change
@@ -1,9 +1,4 @@
[OpenAI]
OPENAI_API_KEY = sk-
MODEL = gpt-4-32k # gpt-3.5-turbo # gpt-4 # gpt-4-32k
TEMPERATURE = 0

[Pinecone]
PINECONE_API_KEY =
PINECONE_ENVIRONMENT =
PINEDEX =
TEMPERATURE = 0.5
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
class OpenAI_Chat:
def __init__(self, model=os.getenv('MODEL', 'gpt-3.5-turbo'), temperature=os.getenv('TEMPERATURE', 0)):
def __init__(self, model=os.getenv('MODEL', 'gpt-3.5-turbo'), temperature=os.getenv('TEMPERATURE', .5)):
self.model = model
self.temperature = float(temperature)
self.messages = []
File renamed without changes.
9 changes: 9 additions & 0 deletions .env.template
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
[OpenAI]
OPENAI_API_KEY = sk-[...]
MODEL = text-embedding-ada-002

[Pinecone]
PINECONE_API_KEY =
PINECONE_ENVIRONMENT = us-central1-gcp
PINEDEX = default_name
DROPCOLUMNS = metadata,sparse_values
Binary file removed .github/mindmap.png
Binary file not shown.
60 changes: 60 additions & 0 deletions Auto-Embedder/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Streamline calls to OpenAI and Pinecone | Automate the OP stack

## What's this?

This single pluggable module named [pinembed.py](pinembed.py) provides a data-pipe using the OP stack.
It automates the retrieval of vector embeddings from OpenAI's `text-embedding-ada-002` model as well as the uploading of said data to a Pinecone index.

It does the following:

- Ingests data
- Sends data to 'Ada-002' at OpenAI to receive embeddings
- Automatically [upserts](https://docs.pinecone.io/reference/upsert "Upsert documentation") received embedding data in real time

## Why should I care?

- Skip the programming!
- Provides a modular multi-class structure for isolating and using specific functionality, like asynchronous embedding retrieval.
- Eases the process of building Large Language Models
- Enables semantic similarity searches
- [Empowers](https://platform.openai.com/docs/guides/embeddings/what-are-embeddings#:~:text=To%20see%20embeddings%20in%20action%2C%20check%20out%20our%20code%20samples "Reference Documentation"):
- Classification
- Topic clustering
- Search
- Recommendations

### Requirements

- OpenAI
- Pinecone
- Python-dotenv

## Roadmap

1) Create pseudocode for more functionality, namely further querying the Pinecone index
2) Draft Python logic for ['similarity'](https://docs.pinecone.io/reference/query) queries
3) Remove 0.3 data-stream cooldown. | This is literally an async pluggable module -- don't need that.
4) Create LangChain class on top of `DataStreamHandler` with the goal of testing it as a Question/Answering service
* LangChain `DirectoryLoader`
5) Extend package to enable [Agents](https://www.pinecone.io/learn/series/langchain/langchain-agents/ "Agent Documentation") & [Memory](https://www.pinecone.io/learn/series/langchain/langchain-conversational-memory/ "Memory Documentation") for large language models

#### Self-asked Dev-questions

- How will someone easily query their index?
- Automating 'similarity' queries is a good starting point
- How can this module be even easier to side-load for *any* project?
- Did I properly write docstrings that accurately reflect the expected data types for Pinecone specifically? I know I checked for Ada-002.
- Is it worth having multiple data streams for different processes an end-user might have? Especially if they're an organization, with multiple keys running?
- I'd also therefore need to make room for more keys, etc. I will use organizational ID management to help further differentiate where necessary.

## Official Reference Documentation

- [OpenAI Documentation](https://platform.openai.com/docs/guides/embeddings)
- [Embeddings API Reference](https://platform.openai.com/docs/api-reference)
- [Pinecone Example Projects](https://docs.pinecone.io/page/examples)
- [Pinecone API Reference](https://docs.pinecone.io/reference)
- [LangChain / Pinecone "Getting Started"](https://www.pinecone.io/learn/series/langchain/langchain-intro/)
- [LangChain Agents](https://www.pinecone.io/learn/series/langchain/langchain-agents/)
- [LangChain Conversational Memory](https://www.pinecone.io/learn/series/langchain/langchain-conversational-memory/)

## [LICENSE](../LICENSE)
133 changes: 133 additions & 0 deletions Auto-Embedder/pinembed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
"""Easily automate the retrieval from OpenAI and storage of embeddings in Pinecone."""

import os
import logging
import asyncio
from dotenv import load_dotenv
from datetime import datetime
from typing import Dict, Union, List
import openai
import pinecone
import backoff

# Load environment variables from .env file
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class EnvConfig:
    """Class for handling environment variables and API keys."""

    def __init__(self) -> None:
        """Read API credentials and index settings from the environment."""
        self.openai_key: str = os.getenv("OPENAI_API_KEY")
        self.pinecone_key: str = os.getenv("PINECONE_API_KEY")
        self.pinecone_environment: str = os.getenv("PINECONE_ENVIRONMENT")
        self.pinecone_index: str = os.getenv("PINEDEX")
        # Parse the comma-separated DROPCOLUMNS value, discarding the blank
        # entries produced by an unset variable or trailing commas.
        raw_columns = os.getenv("DROPCOLUMNS", "")
        self.drop_columns: List[str] = [
            name.strip() for name in raw_columns.split(",") if name.strip()
        ]

class OpenAIHandler:
    """Class for handling OpenAI operations."""

    def __init__(self, config: EnvConfig) -> None:
        """Initialize OpenAI API key."""
        openai.api_key = config.openai_key

    @backoff.on_exception(backoff.expo, Exception, max_tries=3)
    async def create_embedding(self, input_text: str) -> Dict[str, Union[int, List[float]]]:
        """
        Create an embedding using OpenAI.

        Retries up to 3 times with exponential back-off on any exception.

        Parameters:
            input_text (str): The text to be embedded.

        Returns:
            Dict[str, Union[int, List[float]]]: The embedding response.
        """
        # Use the async client call so the HTTP request does not block the
        # event loop (the synchronous `create` would stall other tasks).
        response = await openai.Embedding.acreate(
            model="text-embedding-ada-002",
            input=input_text,
            # Might be useful to add the user parameter
        )
        return response

class PineconeHandler:
    """Class for handling Pinecone operations."""

    def __init__(self, config: "EnvConfig") -> None:
        """
        Initialize Pinecone API key, environment, and index name.

        Args:
            config (EnvConfig): An instance of the EnvConfig class containing environment variables and API keys.
        """
        pinecone.init(api_key=config.pinecone_key, environment=config.pinecone_environment)
        self.index_name = config.pinecone_index
        self.drop_columns = config.drop_columns

    @backoff.on_exception(backoff.expo, Exception, max_tries=3)
    async def upload_embedding(self, embedding: Dict[str, Union[int, List[float]]]) -> None:
        """
        Asynchronously uploads an embedding to the Pinecone index specified during initialization.

        This method will retry up to 3 times in case of failure, using exponential back-off.

        Args:
            embedding (Dict): A dictionary containing the following keys:
                - 'id': A unique identifier for the embedding (str).
                - 'values': A list of numerical values for the embedding (List[float]).
                - 'metadata' (Optional): Any additional metadata as a dictionary (Dict).
                - 'sparse_values' (Optional): Sparse values of the embedding as a dictionary with 'indices' and 'values' (Dict).

        Raises:
            KeyError: If the required 'id' or 'values' keys are missing from `embedding`.
        """
        # Initialize Pinecone index handle (a lightweight client-side object).
        index = pinecone.Index(self.index_name)

        # Prepare the item for upsert; optional fields default to empty dicts.
        item = {
            'id': embedding['id'],
            'values': embedding['values'],
            'metadata': embedding.get('metadata', {}),
            'sparse_values': embedding.get('sparse_values', {})
        }

        # The pinecone client is synchronous; run the upsert in a worker
        # thread so this coroutine does not block the event loop.
        await asyncio.to_thread(index.upsert, vectors=[item])

class DataStreamHandler:
    """Class for handling data streams."""

    def __init__(self, openai_handler: "OpenAIHandler", pinecone_handler: "PineconeHandler") -> None:
        """
        Initialize DataStreamHandler.

        Args:
            openai_handler (OpenAIHandler): Handler used to create embeddings.
            pinecone_handler (PineconeHandler): Handler used to upsert embeddings.
        """
        self.openai_handler = openai_handler
        self.pinecone_handler = pinecone_handler
        self.last_run_time: datetime = datetime.now()

    async def process_data(self, data: str) -> None:
        """
        Process data to create and upload embeddings.

        Enforces a minimum 0.3-second interval between runs by sleeping off
        the remainder of the cooldown before dispatching to the handlers.

        Parameters:
            data (str): The data to be processed.

        Raises:
            ValueError: If `data` is not a string.
        """
        # isinstance (rather than `type(data) != str`) so str subclasses
        # are accepted too.
        if not isinstance(data, str):
            raise ValueError("Invalid data type.")

        # Throttle: sleep off whatever remains of the 0.3 s cooldown.
        current_time = datetime.now()
        elapsed_time = (current_time - self.last_run_time).total_seconds()
        if elapsed_time < 0.3:
            await asyncio.sleep(0.3 - elapsed_time)

        self.last_run_time = datetime.now()
        embedding = await self.openai_handler.create_embedding(data)
        await self.pinecone_handler.upload_embedding(embedding)

if __name__ == "__main__":
config = EnvConfig()
openai_handler = OpenAIHandler(config)
pinecone_handler = PineconeHandler(config)
data_streams = [DataStreamHandler(openai_handler, pinecone_handler) for _ in range(3)]
4 changes: 4 additions & 0 deletions Auto-Embedder/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
openai
pinecone-client
python-dotenv
langchain
71 changes: 71 additions & 0 deletions Auto-Embedder/test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import unittest
from unittest.mock import patch
import os
import logging
import asyncio
from dotenv import load_dotenv
from datetime import datetime
from typing import Dict, Union, List
import openai
import pinecone
import backoff

# Load environment variables from .env file
load_dotenv()

# Initialize logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class EnvConfig:
    """Class for handling environment variables and API keys."""

    def __init__(self) -> None:
        """Initialize environment variables."""
        self.openai_key: str = os.getenv("OPENAI_API_KEY")
        self.pinecone_key: str = os.getenv("PINECONE_API_KEY")
        self.pinecone_environment: str = os.getenv("PINECONE_ENVIRONMENT")
        # Bug fix: this line previously re-assigned `pinecone_environment`,
        # clobbering it with the index name and leaving `pinecone_index` unset.
        self.pinecone_index: str = os.getenv("PINEDEX")

class OpenAIHandler:
    """Class for handling OpenAI operations."""

    def __init__(self, config: "EnvConfig") -> None:
        """Initialize OpenAI API key."""
        openai.api_key = config.openai_key

    @backoff.on_exception(backoff.expo, Exception, max_tries=3)
    async def create_embedding(self, input_text: str) -> Dict[str, Union[int, List[float]]]:
        """
        Create an embedding using OpenAI.

        Parameters:
            input_text (str): The text to be embedded.

        Returns:
            Dict[str, Union[int, List[float]]]: The embedding response.
        """
        # The Embeddings API takes the text via `input` (there is no `text`
        # kwarg), and `engine` must not be passed alongside `model`.
        response = openai.Embedding.create(
            model="text-embedding-ada-002",
            input=input_text,
        )
        return response

# Create test class
class TestOpenAIHandler(unittest.TestCase):
    """Unit tests for OpenAIHandler.create_embedding."""

    # Set up test environment
    def setUp(self):
        self.config = EnvConfig()
        self.openai_handler = OpenAIHandler(self.config)

    # Test create_embedding method
    @patch('openai.Embedding.create')
    def test_create_embedding(self, mock_create):
        input_text = 'This is a test'
        expected_response = {'id': 12345, 'embedding': [1.0, 2.0, 3.0]}
        mock_create.return_value = expected_response
        # create_embedding is a coroutine: it must be driven by an event
        # loop. Calling it directly only returns a coroutine object, so the
        # original assertEqual could never pass.
        response = asyncio.run(self.openai_handler.create_embedding(input_text))
        self.assertEqual(response, expected_response)

if __name__ == "__main__":
unittest.main()
Loading

0 comments on commit a4260e7

Please sign in to comment.