From 89cf9761d1e87ede2e3a29c04955b3101343d577 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Mon, 30 Oct 2023 16:58:47 -0700 Subject: [PATCH 01/48] mark depricated API section --- README.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.md b/README.md index 66ed4a0139..5826b786c0 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,10 @@ To run MemGPT for as a conversation agent in CLI mode, simply run `memgpt`: memgpt ``` + + +
+<details>
+<summary>Deprecated API</summary>
Debugging command not found @@ -330,6 +334,7 @@ MemGPT also enables you to chat with docs -- try running this example to talk to If you downloaded from Hugging Face, it will be `memgpt/personas/docqa/llamaindex-api-docs`. If you built the index yourself, it will be `memgpt/personas/docqa`.
+
## Support From be6212c5cc552e47ecdce43ef37531602fa42c88 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Mon, 30 Oct 2023 17:10:09 -0700 Subject: [PATCH 02/48] add readme --- README.md | 48 +++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 41 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 5826b786c0..40d5de3424 100644 --- a/README.md +++ b/README.md @@ -87,23 +87,57 @@ Add your OpenAI API key to your environment: ```sh # on Linux/Mac export OPENAI_API_KEY=YOUR_API_KEY -``` -```sh # on Windows set OPENAI_API_KEY=YOUR_API_KEY -``` -```sh # on Windows (PowerShell) $Env:OPENAI_API_KEY = "YOUR_API_KEY" ``` -To run MemGPT for as a conversation agent in CLI mode, simply run `memgpt`: - +Now, you can run MemGPT with: ```sh -memgpt +memgpt run ``` +The `run` command supports the following flags: +* `--agent`: (str) Name of agent to create or to resume chatting with. +* `--human`: (str) Name of the human to run the agent with. +* `--model`: (str) LLM model to run + +### Configuration +You can configure defaults settings for MemGPT with: +``` +memgpt configure +``` +Configuration defaults can be overriden when calling `memgpt run` by using flags: +``` +memgpt run \ +--persona +--agent +--human +``` + +### Adding Custom Personas/Humans +You can add new human or persona definitions either by providing a file (using the `-f` flag) or text (using the `--text` flag). +``` +# add a human +memgpt add human [-f ] [--text ] + +# add a persona +memgpt add persona [-f ] [--text ] +``` + +### Adding Data Sources +MemGPT supports pre-loading data into archival memory, so your agent can reference loaded data in your conversations. +``` +memgpt load +``` +To encourage your agent to reference its archival memory, we recommend adding phrases like "search your archival memory..." for the best results. + +## Advanced + +### Adding new connectors + From b01138011077ba4598e845d2eba71e25abdd3fb7 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Mon, 30 Oct 2023 17:13:17 -0700 Subject: [PATCH 03/48] add readme --- README.md | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 40d5de3424..d118456aef 100644 --- a/README.md +++ b/README.md @@ -77,22 +77,13 @@ Install MemGPT: pip install pymemgpt ``` -To update the package, run -```sh -pip install pymemgpt -U -``` - Add your OpenAI API key to your environment: ```sh -# on Linux/Mac -export OPENAI_API_KEY=YOUR_API_KEY - -# on Windows -set OPENAI_API_KEY=YOUR_API_KEY -# on Windows (PowerShell) -$Env:OPENAI_API_KEY = "YOUR_API_KEY" +export OPENAI_API_KEY=YOUR_API_KEY # on Linux/Mac +set OPENAI_API_KEY=YOUR_API_KEY # on Windows +$Env:OPENAI_API_KEY = "YOUR_API_KEY" # on Windows (PowerShell) ``` Now, you can run MemGPT with: @@ -127,6 +118,11 @@ memgpt add human [-f ] [--text ] memgpt add persona [-f ] [--text ] ``` +You can view available persona and human files with the following command: +``` +memgpt list [human/persona] +``` + ### Adding Data Sources MemGPT supports pre-loading data into archival memory, so your agent can reference loaded data in your conversations. 
``` From 59f7b715b14284a73273d7a0347977af8527552f Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Mon, 30 Oct 2023 17:22:25 -0700 Subject: [PATCH 04/48] add readme --- README.md | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index d118456aef..b3c9c3dcf2 100644 --- a/README.md +++ b/README.md @@ -85,28 +85,25 @@ export OPENAI_API_KEY=YOUR_API_KEY # on Linux/Mac set OPENAI_API_KEY=YOUR_API_KEY # on Windows $Env:OPENAI_API_KEY = "YOUR_API_KEY" # on Windows (PowerShell) ``` - +Configure default setting for MemGPT by running: +``` +memgpt configure +``` Now, you can run MemGPT with: ```sh memgpt run ``` -The `run` command supports the following flags: +The `run` command supports the following optional flags (if set, will override config defaults): * `--agent`: (str) Name of agent to create or to resume chatting with. * `--human`: (str) Name of the human to run the agent with. -* `--model`: (str) LLM model to run - -### Configuration -You can configure defaults settings for MemGPT with: -``` -memgpt configure -``` -Configuration defaults can be overriden when calling `memgpt run` by using flags: -``` -memgpt run \ ---persona ---agent ---human -``` +* `--persona`: (str) Name of agent persona to use. +* `--model`: (str) LLM model to run [gpt-4, gpt-3.5]. +* `--preset`: (str) MemGPT preset to run agent with. +* `--data_source`: (str) Name of data source (loaded with `memgpt load`) to connect to agent. +* `--first`: (str) Allow user to sent the first message. +* `--debug`: (bool) Show debug logs (default=False) +* `--no_verify`: (bool) Bypass message verification (default=False) +* `--yes`/`-y`: (bool) Skip confirmation prompt and use defaults (default=False) ### Adding Custom Personas/Humans You can add new human or persona definitions either by providing a file (using the `-f` flag) or text (using the `--text` flag). @@ -130,6 +127,8 @@ memgpt load ``` To encourage your agent to reference its archival memory, we recommend adding phrases like "search your archival memory..." for the best results. +### Using other endpoints + ## Advanced ### Adding new connectors From 176538b736ce2eac9cee85dc9222a048b6f51604 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Mon, 30 Oct 2023 17:28:18 -0700 Subject: [PATCH 05/48] add readme --- README.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b3c9c3dcf2..9b7e1297bd 100644 --- a/README.md +++ b/README.md @@ -121,9 +121,17 @@ memgpt list [human/persona] ``` ### Adding Data Sources -MemGPT supports pre-loading data into archival memory, so your agent can reference loaded data in your conversations. +MemGPT supports pre-loading data into archival memory, so your agent can reference loaded data in your conversations with `memgpt load --name `, where the `--name` flag specifies a unique ID for the data source which you can use to attach the data source to the agent when running `memgpt run --data-source `. + +Loading from a directory: +``` +# loading a directory +memgpt load directory --name [--input_dir ] [--input-files ...] [--recursive] +``` +Loading from a database: ``` -memgpt load +# loading a database +memgpt load database --name ``` To encourage your agent to reference its archival memory, we recommend adding phrases like "search your archival memory..." for the best results. 
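The `memgpt load directory` command documented above maps closely onto the llama-index primitives that the storage patches later in this series build on. A minimal sketch of the flow — the helper's signature is illustrative, not the actual CLI implementation, and it assumes the `{MEMGPT_DIR}/archival/<name>` on-disk layout that patch 19 below reads from:

```python
# Illustrative sketch only: embed a directory of files and persist the index
# under a named data source, mirroring what `memgpt load directory --name <name>`
# does. The helper name and defaults are assumptions, not MemGPT's actual code.
import os

from llama_index import SimpleDirectoryReader, VectorStoreIndex

MEMGPT_DIR = os.path.join(os.path.expanduser("~"), ".memgpt")  # assumed default


def load_directory(name: str, input_dir: str, recursive: bool = False) -> None:
    # read every file under input_dir into llama-index Document objects
    docs = SimpleDirectoryReader(input_dir=input_dir, recursive=recursive).load_data()
    # chunk + embed the documents and build a vector index over them
    index = VectorStoreIndex.from_documents(docs)
    # persist under a per-source directory keyed by --name, so that
    # `memgpt run --data-source <name>` can attach it later
    index.storage_context.persist(persist_dir=os.path.join(MEMGPT_DIR, "archival", name))
```

The patches from 19 onward swap this local persistence for pluggable storage backends (Postgres, Chroma) behind the same load/attach workflow.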
From 99052664cc57fd0b673bd0c613c0f78d469e6b2d Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Mon, 30 Oct 2023 17:35:27 -0700 Subject: [PATCH 06/48] add readme --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index 9b7e1297bd..7efd9b7dc8 100644 --- a/README.md +++ b/README.md @@ -105,6 +105,16 @@ The `run` command supports the following optional flags (if set, will override c * `--no_verify`: (bool) Bypass message verification (default=False) * `--yes`/`-y`: (bool) Skip confirmation prompt and use defaults (default=False) +You can run the following commands in the MemGPT CLI prompt: +* `/exit`: Exit the CLI +* `/save`: Save a checkpoint of the current agent/conversation state +* `/dump`: View the current message log (see the contents of main context) +* `/memory`: Print the current contents of agent memory +* `/pop`: Undo the last message in the conversation +* `/heartbeat`: Send a heartbeat system message to the agent +* `/memorywarning`: Send a memory warning system message to the agent +Once you exit the CLI with `/exit`, you can resume chatting with the same agent by specifying the agent name in `memgpt run --agent `. + ### Adding Custom Personas/Humans You can add new human or persona definitions either by providing a file (using the `-f` flag) or text (using the `--text` flag). ``` @@ -135,6 +145,11 @@ memgpt load database --name ``` To encourage your agent to reference its archival memory, we recommend adding phrases like "search your archival memory..." for the best results. +You can view loaded data source with: +``` +memgpt list sources +``` + ### Using other endpoints ## Advanced From 3606959065f7eacc27ed08dcc3a95a2201e2ca69 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Mon, 30 Oct 2023 17:41:06 -0700 Subject: [PATCH 07/48] add readme --- README.md | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7efd9b7dc8..96c7a0c196 100644 --- a/README.md +++ b/README.md @@ -113,6 +113,8 @@ You can run the following commands in the MemGPT CLI prompt: * `/pop`: Undo the last message in the conversation * `/heartbeat`: Send a heartbeat system message to the agent * `/memorywarning`: Send a memory warning system message to the agent + + Once you exit the CLI with `/exit`, you can resume chatting with the same agent by specifying the agent name in `memgpt run --agent `. ### Adding Custom Personas/Humans @@ -130,28 +132,39 @@ You can view available persona and human files with the following command: memgpt list [human/persona] ``` -### Adding Data Sources -MemGPT supports pre-loading data into archival memory, so your agent can reference loaded data in your conversations with `memgpt load --name `, where the `--name` flag specifies a unique ID for the data source which you can use to attach the data source to the agent when running `memgpt run --data-source `. +### Data Sources +MemGPT supports pre-loading data into archival memory, so your agent can reference loaded data in your conversations with an agent by specifying the data source with the flag `memgpt run --data-source `. +#### Loading Data Loading from a directory: ``` # loading a directory -memgpt load directory --name [--input_dir ] [--input-files ...] [--recursive] +memgpt load directory --name \ + [--input_dir ] [--input-files ...] 
[--recursive] ``` Loading from a database: -``` +```sh # loading a database -memgpt load database --name +memgpt load database --name \ + --query \ # Query to run on database to get data + --dump_path \ # Path to dump file + --scheme \ # Database scheme + --host \ # Database host + --port \ # Database port + --user \ # Database user + --password \ # Database password + --dbname # Database name ``` To encourage your agent to reference its archival memory, we recommend adding phrases like "search your archival memory..." for the best results. +### Viewing available data sources You can view loaded data source with: ``` memgpt list sources ``` ### Using other endpoints - +AddingAddingAdding ## Advanced ### Adding new connectors From c48803c584472d69eb5760eacd2434a7bd7f7a43 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Mon, 30 Oct 2023 17:44:54 -0700 Subject: [PATCH 08/48] add readme --- README.md | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 96c7a0c196..2698f1f710 100644 --- a/README.md +++ b/README.md @@ -132,22 +132,23 @@ You can view available persona and human files with the following command: memgpt list [human/persona] ``` -### Data Sources +### Data Sources (i.e. chat with your data) MemGPT supports pre-loading data into archival memory, so your agent can reference loaded data in your conversations with an agent by specifying the data source with the flag `memgpt run --data-source `. #### Loading Data +We currently support loading from a directory and database dump. We highly encourage contributions for new data sources, which can be added as a new [CLI data load command](https://github.com/cpacker/MemGPT/blob/main/memgpt/cli/cli_load.py). + Loading from a directory: ``` # loading a directory memgpt load directory --name \ [--input_dir ] [--input-files ...] [--recursive] ``` -Loading from a database: +Loading from a database dump: ```sh -# loading a database memgpt load database --name \ --query \ # Query to run on database to get data - --dump_path \ # Path to dump file + --dump-path \ # Path to dump file --scheme \ # Database scheme --host \ # Database host --port \ # Database port @@ -157,14 +158,14 @@ memgpt load database --name \ ``` To encourage your agent to reference its archival memory, we recommend adding phrases like "search your archival memory..." for the best results. -### Viewing available data sources +#### Viewing available data sources You can view loaded data source with: ``` memgpt list sources ``` ### Using other endpoints -AddingAddingAdding + ## Advanced ### Adding new connectors From 40cdb236648c78ba3b6c19b95e86ad7881b4ad9e Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Mon, 30 Oct 2023 17:46:12 -0700 Subject: [PATCH 09/48] add readme --- README.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 2698f1f710..0f6a23abf5 100644 --- a/README.md +++ b/README.md @@ -136,9 +136,9 @@ memgpt list [human/persona] MemGPT supports pre-loading data into archival memory, so your agent can reference loaded data in your conversations with an agent by specifying the data source with the flag `memgpt run --data-source `. #### Loading Data -We currently support loading from a directory and database dump. We highly encourage contributions for new data sources, which can be added as a new [CLI data load command](https://github.com/cpacker/MemGPT/blob/main/memgpt/cli/cli_load.py). +We currently support loading from a directory and database dumps. 
We highly encourage contributions for new data sources, which can be added as a new [CLI data load command](https://github.com/cpacker/MemGPT/blob/main/memgpt/cli/cli_load.py). -Loading from a directory: +Loading from a directorsy: ``` # loading a directory memgpt load directory --name \ @@ -166,9 +166,6 @@ memgpt list sources ### Using other endpoints -## Advanced - -### Adding new connectors From ff43c9865839ed0156580b91bde51cda5cd3e94c Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Mon, 30 Oct 2023 17:53:40 -0700 Subject: [PATCH 10/48] add readme --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index 0f6a23abf5..7b18cd5026 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,22 @@ memgpt list sources ### Using other endpoints +#### Azure +To use MemGPT with Azure, expore the following variables and then re-run `memgpt configure`: +```sh +# see https://github.com/openai/openai-python#microsoft-azure-endpoints +export AZURE_OPENAI_KEY = ... +export AZURE_OPENAI_ENDPOINT = ... +export AZURE_OPENAI_VERSION = ... + +# set the below if you are using deployment ids +export AZURE_OPENAI_DEPLOYMENT = ... +export AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT = ... +``` + +#### Custom Endpoints +To use custom endpoints, run `export OPENAI_API_BASE=` and then re-run `memgpt configure` to set the custom endpoint as the default endpoint. + From 01db319eafc5cc6b06b6aaee11523355f6647c7d Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 31 Oct 2023 10:32:15 -0700 Subject: [PATCH 11/48] CLI bug fixes for azure --- memgpt/cli/cli.py | 8 ++++++++ memgpt/cli/cli_config.py | 10 +++++----- memgpt/config.py | 11 +++++++---- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/memgpt/cli/cli.py b/memgpt/cli/cli.py index 30ae919f79..64e50ade94 100644 --- a/memgpt/cli/cli.py +++ b/memgpt/cli/cli.py @@ -26,6 +26,10 @@ from memgpt.constants import MEMGPT_DIR from memgpt.agent import AgentAsync from memgpt.embeddings import embedding_model +from memgpt.openai_tools import ( + configure_azure_support, + check_azure_embeddings, +) def run( @@ -144,5 +148,9 @@ def run( # start event loop from memgpt.main import run_agent_loop + # setup azure if using + # TODO: cleanup this code + configure_azure_support() + loop = asyncio.get_event_loop() loop.run_until_complete(run_agent_loop(memgpt_agent, first, no_verify, config)) # TODO: add back no_verify diff --git a/memgpt/cli/cli_config.py b/memgpt/cli/cli_config.py index 905a007ab5..b916699801 100644 --- a/memgpt/cli/cli_config.py +++ b/memgpt/cli/cli_config.py @@ -37,10 +37,10 @@ def configure(): use_azure_deployment_ids = False if use_azure: # search for key in enviornment - azure_key = os.getenv("AZURE_API_KEY") - azure_endpoint = (os.getenv("AZURE_ENDPOINT"),) - azure_version = (os.getenv("AZURE_VERSION"),) - azure_deployment = (os.getenv("AZURE_OPENAI_DEPLOYMENT"),) + azure_key = os.getenv("AZURE_OPENAI_KEY") + azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT") + azure_version = os.getenv("AZURE_OPENAI_VERSION") + azure_deployment = os.getenv("AZURE_OPENAI_DEPLOYMENT") azure_embedding_deployment = os.getenv("AZURE_OPENAI_EMBEDDING_DEPLOYMENT") if all([azure_key, azure_endpoint, azure_version]): @@ -66,7 +66,7 @@ def configure(): endpoint_options = [] if os.getenv("OPENAI_API_BASE") is not None: endpoint_options.append(os.getenv("OPENAI_API_BASE")) - if os.getenv("AZURE_ENDPOINT") is not None: + if use_azure: endpoint_options += ["azure"] if use_openai: endpoint_options += ["openai"] diff --git 
a/memgpt/config.py b/memgpt/config.py index 1945d37cff..33ffe1cad8 100644 --- a/memgpt/config.py +++ b/memgpt/config.py @@ -110,8 +110,10 @@ def load(cls) -> "MemGPTConfig": azure_key = config.get("azure", "key") azure_endpoint = config.get("azure", "endpoint") azure_version = config.get("azure", "version") - azure_deployment = config.get("azure", "deployment") - azure_embedding_deployment = config.get("azure", "embedding_deployment") + azure_deployment = config.get("azure", "deployment") if config.has_option("azure", "deployment") else None + azure_embedding_deployment = ( + config.get("azure", "embedding_deployment") if config.has_option("azure", "embedding_deployment") else None + ) embedding_model = config.get("embedding", "model") embedding_dim = config.getint("embedding", "dim") @@ -167,8 +169,9 @@ def save(self): config.set("azure", "key", self.azure_key) config.set("azure", "endpoint", self.azure_endpoint) config.set("azure", "version", self.azure_version) - config.set("azure", "deployment", self.azure_deployment) - config.set("azure", "embedding_deployment", self.azure_embedding_deployment) + if self.azure_deployment: + config.set("azure", "deployment", self.azure_deployment) + config.set("azure", "embedding_deployment", self.azure_embedding_deployment) # embeddings config.add_section("embedding") From a11cef93c6279644229edbea9ff8f87d06655dfb Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 31 Oct 2023 11:02:00 -0700 Subject: [PATCH 12/48] check azure before running --- memgpt/cli/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/memgpt/cli/cli.py b/memgpt/cli/cli.py index 64e50ade94..ecbeb873f4 100644 --- a/memgpt/cli/cli.py +++ b/memgpt/cli/cli.py @@ -150,7 +150,8 @@ def run( # setup azure if using # TODO: cleanup this code - configure_azure_support() + if config.model_endpoint == "azure": + configure_azure_support() loop = asyncio.get_event_loop() loop.run_until_complete(run_agent_loop(memgpt_agent, first, no_verify, config)) # TODO: add back no_verify From fbe2482d5b3c24a9f7becdd6a2588ce75b8fb0b9 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 31 Oct 2023 12:46:16 -0700 Subject: [PATCH 13/48] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7b18cd5026..7bc67cc613 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@

🗃️ Chat with your data - talk to your SQL database or your local files!

SQL Database
-[demo GIF, alt text: "MemGPT demo video for sql search"]
+[demo GIF (updated tag), alt text: "MemGPT demo video for sql search"]
Local files
From 446a1a1c0749a18033a21622c504050a12d4deb7 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 31 Oct 2023 12:53:20 -0700 Subject: [PATCH 14/48] Update README.md --- README.md | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/README.md b/README.md index 7bc67cc613..6647336142 100644 --- a/README.md +++ b/README.md @@ -22,29 +22,10 @@
-[details block: "🗃️ Chat with your data - talk to your SQL database or your local files!"]
-SQL Database
+[details block: "🗃️ Chat with your data - talk to your local files or SQL database!"]
 [demo GIF, alt text: "MemGPT demo video for sql search"]
-Local files
-[demo GIF, alt text: "MemGPT demo video for sql search"]
-[details block: "📄 You can also talk to docs - for example ask about LlamaIndex!"]
-[demo GIF, alt text: "MemGPT demo video for llamaindex api docs search"]
-ChatGPT (GPT-4) when asked the same question:
-[screenshot, alt text: "GPT-4 when asked about llamaindex api docs"]
-(Question from https://github.com/run-llama/llama_index/issues/7756)
## Quick setup From 154148223c90547bf23bfa00bec411ffedf7e9e6 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 31 Oct 2023 13:06:02 -0700 Subject: [PATCH 15/48] bug fix with persona loading --- memgpt/cli/cli.py | 6 ++++-- memgpt/utils.py | 38 +++++++++++++++++++++++++++++++------- 2 files changed, 35 insertions(+), 9 deletions(-) diff --git a/memgpt/cli/cli.py b/memgpt/cli/cli.py index ecbeb873f4..88e1b1f65d 100644 --- a/memgpt/cli/cli.py +++ b/memgpt/cli/cli.py @@ -134,13 +134,15 @@ def run( agent_config.save() typer.secho(f"Created new agent {agent_config.name}.", fg=typer.colors.GREEN) + print(agent_config.human, agent_config.persona) + # create agent memgpt_agent = presets.use_preset( agent_config.preset, agent_config, agent_config.model, - agent_config.persona, - agent_config.human, + utils.get_persona_text(agent_config.persona), + utils.get_human_text(agent_config.human), memgpt.interface, persistence_manager, ) diff --git a/memgpt/utils.py b/memgpt/utils.py index e2146bc7fd..323a551c09 100644 --- a/memgpt/utils.py +++ b/memgpt/utils.py @@ -20,6 +20,8 @@ from llama_index import set_global_service_context, ServiceContext, VectorStoreIndex, load_index_from_storage, StorageContext from llama_index.embeddings import OpenAIEmbedding +from memgpt.embeddings import embedding_model + def count_tokens(s: str, model: str = "gpt-4") -> int: encoding = tiktoken.encoding_for_model(model) @@ -394,13 +396,11 @@ def get_index(name, docs): # read embedding confirguration # TODO: in the future, make an IngestData class that loads the config once - # config = MemGPTConfig.load() - # chunk_size = config.embedding_chunk_size - # model = config.embedding_model # TODO: actually use this - # dim = config.embedding_dim # TODO: actually use this - # embed_model = OpenAIEmbedding() - # service_context = ServiceContext.from_defaults(embed_model=embed_model, chunk_size=chunk_size) - # set_global_service_context(service_context) + config = MemGPTConfig.load() + embed_model = embedding_model(config) + chunk_size = config.embedding_chunk_size + service_context = ServiceContext.from_defaults(embed_model=embed_model, chunk_size=chunk_size) + set_global_service_context(service_context) # index documents index = VectorStoreIndex.from_documents(docs) @@ -477,3 +477,27 @@ def list_persona_files(): user_added = os.listdir(user_dir) user_added = [os.path.join(user_dir, f) for f in user_added] return memgpt_defaults + user_added + + +def get_human_text(name: str): + for file_path in list_human_files(): + file = os.path.basename(file_path) + if f"{name}.txt" == file or name == file: + return open(file_path, "r").read().strip() + raise ValueError(f"Human {name} not found") + + +def get_persona_text(name: str): + for file_path in list_persona_files(): + file = os.path.basename(file_path) + if f"{name}.txt" == file or name == file: + return open(file_path, "r").read().strip() + + raise ValueError(f"Persona {name} not found") + + +def get_human_text(name: str): + for file_path in list_human_files(): + file = os.path.basename(file_path) + if f"{name}.txt" == file or name == file: + return open(file_path, "r").read().strip() From 7a8eb80e2cee84238f93da6f37967de85d4d597c Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 31 Oct 2023 13:52:27 -0700 Subject: [PATCH 16/48] remove print --- memgpt/cli/cli.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/memgpt/cli/cli.py b/memgpt/cli/cli.py index 88e1b1f65d..2043fdbb46 100644 --- a/memgpt/cli/cli.py +++ b/memgpt/cli/cli.py @@ -134,8 +134,6 @@ def run( 
agent_config.save() typer.secho(f"Created new agent {agent_config.name}.", fg=typer.colors.GREEN) - print(agent_config.human, agent_config.persona) - # create agent memgpt_agent = presets.use_preset( agent_config.preset, From 36bb04dee51fa6c4d35892dc56dd4db7cf34d564 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Fri, 3 Nov 2023 13:04:34 -0700 Subject: [PATCH 17/48] make errors for cli flags more clear --- memgpt/cli/cli.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/memgpt/cli/cli.py b/memgpt/cli/cli.py index 7a2a210172..20dc1bdba0 100644 --- a/memgpt/cli/cli.py +++ b/memgpt/cli/cli.py @@ -108,9 +108,12 @@ def run( printd("Index path:", agent_config.save_agent_index_dir()) # persistence_manager = LocalStateManager(agent_config).load() # TODO: implement load # TODO: load prior agent state - assert not any( - [persona, human, model] - ), f"Cannot override existing agent state with command line arguments: {persona}, {human}, {model}" + if persona and persona != agent_config.persona: + raise ValueError(f"Cannot override {agent_config.name} existing persona {agent_config.persona} with {persona}") + if human and human != agent_config.human: + raise ValueError(f"Cannot override {agent_config.name} existing human {agent_config.human} with {human}") + if model and model != agent_config.model: + raise ValueError(f"Cannot override {agent_config.name} existing model {agent_config.model} with {model}") # load existing agent memgpt_agent = AgentAsync.load_agent(memgpt.interface, agent_config) From 6f50db1ba33af577949d1f717f8fd958d042aa55 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Fri, 3 Nov 2023 13:05:52 -0700 Subject: [PATCH 18/48] format --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 06e0924470..3435e25d5d 100644 --- a/README.md +++ b/README.md @@ -159,6 +159,7 @@ export AZURE_OPENAI_VERSION = ... export AZURE_OPENAI_DEPLOYMENT = ... export AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT = ... ``` + Note: your Azure endpoint must support functions or you will get an error. See https://github.com/cpacker/MemGPT/issues/91 for more information. 
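One caveat on the Azure block above: `export AZURE_OPENAI_KEY = ...` with spaces around `=` is not valid sh — the assignments need the `export VAR=value` form. Since `memgpt configure` discovers these settings purely through `os.getenv()` (see the cli_config.py hunk in patch 11), a preflight check catches misspelled names before configuring. A hedged sketch, not part of the patch series; note the series itself uses two spellings for the embedding deployment variable:

```python
# Preflight check (illustrative only) for the Azure settings that
# `memgpt configure` reads via os.getenv() in memgpt/cli/cli_config.py.
import os

required = ["AZURE_OPENAI_KEY", "AZURE_OPENAI_ENDPOINT", "AZURE_OPENAI_VERSION"]
missing = [var for var in required if not os.getenv(var)]
if missing:
    raise SystemExit(f"missing Azure settings: {', '.join(missing)}")

# Deployment ids are only needed when using deployments. The README exports
# AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT while patch 11's cli_config.py reads
# AZURE_OPENAI_EMBEDDING_DEPLOYMENT, so check both spellings.
for var in (
    "AZURE_OPENAI_DEPLOYMENT",
    "AZURE_OPENAI_EMBEDDING_DEPLOYMENT",
    "AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT",
):
    print(f"{var}={os.getenv(var)!r}")
```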
#### Custom Endpoints From 31282f5c2701d961ee5c070551e643a3e11fe072 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 31 Oct 2023 14:08:08 -0700 Subject: [PATCH 19/48] add initial postgres implementation --- memgpt/memory/archival.py | 191 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 memgpt/memory/archival.py diff --git a/memgpt/memory/archival.py b/memgpt/memory/archival.py new file mode 100644 index 0000000000..b9f1102487 --- /dev/null +++ b/memgpt/memory/archival.py @@ -0,0 +1,191 @@ +from abc import ABC, abstractmethod +import os +import datetime +import re +import faiss +import numpy as np +from typing import Optional, List, Tuple + +from memgpt.config import AgentConfig, MemGPTConfig +from .constants import MESSAGE_SUMMARY_WARNING_TOKENS, MEMGPT_DIR +from .utils import cosine_similarity, get_local_time, printd, count_tokens +from .prompts.gpt_summarize import SYSTEM as SUMMARY_PROMPT_SYSTEM +from memgpt import utils +from .openai_tools import ( + acompletions_with_backoff as acreate, + async_get_embedding_with_backoff, + get_embedding_with_backoff, + completions_with_backoff as create, +) +from llama_index import ( + VectorStoreIndex, + EmptyIndex, + get_response_synthesizer, + load_index_from_storage, + StorageContext, +) +from llama_index.retrievers import VectorIndexRetriever +from llama_index.query_engine import RetrieverQueryEngine +from llama_index.indices.postprocessor import SimilarityPostprocessor + +# TODO: move to different file +import psycopg2 +from sqlalchemy import make_url + + +class ArchivalMemory(ABC): + @abstractmethod + def insert(self, memory_string): + """Insert new archival memory + + :param memory_string: Memory string to insert + :type memory_string: str + """ + pass + + @abstractmethod + def search(self, query_string, count=None, start=None) -> Tuple[List[str], int]: + """Search archival memory + + :param query_string: Query string + :type query_string: str + :param count: Number of results to return (None for all) + :type count: Optional[int] + :param start: Offset to start returning results from (None if 0) + :type start: Optional[int] + + :return: Tuple of (list of results, total number of results) + """ + pass + + @abstractmethod + def __repr__(self) -> str: + pass + + +class LocalArchivalMemory(ArchivalMemory): + """Archival memory built on top of Llama Index""" + + def __init__(self, agent_config, top_k: Optional[int] = 100): + """Init function for archival memory + + :param archiva_memory_database: name of dataset to pre-fill archival with + :type archival_memory_database: str + """ + + self.top_k = top_k + self.agent_config = agent_config + + # locate saved index + if self.agent_config.data_source is not None: # connected data source + directory = f"{MEMGPT_DIR}/archival/{self.agent_config.data_source}" + assert os.path.exists(directory), f"Archival memory database {self.agent_config.data_source} does not exist" + elif self.agent_config.name is not None: + directory = agent_config.save_agent_index_dir() + if not os.path.exists(directory): + # no existing archival storage + directory = None + + # load/create index + if directory: + storage_context = StorageContext.from_defaults(persist_dir=directory) + self.index = load_index_from_storage(storage_context) + else: + self.index = EmptyIndex() + + # create retriever + if isinstance(self.index, EmptyIndex): + self.retriever = None # cant create retriever over empty indes + else: + self.retriever = VectorIndexRetriever( + index=self.index, # does this 
get refreshed? + similarity_top_k=self.top_k, + ) + + # TODO: have some mechanism for cleanup otherwise will lead to OOM + self.cache = {} + + def save(self): + """Save the index to disk""" + if self.agent_config.data_source: # update original archival index + # TODO: this corrupts the originally loaded data. do we want to do this? + utils.save_index(self.index, self.agent_config.data_source) + else: + utils.save_agent_index(self.index, self.agent_config) + + async def insert(self, memory_string): + self.index.insert(memory_string) + + # TODO: figure out if this needs to be refreshed (probably not) + self.retriever = VectorIndexRetriever( + index=self.index, + similarity_top_k=self.top_k, + ) + + async def search(self, query_string, count=None, start=None): + if self.retriever is None: + print("Warning: archival memory is empty") + return [], 0 + + start = start if start else 0 + count = count if count else self.top_k + count = min(count + start, self.top_k) + + if query_string not in self.cache: + self.cache[query_string] = self.retriever.retrieve(query_string) + + results = self.cache[query_string][start : start + count] + results = [{"timestamp": get_local_time(), "content": node.node.text} for node in results] + # from pprint import pprint + # pprint(results) + return results, len(results) + + async def a_search(self, query_string, count=None, start=None): + return self.search(query_string, count, start) + + def __repr__(self) -> str: + print(self.index.ref_doc_info) + return "" + + +class PostgresArchivalMemory(ArchivalMemory): + def __init__( + self, + agent_config: AgentConfig, + connection_string: str, + db_name: str, + ): + self.agent_config = agent_config + self.connection_string = connection_string + self.db_name = db_name + self.table_name = "archival_memory" + self.top_k = 100 + + # create table + self.conn = psycopg2.connect(self.connection_string) + self.conn.autocommit = True + + with self.conn.cursor() as c: + c.execute(f"DROP DATABASE IF EXISTS {db_name}") + c.execute(f"CREATE DATABASE {db_name}") + + url = make_url(connection_string) + vector_store = PGVectorStore.from_params( + database=self.db_name, + host=url.host, + password=url.password, + port=url.port, + user=url.username, + table_name=self.table_name, + embed_dim=MemGPTConfig.load().embedding_dim, # openai embedding dimension + ) + + storage_context = StorageContext.from_defaults(vector_store=vector_store) + index = VectorStoreIndex.from_documents(documents, storage_context=storage_context, show_progress=True) + query_engine = index.as_query_engine() + + # create retriever + self.retriever = VectorIndexRetriever( + index=self.index, + similarity_top_k=self.top_k, + ) From d9e137c9e82efbd30c9731b07329f2bda8e0c467 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 31 Oct 2023 18:27:04 -0700 Subject: [PATCH 20/48] working chroma loading --- memgpt/embeddings.py | 70 +++++++++++++++++++++++++++++++++++ memgpt/memory/archival.py | 41 ++++++++++++++++++++ memgpt/utils.py | 2 + tests/test_load_archival.py | 74 +++++++++++++++++++++++++++++++++++++ 4 files changed, 187 insertions(+) diff --git a/memgpt/embeddings.py b/memgpt/embeddings.py index 20c6040ed9..44bf1ea07d 100644 --- a/memgpt/embeddings.py +++ b/memgpt/embeddings.py @@ -36,3 +36,73 @@ def embedding_model(): # loads BAAI/bge-small-en-v1.5 return HuggingFaceEmbedding(model_name=model) + + +class Index: + def __init__(self, name: str, save_directory: Optional[str] = None): + + config = MemGPTConfig.load() + self.save_directory = save_directory + + # setup 
storage + self.storage_type = config.archival_storage_type + print("VECTORDB CONFIG", self.save_directory, self.storage_type) + if config.archival_storage_type == "local": + self.storage_context = StorageContext.from_defaults(persist_dir=self.save_directory) + else: + if config.archival_storage_type == "postgres": + from llama_index.vector_stores import PGVectorStore + + self.vector_store = PGVectorStore.from_params( + database=self.db_name, + host=url.host, + password=url.password, + port=url.port, + user=url.username, + table_name=name, # table_name = data source name + embed_dim=MemGPTConfig.load().embedding_dim, # openai embedding dimension + ) + elif config.archival_storage_type == "chroma": + from llama_index.vector_stores import ChromaVectorStore + import chromadb + + print("use chroma") + # chroma_client = chromadb.EphemeralClient() + chroma_client = chromadb.PersistentClient(path="/Users/sarahwooders/repos/MemGPT/chromadb") + chroma_collection = chroma_client.get_or_create_collection(name) + self.vector_store = ChromaVectorStore(chroma_collection=chroma_collection) + else: + raise ValueError(f"Unknown archival storage type {config.archival_storage_type}") + self.storage_context = StorageContext.from_defaults(vector_store=self.vector_store) + + # setup embedding model + self.embed_model = embedding_model(config) + + # setup service context + self.service_context = ServiceContext.from_defaults(llm=None, embed_model=self.embed_model, chunk_size=config.embedding_chunk_size) + + def load_documents(self, documents): + self.index = VectorStoreIndex.from_documents( + documents, storage_context=self.storage_context, service_context=self.service_context, show_progress=True + ) + if self.storage_type == "local": + # save to disk if local + self.index.storage_context.persist(persist_dir=self.directory) # TODO: + + def load_index(self, index_dir: str): + storage_context = StorageContext.from_defaults(persist_dir=index_dir) + self.index = load_index_from_storage(storage_context) + + # persist + + def get_index(self): + if self.index: + # index already loaded + return self.index + + if self.storage_type == "local": + self.index = load_index_from_storage(self.storage_context) + else: + self.index = VectorStoreIndex.from_vector_store(vector_store=self.vector_store) + + return self.index diff --git a/memgpt/memory/archival.py b/memgpt/memory/archival.py index b9f1102487..bf057b7781 100644 --- a/memgpt/memory/archival.py +++ b/memgpt/memory/archival.py @@ -34,6 +34,9 @@ class ArchivalMemory(ABC): + + """Wrapper around Llama Index VectorStoreIndex""" + @abstractmethod def insert(self, memory_string): """Insert new archival memory @@ -189,3 +192,41 @@ def __init__( index=self.index, similarity_top_k=self.top_k, ) + + +class ChromaArchivalMemory(ArchivalMemory): + + import chromadb + + def __init__( + self, + agent_config: AgentConfig, + top_k: int = 100, + ): + self.agent_config = agent_config + self.data_source_name = agent_config.data_source + + # connect to client + self.client = chromadb.Client() + # client = chromadb.PersistentClient(path="/path/to/save/to") + self.collection = self.client.get_collection(self.data_source_name) + + # TODO: have some mechanism for cleanup otherwise will lead to OOM + self.cache = {} + + def search(self, query_string, count=None, start=None): + + start = start if start else 0 + count = count if count else self.top_k + count = min(count + start, self.top_k) + + if query_string not in self.cache: + self.cache[query_string] = self.collection.query( + 
query_texts=[query_string], + ) + + results = self.cache[query_string][start : start + count] + results = [{"timestamp": get_local_time(), "content": node.node.text} for node in results] + # from pprint import pprint + # pprint(results) + return results, len(results) diff --git a/memgpt/utils.py b/memgpt/utils.py index ff87fd17f6..cf65169619 100644 --- a/memgpt/utils.py +++ b/memgpt/utils.py @@ -20,6 +20,8 @@ from llama_index import set_global_service_context, ServiceContext, VectorStoreIndex, load_index_from_storage, StorageContext from llama_index.embeddings import OpenAIEmbedding +from memgpt.embeddings import embedding_model + def count_tokens(s: str, model: str = "gpt-4") -> int: encoding = tiktoken.encoding_for_model(model) diff --git a/tests/test_load_archival.py b/tests/test_load_archival.py index d21eb7c244..15e8e15b8c 100644 --- a/tests/test_load_archival.py +++ b/tests/test_load_archival.py @@ -92,6 +92,77 @@ def test_chroma(): # assert len(results) == 2, f"Expected 2 results, but got {len(results)}" +def test_postgres(): + + # override config path with enviornment variable + # TODO: make into temporary file + os.environ["MEMGPT_CONFIG_PATH"] = "/Users/sarahwooders/repos/MemGPT/test_config.cfg" + print("env", os.getenv("MEMGPT_CONFIG_PATH")) + config = memgpt.config.MemGPTConfig(archival_storage_type="postgres", config_path=os.getenv("MEMGPT_CONFIG_PATH")) + print(config) + config.save() + # exit() + + name = "tmp_hf_dataset" + + dataset = load_dataset("MemGPT/example_short_stories") + + cache_dir = os.getenv("HF_DATASETS_CACHE") + if cache_dir is None: + # Construct the default path if the environment variable is not set. + cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "datasets") + + load_directory( + name=name, + input_dir=cache_dir, + recursive=True, + ) + + +def test_chroma(): + + import chromadb + + # override config path with enviornment variable + # TODO: make into temporary file + os.environ["MEMGPT_CONFIG_PATH"] = "/Users/sarahwooders/repos/MemGPT/test_config.cfg" + print("env", os.getenv("MEMGPT_CONFIG_PATH")) + config = memgpt.config.MemGPTConfig(archival_storage_type="chroma", config_path=os.getenv("MEMGPT_CONFIG_PATH")) + print(config) + config.save() + # exit() + + name = "tmp_hf_dataset" + + dataset = load_dataset("MemGPT/example_short_stories") + + cache_dir = os.getenv("HF_DATASETS_CACHE") + if cache_dir is None: + # Construct the default path if the environment variable is not set. 
+ cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "datasets") + + config = memgpt.config.MemGPTConfig(archival_storage_type="chroma") + + load_directory( + name=name, + input_dir=cache_dir, + recursive=True, + ) + + # index = memgpt.embeddings.Index(name) + + ## query chroma + ##chroma_client = chromadb.Client() + # chroma_client = chromadb.PersistentClient(path="/Users/sarahwooders/repos/MemGPT/chromadb") + # collection = chroma_client.get_collection(name=name) + # results = collection.query( + # query_texts=["cinderella be getting sick"], + # n_results=2 + # ) + # print(results) + # assert len(results) == 2, f"Expected 2 results, but got {len(results)}" + + def test_load_directory(): return # downloading hugging face dataset (if does not exist) @@ -200,3 +271,6 @@ def test_load_database(): ) print("Successfully loaded into index") assert True + + +test_chroma() From 25b45f13033a6bb2bdce2d32de80def662488b27 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 31 Oct 2023 19:50:07 -0700 Subject: [PATCH 21/48] add postgres tests --- memgpt/embeddings.py | 14 +++++++++++++- tests/test_load_archival.py | 3 ++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/memgpt/embeddings.py b/memgpt/embeddings.py index 44bf1ea07d..129c6a980d 100644 --- a/memgpt/embeddings.py +++ b/memgpt/embeddings.py @@ -52,9 +52,15 @@ def __init__(self, name: str, save_directory: Optional[str] = None): else: if config.archival_storage_type == "postgres": from llama_index.vector_stores import PGVectorStore + from sqlalchemy import make_url + + connection_string = "" # TODO: read from config + url = make_url(connection_string) + + print("table", name) self.vector_store = PGVectorStore.from_params( - database=self.db_name, + database=url.database, host=url.host, password=url.password, port=url.port, @@ -74,6 +80,7 @@ def __init__(self, name: str, save_directory: Optional[str] = None): else: raise ValueError(f"Unknown archival storage type {config.archival_storage_type}") self.storage_context = StorageContext.from_defaults(vector_store=self.vector_store) + print("storage context", self.storage_context) # setup embedding model self.embed_model = embedding_model(config) @@ -85,9 +92,14 @@ def load_documents(self, documents): self.index = VectorStoreIndex.from_documents( documents, storage_context=self.storage_context, service_context=self.service_context, show_progress=True ) + print("loaded docs") if self.storage_type == "local": # save to disk if local self.index.storage_context.persist(persist_dir=self.directory) # TODO: + print("saved local") + else: + self.index.storage_context.persist() + print("saved storage") def load_index(self, index_dir: str): storage_context = StorageContext.from_defaults(persist_dir=index_dir) diff --git a/tests/test_load_archival.py b/tests/test_load_archival.py index 15e8e15b8c..2e22ea51e0 100644 --- a/tests/test_load_archival.py +++ b/tests/test_load_archival.py @@ -273,4 +273,5 @@ def test_load_database(): assert True -test_chroma() +test_postgres() +# test_chroma() From 54bd66d16f18bc8e915ccad002db32fd8be82933 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 31 Oct 2023 20:24:29 -0700 Subject: [PATCH 22/48] working initial load into postgres and chroma --- memgpt/embeddings.py | 57 +++++++++++++++++++++---------------- memgpt/utils.py | 2 +- tests/test_load_archival.py | 2 +- 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/memgpt/embeddings.py b/memgpt/embeddings.py index 129c6a980d..de64069104 100644 --- 
a/memgpt/embeddings.py +++ b/memgpt/embeddings.py @@ -46,7 +46,6 @@ def __init__(self, name: str, save_directory: Optional[str] = None): # setup storage self.storage_type = config.archival_storage_type - print("VECTORDB CONFIG", self.save_directory, self.storage_type) if config.archival_storage_type == "local": self.storage_context = StorageContext.from_defaults(persist_dir=self.save_directory) else: @@ -54,11 +53,9 @@ def __init__(self, name: str, save_directory: Optional[str] = None): from llama_index.vector_stores import PGVectorStore from sqlalchemy import make_url - connection_string = "" # TODO: read from config + connection_string = config.archival_storage_uri url = make_url(connection_string) - print("table", name) - self.vector_store = PGVectorStore.from_params( database=url.database, host=url.host, @@ -66,14 +63,14 @@ def __init__(self, name: str, save_directory: Optional[str] = None): port=url.port, user=url.username, table_name=name, # table_name = data source name - embed_dim=MemGPTConfig.load().embedding_dim, # openai embedding dimension + embed_dim=config.embedding_dim, # openai embedding dimension ) elif config.archival_storage_type == "chroma": from llama_index.vector_stores import ChromaVectorStore import chromadb - print("use chroma") # chroma_client = chromadb.EphemeralClient() + # TODO: connect to storage URI if provided chroma_client = chromadb.PersistentClient(path="/Users/sarahwooders/repos/MemGPT/chromadb") chroma_collection = chroma_client.get_or_create_collection(name) self.vector_store = ChromaVectorStore(chroma_collection=chroma_collection) @@ -88,33 +85,45 @@ def __init__(self, name: str, save_directory: Optional[str] = None): # setup service context self.service_context = ServiceContext.from_defaults(llm=None, embed_model=self.embed_model, chunk_size=config.embedding_chunk_size) + # load index (if exists) + # TODO: make sure this doesn't cause an error if the index doesn't exist yet + if self.storage_type == "local": + # load from disk if local + self.index = load_index_from_storage(self.storage_context) + else: + # load from vector store + self.index = VectorStoreIndex.from_vector_store(vector_store=self.vector_store) + def load_documents(self, documents): + """Load a list of documents into an index + + :param documents: List of documents to create an index with + :type documents: List[Document] + """ + # need to remove problematic characters to avoid errors + for doc in documents: + doc.text = doc.text.replace("\x00", "\uFFFD") # hacky fix for error on null characters + + # create index self.index = VectorStoreIndex.from_documents( documents, storage_context=self.storage_context, service_context=self.service_context, show_progress=True ) - print("loaded docs") + + # persist state if self.storage_type == "local": # save to disk if local self.index.storage_context.persist(persist_dir=self.directory) # TODO: - print("saved local") else: self.index.storage_context.persist() - print("saved storage") - - def load_index(self, index_dir: str): - storage_context = StorageContext.from_defaults(persist_dir=index_dir) - self.index = load_index_from_storage(storage_context) - # persist + def update(self, documents): + """Update an index with new documents - def get_index(self): - if self.index: - # index already loaded - return self.index - - if self.storage_type == "local": - self.index = load_index_from_storage(self.storage_context) - else: - self.index = VectorStoreIndex.from_vector_store(vector_store=self.vector_store) + :param documents: List of documents to update 
an index with + :type documents: List[Document] + """ + # need to remove problematic characters to avoid errors + for doc in documents: + doc.text = doc.text.replace("\x00", "\uFFFD") - return self.index + # TODO: make sure document is persisted in the remote DB diff --git a/memgpt/utils.py b/memgpt/utils.py index cf65169619..4a088a43d9 100644 --- a/memgpt/utils.py +++ b/memgpt/utils.py @@ -350,7 +350,7 @@ def estimate_openai_cost(docs): from llama_index.callbacks import CallbackManager, TokenCountingHandler import tiktoken - embed_model = MockEmbedding(embed_dim=1536) + embed_model = MockEmbedding(embed_dim=768) token_counter = TokenCountingHandler(tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode) diff --git a/tests/test_load_archival.py b/tests/test_load_archival.py index 2e22ea51e0..cf32cb83e4 100644 --- a/tests/test_load_archival.py +++ b/tests/test_load_archival.py @@ -103,7 +103,7 @@ def test_postgres(): config.save() # exit() - name = "tmp_hf_dataset" + name = "tmp_hf_dataset2" dataset = load_dataset("MemGPT/example_short_stories") From 3632c3f512c0e437e7342422a3b4ca371cd68662 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Wed, 1 Nov 2023 10:10:28 -0700 Subject: [PATCH 23/48] add load index command --- memgpt/embeddings.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/memgpt/embeddings.py b/memgpt/embeddings.py index de64069104..b2a23ad62a 100644 --- a/memgpt/embeddings.py +++ b/memgpt/embeddings.py @@ -108,6 +108,9 @@ def load_documents(self, documents): self.index = VectorStoreIndex.from_documents( documents, storage_context=self.storage_context, service_context=self.service_context, show_progress=True ) + self.persist() + + def persist(self): # persist state if self.storage_type == "local": From fe256826ec5efe915871d43c97ab786b5329f55f Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Wed, 1 Nov 2023 11:22:39 -0700 Subject: [PATCH 24/48] semi working load index --- memgpt/embeddings.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/memgpt/embeddings.py b/memgpt/embeddings.py index b2a23ad62a..e54130ea34 100644 --- a/memgpt/embeddings.py +++ b/memgpt/embeddings.py @@ -94,6 +94,10 @@ def __init__(self, name: str, save_directory: Optional[str] = None): # load from vector store self.index = VectorStoreIndex.from_vector_store(vector_store=self.vector_store) + def load_nodes(self, nodes): + self.index.build_index_from_nodes(nodes=nodes) + self.persist() + def load_documents(self, documents): """Load a list of documents into an index From 732e732fdf8310a9ba2c5dff33b636bb6a6b9005 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Wed, 1 Nov 2023 16:55:41 -0700 Subject: [PATCH 25/48] disgusting import code thanks to llama index's nasty APIs --- memgpt/cli/cli.py | 1 + memgpt/embeddings.py | 81 +++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/memgpt/cli/cli.py b/memgpt/cli/cli.py index ac115fdfe6..87b73f52a4 100644 --- a/memgpt/cli/cli.py +++ b/memgpt/cli/cli.py @@ -30,6 +30,7 @@ configure_azure_support, check_azure_embeddings, ) +from memgpt.embeddings import Index def run( diff --git a/memgpt/embeddings.py b/memgpt/embeddings.py index e54130ea34..0372a802df 100644 --- a/memgpt/embeddings.py +++ b/memgpt/embeddings.py @@ -1,5 +1,6 @@ import typer from llama_index.embeddings import OpenAIEmbedding +from llama_index.schema import BaseComponent, TextNode, Document def embedding_model(): @@ -65,6 +66,9 @@ def __init__(self, name: str, save_directory: Optional[str] = None): table_name=name, # table_name = 
data source name embed_dim=config.embedding_dim, # openai embedding dimension ) + self.uri = config.archival_storage_uri + self.table_name = "data_%s" % name.lower() # TODO: figure out exactly what this is + print("TABLE NAME", self.table_name) elif config.archival_storage_type == "chroma": from llama_index.vector_stores import ChromaVectorStore import chromadb @@ -95,7 +99,15 @@ def __init__(self, name: str, save_directory: Optional[str] = None): self.index = VectorStoreIndex.from_vector_store(vector_store=self.vector_store) def load_nodes(self, nodes): + """Loads a list of LlamaIndex nodes into index + + :param nodes: List of nodes to create an index with + :type nodes: List[TextNode] + """ + for node in nodes: + node.text = node.text.replace("\x00", "\uFFFD") # hacky fix for error on null characters self.index.build_index_from_nodes(nodes=nodes) + print(f"Added {len(nodes)} nodes") self.persist() def load_documents(self, documents): @@ -123,7 +135,15 @@ def persist(self): else: self.index.storage_context.persist() - def update(self, documents): + def insert(self, text: str, embedding: Optional[List[float]] = None): + """Insert new string into index + + :param text: String to insert into index + :type text: str + """ + self.index.insert(Document(text=text, embedding=embedding)) + + def update(self, documents, embeddings=[]): """Update an index with new documents :param documents: List of documents to update an index with @@ -134,3 +154,62 @@ def update(self, documents): doc.text = doc.text.replace("\x00", "\uFFFD") # TODO: make sure document is persisted in the remote DB + + # TODO: allow for existing embeddings + + def get_nodes(self): + """Get the list of nodes from an index (useful for moving data from one index to another) + + :return: Nodes contained in index + :rtype: List[TextNode] + """ + + if self.storage_type == "local": + embed_dict = self.index._vector_store._data.embedding_dict + node_dict = self.index._docstore.docs + + nodes = [] + for node_id, node in node_dict.items(): + vector = embed_dict[node_id] + node.embedding = vector + nodes.append(node) + return nodes + elif self.storage_type == "postgres": + from sqlalchemy import create_engine, MetaData, Table, select + + engine = create_engine(self.uri) + metadata = MetaData() + # data_table = Table(self.table_name, metadata, autoload_with=engine, schema='public')k + print(self.vector_store._table_class) + + # Initialize a list to store the Node objects + nodes = [] + + # Start a connection to the database + with engine.connect() as conn: + # Select all data from the table + select_stmt = select(self.vector_store._table_class) + results = conn.execute(select_stmt).all() + + print(results[0]) + print("DATA", results[1].embedding, results[1].text) + + # Iterate over the rows to create Node objects + for row in results: + # Assuming that 'text' is the document and 'embedding' is the binary representation of the embedding + # If 'embedding' is stored in a different format, you might need to adjust the code to handle it correctly + # import json + # document = json.loads(row[1]) + try: + node = Document(document=row.text, embedding=list(row.embedding)) + except Exception as e: + print(row) + raise e + nodes.append(node) + print("nodes", len(nodes)) + return nodes + + elif self.storage_type == "chroma": + raise NotImplementedError("TODO") + else: + raise ValueError(f"Unknown archival storage type {self.storage_type}") From ac6638c6e46c553b67908b2a2f94e222c19e1826 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 2 Nov 2023 
13:06:51 -0700 Subject: [PATCH 26/48] add postgres connector --- memgpt/embeddings.py | 21 +++++++++++++++++---- memgpt/memory.py | 5 +++++ 2 files changed, 22 insertions(+), 4 deletions(-) diff --git a/memgpt/embeddings.py b/memgpt/embeddings.py index 0372a802df..1f4fa4f882 100644 --- a/memgpt/embeddings.py +++ b/memgpt/embeddings.py @@ -47,6 +47,7 @@ def __init__(self, name: str, save_directory: Optional[str] = None): # setup storage self.storage_type = config.archival_storage_type + print("storage type", self.storage_type) if config.archival_storage_type == "local": self.storage_context = StorageContext.from_defaults(persist_dir=self.save_directory) else: @@ -97,6 +98,7 @@ def __init__(self, name: str, save_directory: Optional[str] = None): else: # load from vector store self.index = VectorStoreIndex.from_vector_store(vector_store=self.vector_store) + # print("BASIC QUERY", self.index.as_query_engine().query("cinderella crying")) def load_nodes(self, nodes): """Loads a list of LlamaIndex nodes into index @@ -104,8 +106,8 @@ def load_nodes(self, nodes): :param nodes: List of nodes to create an index with :type nodes: List[TextNode] """ - for node in nodes: - node.text = node.text.replace("\x00", "\uFFFD") # hacky fix for error on null characters + # for node in nodes: + # node.text = node.text.replace("\x00", "\uFFFD") # hacky fix for error on null characters self.index.build_index_from_nodes(nodes=nodes) print(f"Added {len(nodes)} nodes") self.persist() @@ -141,7 +143,10 @@ def insert(self, text: str, embedding: Optional[List[float]] = None): :param text: String to insert into index :type text: str """ - self.index.insert(Document(text=text, embedding=embedding)) + if embedding is None: + self.index.insert(text) + else: + self.index.insert(Document(text=text, embedding=embedding)) def update(self, documents, embeddings=[]): """Update an index with new documents @@ -201,7 +206,15 @@ def get_nodes(self): # import json # document = json.loads(row[1]) try: - node = Document(document=row.text, embedding=list(row.embedding)) + + node = TextNode( + id_=row.node_id, + text=row.text, + metadata=row.metadata, + embedding=row.embedding, + ) + print(node) + # node = Document(document=row.text, embedding=list(row.embedding)) except Exception as e: print(row) raise e diff --git a/memgpt/memory.py b/memgpt/memory.py index 6f5efa43d9..03c890232d 100644 --- a/memgpt/memory.py +++ b/memgpt/memory.py @@ -30,10 +30,15 @@ from llama_index.query_engine import RetrieverQueryEngine from llama_index.indices.postprocessor import SimilarityPostprocessor +<<<<<<< HEAD from memgpt.embeddings import embedding_model from memgpt.config import MemGPTConfig from memgpt.embeddings import embedding_model +======= +from memgpt.embeddings import Index, embedding_model +from memgpt.connectors.storage import StorageConnector, Passage +>>>>>>> b55cdad (add postgres connector) from memgpt.config import MemGPTConfig From 8787936820c8b7b7b5f95ded5571db7bf987d9d4 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 2 Nov 2023 14:33:24 -0700 Subject: [PATCH 27/48] working postgres integration --- memgpt/embeddings.py | 39 +++++++++++++++++++++------------------ memgpt/memory.py | 8 -------- 2 files changed, 21 insertions(+), 26 deletions(-) diff --git a/memgpt/embeddings.py b/memgpt/embeddings.py index 1f4fa4f882..5f6d9e28e5 100644 --- a/memgpt/embeddings.py +++ b/memgpt/embeddings.py @@ -2,6 +2,8 @@ from llama_index.embeddings import OpenAIEmbedding from llama_index.schema import BaseComponent, TextNode, Document +from 
memgpt.connectors.storage import PostgresStorageConnector + def embedding_model(): """Return LlamaIndex embedding model to use for embeddings""" @@ -52,24 +54,25 @@ def __init__(self, name: str, save_directory: Optional[str] = None): self.storage_context = StorageContext.from_defaults(persist_dir=self.save_directory) else: if config.archival_storage_type == "postgres": - from llama_index.vector_stores import PGVectorStore - from sqlalchemy import make_url - - connection_string = config.archival_storage_uri - url = make_url(connection_string) - - self.vector_store = PGVectorStore.from_params( - database=url.database, - host=url.host, - password=url.password, - port=url.port, - user=url.username, - table_name=name, # table_name = data source name - embed_dim=config.embedding_dim, # openai embedding dimension - ) - self.uri = config.archival_storage_uri - self.table_name = "data_%s" % name.lower() # TODO: figure out exactly what this is - print("TABLE NAME", self.table_name) + self.storage = PostgresStorageConnector(name) + # from llama_index.vector_stores import PGVectorStore + # from sqlalchemy import make_url + + # connection_string = config.archival_storage_uri + # url = make_url(connection_string) + + # self.vector_store = PGVectorStore.from_params( + # database=url.database, + # host=url.host, + # password=url.password, + # port=url.port, + # user=url.username, + # table_name=name, # table_name = data source name + # embed_dim=config.embedding_dim, # openai embedding dimension + # ) + # self.uri = config.archival_storage_uri + # self.table_name = "data_%s" % name.lower() # TODO: figure out exactly what this is + # print("TABLE NAME", self.table_name) elif config.archival_storage_type == "chroma": from llama_index.vector_stores import ChromaVectorStore import chromadb diff --git a/memgpt/memory.py b/memgpt/memory.py index 03c890232d..135a8d49e4 100644 --- a/memgpt/memory.py +++ b/memgpt/memory.py @@ -30,17 +30,9 @@ from llama_index.query_engine import RetrieverQueryEngine from llama_index.indices.postprocessor import SimilarityPostprocessor -<<<<<<< HEAD from memgpt.embeddings import embedding_model from memgpt.config import MemGPTConfig -from memgpt.embeddings import embedding_model -======= -from memgpt.embeddings import Index, embedding_model -from memgpt.connectors.storage import StorageConnector, Passage ->>>>>>> b55cdad (add postgres connector) -from memgpt.config import MemGPTConfig - class CoreMemory(object): """Held in-context inside the system message From 6e6b3e1a1c1afae5b14c66b1123f6bd9c84e397a Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 2 Nov 2023 17:19:40 -0700 Subject: [PATCH 28/48] working local storage (changed saving) --- memgpt/connectors/storage.py | 16 ++++++++++++++++ memgpt/memory.py | 5 +++++ 2 files changed, 21 insertions(+) diff --git a/memgpt/connectors/storage.py b/memgpt/connectors/storage.py index 7839e5ce34..df4e4fc6b2 100644 --- a/memgpt/connectors/storage.py +++ b/memgpt/connectors/storage.py @@ -13,6 +13,22 @@ import numpy as np from tqdm import tqdm +from llama_index import ( + VectorStoreIndex, + EmptyIndex, + get_response_synthesizer, + load_index_from_storage, + StorageContext, + ServiceContext, +) +from llama_index.retrievers import VectorIndexRetriever +from llama_index.query_engine import RetrieverQueryEngine +from llama_index.indices.postprocessor import SimilarityPostprocessor +from llama_index.schema import BaseComponent, TextNode, Document + + +from memgpt.constants import MEMGPT_DIR + from memgpt.config import AgentConfig, 
MemGPTConfig diff --git a/memgpt/memory.py b/memgpt/memory.py index 135a8d49e4..c407bf665d 100644 --- a/memgpt/memory.py +++ b/memgpt/memory.py @@ -758,6 +758,11 @@ def __init__(self, agent_config, top_k: Optional[int] = 100): self.embed_model = embedding_model() self.embedding_chunk_size = config.embedding_chunk_size + # create parser + self.parser = SimpleNodeParser.from_defaults( + chunk_size=config.embedding_chunk_size, + ) + # create storage backend self.storage = StorageConnector.get_storage_connector(agent_config=agent_config) # TODO: have some mechanism for cleanup otherwise will lead to OOM From b409eac7fb4a8bf9313cb776bae1e8458f572981 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 2 Nov 2023 18:04:30 -0700 Subject: [PATCH 29/48] implement /attach --- memgpt/main.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/memgpt/main.py b/memgpt/main.py index ac78e08227..6e5a81f0d4 100644 --- a/memgpt/main.py +++ b/memgpt/main.py @@ -470,6 +470,25 @@ async def run_agent_loop(memgpt_agent, first, no_verify=False, cfg=None, strip_u ) continue + elif user_input.lower() == "/attach": + if legacy: + typer.secho("Error: /attach is not supported in legacy mode.", fg=typer.colors.RED, bold=True) + continue + + # TODO: check if agent already has it + data_source_options = StorageConnector.list_loaded_data() + data_source = await questionary.select("Select data source", choices=data_source_options).ask_async() + + # attach new data + attach(memgpt_agent.config.name, data_source) + + # reload agent with new data source + # TODO: maybe make this less ugly... + memgpt_agent.persistence_manager.archival_memory.storage = StorageConnector.get_storage_connector( + agent_config=memgpt_agent.config + ) + continue + elif user_input.lower() == "/dump": await memgpt.interface.print_messages(memgpt_agent.messages) continue From b7842ec62220559464ddfa26d403901d140a967f Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 2 Nov 2023 18:43:37 -0700 Subject: [PATCH 30/48] remove old code --- memgpt/embeddings.py | 194 ------------------------------- memgpt/memory/archival.py | 232 -------------------------------------- 2 files changed, 426 deletions(-) delete mode 100644 memgpt/memory/archival.py diff --git a/memgpt/embeddings.py b/memgpt/embeddings.py index 5f6d9e28e5..4f1a4093fc 100644 --- a/memgpt/embeddings.py +++ b/memgpt/embeddings.py @@ -1,8 +1,4 @@ import typer -from llama_index.embeddings import OpenAIEmbedding -from llama_index.schema import BaseComponent, TextNode, Document - -from memgpt.connectors.storage import PostgresStorageConnector def embedding_model(): @@ -39,193 +35,3 @@ def embedding_model(): # loads BAAI/bge-small-en-v1.5 return HuggingFaceEmbedding(model_name=model) - - -class Index: - def __init__(self, name: str, save_directory: Optional[str] = None): - - config = MemGPTConfig.load() - self.save_directory = save_directory - - # setup storage - self.storage_type = config.archival_storage_type - print("storage type", self.storage_type) - if config.archival_storage_type == "local": - self.storage_context = StorageContext.from_defaults(persist_dir=self.save_directory) - else: - if config.archival_storage_type == "postgres": - self.storage = PostgresStorageConnector(name) - # from llama_index.vector_stores import PGVectorStore - # from sqlalchemy import make_url - - # connection_string = config.archival_storage_uri - # url = make_url(connection_string) - - # self.vector_store = PGVectorStore.from_params( - # database=url.database, - # host=url.host, - # 
password=url.password, - # port=url.port, - # user=url.username, - # table_name=name, # table_name = data source name - # embed_dim=config.embedding_dim, # openai embedding dimension - # ) - # self.uri = config.archival_storage_uri - # self.table_name = "data_%s" % name.lower() # TODO: figure out exactly what this is - # print("TABLE NAME", self.table_name) - elif config.archival_storage_type == "chroma": - from llama_index.vector_stores import ChromaVectorStore - import chromadb - - # chroma_client = chromadb.EphemeralClient() - # TODO: connect to storage URI if provided - chroma_client = chromadb.PersistentClient(path="/Users/sarahwooders/repos/MemGPT/chromadb") - chroma_collection = chroma_client.get_or_create_collection(name) - self.vector_store = ChromaVectorStore(chroma_collection=chroma_collection) - else: - raise ValueError(f"Unknown archival storage type {config.archival_storage_type}") - self.storage_context = StorageContext.from_defaults(vector_store=self.vector_store) - print("storage context", self.storage_context) - - # setup embedding model - self.embed_model = embedding_model(config) - - # setup service context - self.service_context = ServiceContext.from_defaults(llm=None, embed_model=self.embed_model, chunk_size=config.embedding_chunk_size) - - # load index (if exists) - # TODO: make sure this doesn't cause an error if the index doesn't exist yet - if self.storage_type == "local": - # load from disk if local - self.index = load_index_from_storage(self.storage_context) - else: - # load from vector store - self.index = VectorStoreIndex.from_vector_store(vector_store=self.vector_store) - # print("BASIC QUERY", self.index.as_query_engine().query("cinderella crying")) - - def load_nodes(self, nodes): - """Loads a list of LlamaIndex nodes into index - - :param nodes: List of nodes to create an index with - :type nodes: List[TextNode] - """ - # for node in nodes: - # node.text = node.text.replace("\x00", "\uFFFD") # hacky fix for error on null characters - self.index.build_index_from_nodes(nodes=nodes) - print(f"Added {len(nodes)} nodes") - self.persist() - - def load_documents(self, documents): - """Load a list of documents into an index - - :param documents: List of documents to create an index with - :type documents: List[Document] - """ - # need to remove problematic characters to avoid errors - for doc in documents: - doc.text = doc.text.replace("\x00", "\uFFFD") # hacky fix for error on null characters - - # create index - self.index = VectorStoreIndex.from_documents( - documents, storage_context=self.storage_context, service_context=self.service_context, show_progress=True - ) - self.persist() - - def persist(self): - - # persist state - if self.storage_type == "local": - # save to disk if local - self.index.storage_context.persist(persist_dir=self.directory) # TODO: - else: - self.index.storage_context.persist() - - def insert(self, text: str, embedding: Optional[List[float]] = None): - """Insert new string into index - - :param text: String to insert into index - :type text: str - """ - if embedding is None: - self.index.insert(text) - else: - self.index.insert(Document(text=text, embedding=embedding)) - - def update(self, documents, embeddings=[]): - """Update an index with new documents - - :param documents: List of documents to update an index with - :type documents: List[Document] - """ - # need to remove problematic characters to avoid errors - for doc in documents: - doc.text = doc.text.replace("\x00", "\uFFFD") - - # TODO: make sure document is persisted in the 
remote DB - - # TODO: allow for existing embeddings - - def get_nodes(self): - """Get the list of nodes from an index (useful for moving data from one index to another) - - :return: Nodes contained in index - :rtype: List[TextNode] - """ - - if self.storage_type == "local": - embed_dict = self.index._vector_store._data.embedding_dict - node_dict = self.index._docstore.docs - - nodes = [] - for node_id, node in node_dict.items(): - vector = embed_dict[node_id] - node.embedding = vector - nodes.append(node) - return nodes - elif self.storage_type == "postgres": - from sqlalchemy import create_engine, MetaData, Table, select - - engine = create_engine(self.uri) - metadata = MetaData() - # data_table = Table(self.table_name, metadata, autoload_with=engine, schema='public')k - print(self.vector_store._table_class) - - # Initialize a list to store the Node objects - nodes = [] - - # Start a connection to the database - with engine.connect() as conn: - # Select all data from the table - select_stmt = select(self.vector_store._table_class) - results = conn.execute(select_stmt).all() - - print(results[0]) - print("DATA", results[1].embedding, results[1].text) - - # Iterate over the rows to create Node objects - for row in results: - # Assuming that 'text' is the document and 'embedding' is the binary representation of the embedding - # If 'embedding' is stored in a different format, you might need to adjust the code to handle it correctly - # import json - # document = json.loads(row[1]) - try: - - node = TextNode( - id_=row.node_id, - text=row.text, - metadata=row.metadata, - embedding=row.embedding, - ) - print(node) - # node = Document(document=row.text, embedding=list(row.embedding)) - except Exception as e: - print(row) - raise e - nodes.append(node) - print("nodes", len(nodes)) - return nodes - - elif self.storage_type == "chroma": - raise NotImplementedError("TODO") - else: - raise ValueError(f"Unknown archival storage type {self.storage_type}") diff --git a/memgpt/memory/archival.py b/memgpt/memory/archival.py deleted file mode 100644 index bf057b7781..0000000000 --- a/memgpt/memory/archival.py +++ /dev/null @@ -1,232 +0,0 @@ -from abc import ABC, abstractmethod -import os -import datetime -import re -import faiss -import numpy as np -from typing import Optional, List, Tuple - -from memgpt.config import AgentConfig, MemGPTConfig -from .constants import MESSAGE_SUMMARY_WARNING_TOKENS, MEMGPT_DIR -from .utils import cosine_similarity, get_local_time, printd, count_tokens -from .prompts.gpt_summarize import SYSTEM as SUMMARY_PROMPT_SYSTEM -from memgpt import utils -from .openai_tools import ( - acompletions_with_backoff as acreate, - async_get_embedding_with_backoff, - get_embedding_with_backoff, - completions_with_backoff as create, -) -from llama_index import ( - VectorStoreIndex, - EmptyIndex, - get_response_synthesizer, - load_index_from_storage, - StorageContext, -) -from llama_index.retrievers import VectorIndexRetriever -from llama_index.query_engine import RetrieverQueryEngine -from llama_index.indices.postprocessor import SimilarityPostprocessor - -# TODO: move to different file -import psycopg2 -from sqlalchemy import make_url - - -class ArchivalMemory(ABC): - - """Wrapper around Llama Index VectorStoreIndex""" - - @abstractmethod - def insert(self, memory_string): - """Insert new archival memory - - :param memory_string: Memory string to insert - :type memory_string: str - """ - pass - - @abstractmethod - def search(self, query_string, count=None, start=None) -> Tuple[List[str], 
int]: - """Search archival memory - - :param query_string: Query string - :type query_string: str - :param count: Number of results to return (None for all) - :type count: Optional[int] - :param start: Offset to start returning results from (None if 0) - :type start: Optional[int] - - :return: Tuple of (list of results, total number of results) - """ - pass - - @abstractmethod - def __repr__(self) -> str: - pass - - -class LocalArchivalMemory(ArchivalMemory): - """Archival memory built on top of Llama Index""" - - def __init__(self, agent_config, top_k: Optional[int] = 100): - """Init function for archival memory - - :param archiva_memory_database: name of dataset to pre-fill archival with - :type archival_memory_database: str - """ - - self.top_k = top_k - self.agent_config = agent_config - - # locate saved index - if self.agent_config.data_source is not None: # connected data source - directory = f"{MEMGPT_DIR}/archival/{self.agent_config.data_source}" - assert os.path.exists(directory), f"Archival memory database {self.agent_config.data_source} does not exist" - elif self.agent_config.name is not None: - directory = agent_config.save_agent_index_dir() - if not os.path.exists(directory): - # no existing archival storage - directory = None - - # load/create index - if directory: - storage_context = StorageContext.from_defaults(persist_dir=directory) - self.index = load_index_from_storage(storage_context) - else: - self.index = EmptyIndex() - - # create retriever - if isinstance(self.index, EmptyIndex): - self.retriever = None # cant create retriever over empty indes - else: - self.retriever = VectorIndexRetriever( - index=self.index, # does this get refreshed? - similarity_top_k=self.top_k, - ) - - # TODO: have some mechanism for cleanup otherwise will lead to OOM - self.cache = {} - - def save(self): - """Save the index to disk""" - if self.agent_config.data_source: # update original archival index - # TODO: this corrupts the originally loaded data. do we want to do this? 
- utils.save_index(self.index, self.agent_config.data_source) - else: - utils.save_agent_index(self.index, self.agent_config) - - async def insert(self, memory_string): - self.index.insert(memory_string) - - # TODO: figure out if this needs to be refreshed (probably not) - self.retriever = VectorIndexRetriever( - index=self.index, - similarity_top_k=self.top_k, - ) - - async def search(self, query_string, count=None, start=None): - if self.retriever is None: - print("Warning: archival memory is empty") - return [], 0 - - start = start if start else 0 - count = count if count else self.top_k - count = min(count + start, self.top_k) - - if query_string not in self.cache: - self.cache[query_string] = self.retriever.retrieve(query_string) - - results = self.cache[query_string][start : start + count] - results = [{"timestamp": get_local_time(), "content": node.node.text} for node in results] - # from pprint import pprint - # pprint(results) - return results, len(results) - - async def a_search(self, query_string, count=None, start=None): - return self.search(query_string, count, start) - - def __repr__(self) -> str: - print(self.index.ref_doc_info) - return "" - - -class PostgresArchivalMemory(ArchivalMemory): - def __init__( - self, - agent_config: AgentConfig, - connection_string: str, - db_name: str, - ): - self.agent_config = agent_config - self.connection_string = connection_string - self.db_name = db_name - self.table_name = "archival_memory" - self.top_k = 100 - - # create table - self.conn = psycopg2.connect(self.connection_string) - self.conn.autocommit = True - - with self.conn.cursor() as c: - c.execute(f"DROP DATABASE IF EXISTS {db_name}") - c.execute(f"CREATE DATABASE {db_name}") - - url = make_url(connection_string) - vector_store = PGVectorStore.from_params( - database=self.db_name, - host=url.host, - password=url.password, - port=url.port, - user=url.username, - table_name=self.table_name, - embed_dim=MemGPTConfig.load().embedding_dim, # openai embedding dimension - ) - - storage_context = StorageContext.from_defaults(vector_store=vector_store) - index = VectorStoreIndex.from_documents(documents, storage_context=storage_context, show_progress=True) - query_engine = index.as_query_engine() - - # create retriever - self.retriever = VectorIndexRetriever( - index=self.index, - similarity_top_k=self.top_k, - ) - - -class ChromaArchivalMemory(ArchivalMemory): - - import chromadb - - def __init__( - self, - agent_config: AgentConfig, - top_k: int = 100, - ): - self.agent_config = agent_config - self.data_source_name = agent_config.data_source - - # connect to client - self.client = chromadb.Client() - # client = chromadb.PersistentClient(path="/path/to/save/to") - self.collection = self.client.get_collection(self.data_source_name) - - # TODO: have some mechanism for cleanup otherwise will lead to OOM - self.cache = {} - - def search(self, query_string, count=None, start=None): - - start = start if start else 0 - count = count if count else self.top_k - count = min(count + start, self.top_k) - - if query_string not in self.cache: - self.cache[query_string] = self.collection.query( - query_texts=[query_string], - ) - - results = self.cache[query_string][start : start + count] - results = [{"timestamp": get_local_time(), "content": node.node.text} for node in results] - # from pprint import pprint - # pprint(results) - return results, len(results) From c5ab5948e2170fd9e2ba6f11355679c0d69b4850 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 2 Nov 2023 18:44:01 -0700 Subject: [PATCH 
31/48] split up storage connectors into multiple files --- memgpt/connectors/local.py | 1 - 1 file changed, 1 deletion(-) diff --git a/memgpt/connectors/local.py b/memgpt/connectors/local.py index a916aac124..9c54dc9fbe 100644 --- a/memgpt/connectors/local.py +++ b/memgpt/connectors/local.py @@ -84,7 +84,6 @@ def get(self, id: str) -> Passage: def insert(self, passage: Passage): nodes = [TextNode(text=passage.text, embedding=passage.embedding)] - print("nodes", nodes) self.nodes += nodes if isinstance(self.index, EmptyIndex): self.index = VectorStoreIndex(self.nodes, service_context=self.service_context, show_progress=True) From b6eeb2f1db6306d2d2e0a3b5e8f7adac7fd7e15b Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 2 Nov 2023 18:55:53 -0700 Subject: [PATCH 32/48] remove unused code --- memgpt/cli/cli_load.py | 2 -- memgpt/utils.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/memgpt/cli/cli_load.py b/memgpt/cli/cli_load.py index 10becc7583..a9c3f49fff 100644 --- a/memgpt/cli/cli_load.py +++ b/memgpt/cli/cli_load.py @@ -102,9 +102,7 @@ def load_directory( reader = SimpleDirectoryReader(input_files=input_files) # load docs - print("loading data") docs = reader.load_data() - print("done loading data") store_docs(name, docs) diff --git a/memgpt/utils.py b/memgpt/utils.py index 4a088a43d9..cf65169619 100644 --- a/memgpt/utils.py +++ b/memgpt/utils.py @@ -350,7 +350,7 @@ def estimate_openai_cost(docs): from llama_index.callbacks import CallbackManager, TokenCountingHandler import tiktoken - embed_model = MockEmbedding(embed_dim=768) + embed_model = MockEmbedding(embed_dim=1536) token_counter = TokenCountingHandler(tokenizer=tiktoken.encoding_for_model("gpt-3.5-turbo").encode) From 70e5a5f244ae870ac3e5c2d3bbafe21cda52930b Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 2 Nov 2023 19:35:05 -0700 Subject: [PATCH 33/48] cleanup --- memgpt/cli/cli.py | 1 - memgpt/connectors/db.py | 2 -- memgpt/connectors/storage.py | 17 ++--------------- memgpt/embeddings.py | 1 + memgpt/main.py | 3 +++ 5 files changed, 6 insertions(+), 18 deletions(-) diff --git a/memgpt/cli/cli.py b/memgpt/cli/cli.py index 87b73f52a4..ac115fdfe6 100644 --- a/memgpt/cli/cli.py +++ b/memgpt/cli/cli.py @@ -30,7 +30,6 @@ configure_azure_support, check_azure_embeddings, ) -from memgpt.embeddings import Index def run( diff --git a/memgpt/connectors/db.py b/memgpt/connectors/db.py index 5cefc5e34a..2df49ce5b6 100644 --- a/memgpt/connectors/db.py +++ b/memgpt/connectors/db.py @@ -64,8 +64,6 @@ def __init__(self, name: Optional[str] = None, agent_config: Optional[AgentConfi else: raise ValueError("Must specify either agent config or name") - printd(f"Using table name {self.table_name}") - # create table self.uri = config.archival_storage_uri if config.archival_storage_uri is None: diff --git a/memgpt/connectors/storage.py b/memgpt/connectors/storage.py index df4e4fc6b2..3eb66c7e21 100644 --- a/memgpt/connectors/storage.py +++ b/memgpt/connectors/storage.py @@ -13,21 +13,8 @@ import numpy as np from tqdm import tqdm -from llama_index import ( - VectorStoreIndex, - EmptyIndex, - get_response_synthesizer, - load_index_from_storage, - StorageContext, - ServiceContext, -) -from llama_index.retrievers import VectorIndexRetriever -from llama_index.query_engine import RetrieverQueryEngine -from llama_index.indices.postprocessor import SimilarityPostprocessor -from llama_index.schema import BaseComponent, TextNode, Document - - -from memgpt.constants import MEMGPT_DIR + +from memgpt.config import AgentConfig,
MemGPTConfig diff --git a/memgpt/embeddings.py b/memgpt/embeddings.py index 4f1a4093fc..20c6040ed9 100644 --- a/memgpt/embeddings.py +++ b/memgpt/embeddings.py @@ -1,4 +1,5 @@ import typer +from llama_index.embeddings import OpenAIEmbedding def embedding_model(): diff --git a/memgpt/main.py b/memgpt/main.py index 6e5a81f0d4..e0a5a7164d 100644 --- a/memgpt/main.py +++ b/memgpt/main.py @@ -482,6 +482,9 @@ async def run_agent_loop(memgpt_agent, first, no_verify=False, cfg=None, strip_u # attach new data attach(memgpt_agent.config.name, data_source) + # update agent config + memgpt_agent.config.attach_data_source(data_source) + # reload agent with new data source # TODO: maybe make this less ugly... memgpt_agent.persistence_manager.archival_memory.storage = StorageConnector.get_storage_connector( agent_config=memgpt_agent.config ) From da2a20892d5b83172fbac677f5f3f9ed69d69273 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 2 Nov 2023 20:14:00 -0700 Subject: [PATCH 34/48] implement vector db loading --- memgpt/connectors/db.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/memgpt/connectors/db.py b/memgpt/connectors/db.py index 2df49ce5b6..5cefc5e34a 100644 --- a/memgpt/connectors/db.py +++ b/memgpt/connectors/db.py @@ -64,6 +64,8 @@ def __init__(self, name: Optional[str] = None, agent_config: Optional[AgentConfi else: raise ValueError("Must specify either agent config or name") + printd(f"Using table name {self.table_name}") + # create table self.uri = config.archival_storage_uri if config.archival_storage_uri is None: From 88b5e18fd6ba39b16af129984c51d7de8aefec5b Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Thu, 2 Nov 2023 20:36:42 -0700 Subject: [PATCH 35/48] cleanup state saving --- memgpt/memory.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/memgpt/memory.py b/memgpt/memory.py index c407bf665d..135a8d49e4 100644 --- a/memgpt/memory.py +++ b/memgpt/memory.py @@ -758,11 +758,6 @@ def __init__(self, agent_config, top_k: Optional[int] = 100): self.embed_model = embedding_model() self.embedding_chunk_size = config.embedding_chunk_size - # create parser - self.parser = SimpleNodeParser.from_defaults( - chunk_size=config.embedding_chunk_size, - ) - # create storage backend self.storage = StorageConnector.get_storage_connector(agent_config=agent_config) # TODO: have some mechanism for cleanup otherwise will lead to OOM From 82c0cbf5b91309180e7f3c8806acf58c5ea746a9 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Fri, 3 Nov 2023 10:41:41 -0700 Subject: [PATCH 36/48] add chroma --- memgpt/cli/cli_config.py | 11 +++- memgpt/connectors/chroma.py | 100 +++++++++++++++++++++++++++++++++++ memgpt/connectors/storage.py | 8 +++ tests/test_storage.py | 29 +++++++++- 4 files changed, 146 insertions(+), 2 deletions(-) create mode 100644 memgpt/connectors/chroma.py diff --git a/memgpt/cli/cli_config.py b/memgpt/cli/cli_config.py index b03d23b740..14e6294458 100644 --- a/memgpt/cli/cli_config.py +++ b/memgpt/cli/cli_config.py @@ -113,13 +113,21 @@ def configure(): # default_agent = None # Configure archival storage backend - archival_storage_options = ["local", "postgres"] + archival_storage_options = ["local", "postgres", "chroma"] archival_storage_type = questionary.select("Select storage backend for archival data:", archival_storage_options, default="local").ask() archival_storage_uri = None + archival_storage_path = None if archival_storage_type == "postgres": archival_storage_uri = questionary.text( "Enter postgres connection string (e.g.
postgresql+pg8000://{user}:{password}@{ip}:5432/{database}):" ).ask() + elif archival_storage_type == "chroma": + chroma_type = questionary.select("Select chroma client type:", ["HTTP", "Persistent Storage"]).ask() + if chroma_type == "HTTP": + archival_storage_uri = questionary.text("Enter chroma server (e.g. localhost:8000)").ask() + # TODO: check correct formatting + else: + archival_storage_path = questionary.text("Enter chroma storage path (e.g. /tmp/chroma)").ask() # TODO: allow configuring embedding model @@ -138,6 +146,7 @@ def configure(): azure_embedding_deployment=azure_embedding_deployment if use_azure_deployment_ids else None, archival_storage_type=archival_storage_type, archival_storage_uri=archival_storage_uri, + archival_storage_path=archival_storage_path, ) print(f"Saving config to {config.config_path}") config.save() diff --git a/memgpt/connectors/chroma.py b/memgpt/connectors/chroma.py new file mode 100644 index 0000000000..d9c5a98e20 --- /dev/null +++ b/memgpt/connectors/chroma.py @@ -0,0 +1,100 @@ +import chromadb +import json +import re +from typing import Optional, List +from memgpt.connectors.storage import StorageConnector, Passage +from memgpt.utils import printd +from memgpt.config import AgentConfig, MemGPTConfig + + +class ChromaStorageConnector(StorageConnector): + """Storage via Chroma""" + + # WARNING: This is not thread safe. Do NOT do concurrent access to the same collection. + + def __init__(self, name: Optional[str] = None, agent_config: Optional[AgentConfig] = None): + config = MemGPTConfig.load() + + # determine table name + if agent_config: + assert name is None, f"Cannot specify both agent config and name {name}" + self.table_name = self.generate_table_name_agent(agent_config) + elif name: + assert agent_config is None, f"Cannot specify both agent config and name {name}" + self.table_name = self.generate_table_name(name) + else: + raise ValueError("Must specify either agent config or name") + + printd(f"Using table name {self.table_name}") + + # create chroma client + if config.archival_storage_path: + self.client = chromadb.PersistentClient(config.archival_storage_path) + else: + # assume uri={ip}:{port} + ip = config.archival_storage_uri.split(":")[0] + port = config.archival_storage_uri.split(":")[1] + self.client = chromadb.HttpClient(host="localhost", port=8000) + + # get a collection or create if it doesn't exist already + self.collection = self.client.get_or_create_collection(self.table_name) + + def get_all(self) -> List[Passage]: + results = self.collection.get(include=["embeddings", "documents"]) + return [Passage(text=text, embedding=embedding) for (text, embedding) in zip(results["documents"], results["embeddings"])] + + def get(self, id: str) -> Optional[Passage]: + results = self.collection.get(ids=[id]) + return [Passage(text=text, embedding=embedding) for (text, embedding) in zip(results["documents"], results["embeddings"])] + + def insert(self, passage: Passage): + self.collection.add(documents=[passage.text], embeddings=[passage.embedding], ids=[str(self.collection.count())]) + + def insert_many(self, passages: List[Passage], show_progress=True): + count = self.collection.count() + ids = [str(count + i) for i in range(len(passages))] + self.collection.add( + documents=[passage.text for passage in passages], embeddings=[passage.embedding for passage in passages], ids=ids + ) + + def query(self, query: str, query_vec: List[float], top_k: int = 10) -> List[Passage]: + results = self.collection.query(query_embeddings=[query_vec], 
n_results=top_k, include=["embeddings", "documents"]) + # get index [0] since query is passed as list + return [Passage(text=text, embedding=embedding) for (text, embedding) in zip(results["documents"][0], results["embeddings"][0])] + + def delete(self): + self.client.delete_collection(name=self.table_name) + + def save(self): + # save to persistence file + printd("Saving chroma") + + @staticmethod + def list_loaded_data(): + config = MemGPTConfig.load() + collections = self.client.list_collections() + collections = [c for c in collections if c.name.startswith("memgpt_") and not c.name.startswith("memgpt_agent_")] + return collections + + def sanitize_table_name(self, name: str) -> str: + # Remove leading and trailing whitespace + name = name.strip() + + # Replace spaces and invalid characters with underscores + name = re.sub(r"\s+|\W+", "_", name) + + # Truncate to the maximum identifier length (e.g., 63 for PostgreSQL) + max_length = 63 + if len(name) > max_length: + name = name[:max_length].rstrip("_") + + # Convert to lowercase + name = name.lower() + + return name + + def generate_table_name_agent(self, agent_config: AgentConfig): + return f"memgpt_agent_{self.sanitize_table_name(agent_config.name)}" + + def generate_table_name(self, name: str): + return f"memgpt_{self.sanitize_table_name(name)}" diff --git a/memgpt/connectors/storage.py b/memgpt/connectors/storage.py index 3eb66c7e21..042f683e59 100644 --- a/memgpt/connectors/storage.py +++ b/memgpt/connectors/storage.py @@ -41,12 +41,17 @@ class StorageConnector: def get_storage_connector(name: Optional[str] = None, agent_config: Optional[AgentConfig] = None): from memgpt.connectors.db import PostgresStorageConnector from memgpt.connectors.local import LocalStorageConnector + from memgpt.connectors.chroma import ChromaStorageConnector + + # TODO: determine the table name here, not inside of storage storage_type = MemGPTConfig.load().archival_storage_type if storage_type == "local": return LocalStorageConnector(name=name, agent_config=agent_config) elif storage_type == "postgres": return PostgresStorageConnector(name=name, agent_config=agent_config) + elif storage_type == "chroma": + return ChromaStorageConnector(name=name, agent_config=agent_config) else: raise NotImplementedError(f"Storage type {storage_type} not implemented") @@ -54,12 +59,15 @@ def get_storage_connector(name: Optional[str] = None, agent_config: Optional[Age def list_loaded_data(): from memgpt.connectors.db import PostgresStorageConnector from memgpt.connectors.local import LocalStorageConnector + from memgpt.connectors.chroma import ChromaStorageConnector storage_type = MemGPTConfig.load().archival_storage_type if storage_type == "local": return LocalStorageConnector.list_loaded_data() elif storage_type == "postgres": return PostgresStorageConnector.list_loaded_data() + elif storage_type == "chroma": + return ChromaStorageConnector.list_loaded_data() else: raise NotImplementedError(f"Storage type {storage_type} not implemented") diff --git a/tests/test_storage.py b/tests/test_storage.py index f15de2c947..e050bc9d5e 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -9,6 +9,7 @@ from memgpt.connectors.storage import StorageConnector, Passage from memgpt.connectors.db import PostgresStorageConnector +from memgpt.connectors.chroma import ChromaStorageConnector from memgpt.embeddings import embedding_model from memgpt.config import MemGPTConfig, AgentConfig @@ -46,4 +47,30 @@ def test_postgres(): print("...finished") -test_postgres() +def 
test_chroma(): + + config = MemGPTConfig.load() + embed_model = embedding_model() + + passage = ["This is a test passage", "This is another test passage", "Cinderella wept"] + + db = ChromaStorageConnector(name="test2") + + for passage in passage: + db.insert(Passage(text=passage, embedding=embed_model.get_text_embedding(passage))) + + query = "why was she crying" + query_vec = embed_model.get_text_embedding(query) + res = db.query(query, query_vec, top_k=2) + + assert len(res) == 2, f"Expected 2 results, got {len(res)}" + assert "wept" in res[0].text, f"Expected 'wept' in results, but got {res[0].text}" + + print(res[0].text) + + print("deleting") + db.delete() + + +# test_postgres() +test_chroma() From 25053f7e06a780135e7beed94b5a37b26d2822d6 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 21 Nov 2023 14:21:42 -0800 Subject: [PATCH 37/48] minor fix --- memgpt/cli/cli_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/memgpt/cli/cli_config.py b/memgpt/cli/cli_config.py index 6c05bf84d7..110646a5bf 100644 --- a/memgpt/cli/cli_config.py +++ b/memgpt/cli/cli_config.py @@ -64,7 +64,7 @@ def configure_llm_endpoint(config: MemGPTConfig): if config.model_endpoint_type in backend_options: # set from previous config default_model_endpoint_type = config.model_endpoint_type - else: + if os.getenv("BACKEND_TYPE") and os.getenv("BACKEND_TYPE") in backend_options: # set form env variable (ok if none) default_model_endpoint_type = os.getenv("BACKEND_TYPE") model_endpoint_type = questionary.select( From 5f9c7ef836e767089be7ff8683cdc4bfbff66f2f Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 21 Nov 2023 15:41:05 -0800 Subject: [PATCH 38/48] fix up chroma integration --- memgpt/cli/cli_config.py | 24 ++++++++++++++--- memgpt/config.py | 1 + memgpt/connectors/chroma.py | 52 +++++++++++++++++++++++++++---------- tests/test_storage.py | 14 ++++++++-- 4 files changed, 72 insertions(+), 19 deletions(-) diff --git a/memgpt/cli/cli_config.py b/memgpt/cli/cli_config.py index d19eb75525..8fe134ffa2 100644 --- a/memgpt/cli/cli_config.py +++ b/memgpt/cli/cli_config.py @@ -210,24 +210,40 @@ def configure_cli(config: MemGPTConfig): def configure_archival_storage(config: MemGPTConfig): # Configure archival storage backend - archival_storage_options = ["local", "lancedb", "postgres"] + archival_storage_options = ["local", "lancedb", "postgres", "chroma"] archival_storage_type = questionary.select( "Select storage backend for archival data:", archival_storage_options, default=config.archival_storage_type ).ask() - archival_storage_uri = None + archival_storage_uri, archival_storage_path = None, None + + # configure postgres if archival_storage_type == "postgres": archival_storage_uri = questionary.text( "Enter postgres connection string (e.g. postgresql+pg8000://{user}:{password}@{ip}:5432/{database}):", default=config.archival_storage_uri if config.archival_storage_uri else "", ).ask() + # configure lancedb if archival_storage_type == "lancedb": archival_storage_uri = questionary.text( "Enter lanncedb connection string (e.g. ./.lancedb", default=config.archival_storage_uri if config.archival_storage_uri else "./.lancedb", ).ask() - return archival_storage_type, archival_storage_uri + # configure chroma + if archival_storage_type == "chroma": + chroma_type = questionary.select("Select chroma backend:", ["http", "persistent"], default="http").ask() + if chroma_type == "http": + archival_storage_uri = questionary.text("Enter chroma ip (e.g. 
localhost:8000):", default="localhost:8000").ask() + if chroma_type == "persistent": + print(config.config_path, config.archival_storage_path) + default_archival_storage_path = ( + config.archival_storage_path if config.archival_storage_path else os.path.join(config.config_path, "chroma") + ) + print(default_archival_storage_path) + archival_storage_path = questionary.text("Enter persistent storage location:", default=default_archival_storage_path).ask() + + return archival_storage_type, archival_storage_uri, archival_storage_path # TODO: allow configuring embedding model @@ -244,7 +260,7 @@ def configure(): model, model_wrapper, context_window = configure_model(config, model_endpoint_type) embedding_endpoint_type, embedding_endpoint, embedding_dim = configure_embedding_endpoint(config) default_preset, default_persona, default_human, default_agent = configure_cli(config) - archival_storage_type, archival_storage_uri = configure_archival_storage(config) + archival_storage_type, archival_storage_uri, archival_storage_path = configure_archival_storage(config) # check credentials azure_key, azure_endpoint, azure_version, azure_deployment, azure_embedding_deployment = get_azure_credentials() diff --git a/memgpt/config.py b/memgpt/config.py index 34073da851..e4408b3ddd 100644 --- a/memgpt/config.py +++ b/memgpt/config.py @@ -208,6 +208,7 @@ def save(self): # archival storage set_field(config, "archival_storage", "type", self.archival_storage_type) + print(self.archival_storage_path) set_field(config, "archival_storage", "path", self.archival_storage_path) set_field(config, "archival_storage", "uri", self.archival_storage_uri) diff --git a/memgpt/connectors/chroma.py b/memgpt/connectors/chroma.py index d9c5a98e20..ef2ab8a37a 100644 --- a/memgpt/connectors/chroma.py +++ b/memgpt/connectors/chroma.py @@ -1,19 +1,31 @@ import chromadb import json import re -from typing import Optional, List +from typing import Optional, List, Iterator from memgpt.connectors.storage import StorageConnector, Passage from memgpt.utils import printd from memgpt.config import AgentConfig, MemGPTConfig +def create_chroma_client(): + config = MemGPTConfig.load() + # create chroma client + if config.archival_storage_path: + client = chromadb.PersistentClient(config.archival_storage_path) + else: + # assume uri={ip}:{port} + ip = config.archival_storage_uri.split(":")[0] + port = config.archival_storage_uri.split(":")[1] + client = chromadb.HttpClient(host=ip, port=port) + return client + + class ChromaStorageConnector(StorageConnector): """Storage via Chroma""" # WARNING: This is not thread safe. Do NOT do concurrent access to the same collection. 
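    # Each connector maps to one Chroma collection, named "memgpt_<name>" for a
    # loaded data source or "memgpt_agent_<name>" for an agent's archival memory
    # (see generate_table_name() and generate_table_name_agent() below).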
def __init__(self, name: Optional[str] = None, agent_config: Optional[AgentConfig] = None): - config = MemGPTConfig.load() # determine table name if agent_config: @@ -27,18 +39,28 @@ def __init__(self, name: Optional[str] = None, agent_config: Optional[AgentConfi printd(f"Using table name {self.table_name}") - # create chroma client - if config.archival_storage_path: - self.client = chromadb.PersistentClient(config.archival_storage_path) - else: - # assume uri={ip}:{port} - ip = config.archival_storage_uri.split(":")[0] - port = config.archival_storage_uri.split(":")[1] - self.client = chromadb.HttpClient(host="localhost", port=8000) + # create client + self.client = create_chroma_client() # get a collection or create if it doesn't exist already self.collection = self.client.get_or_create_collection(self.table_name) + def get_all_paginated(self, page_size: int) -> Iterator[List[Passage]]: + offset = 0 + while True: + # Retrieve a chunk of records with the given page_size + db_passages_chunk = self.collection.get(offset=offset, limit=page_size, include=["embeddings", "documents"]) + + # If the chunk is empty, we've retrieved all records + if not db_passages_chunk: + break + + # Yield a list of Passage objects converted from the chunk + yield [Passage(text=p.text, embedding=p.embedding, doc_id=p.doc_id, passage_id=p.id) for p in db_passages_chunk] + + # Increment the offset to get the next chunk in the next iteration + offset += page_size + def get_all(self) -> List[Passage]: results = self.collection.get(include=["embeddings", "documents"]) return [Passage(text=text, embedding=embedding) for (text, embedding) in zip(results["documents"], results["embeddings"])] @@ -66,13 +88,14 @@ def delete(self): self.client.delete_collection(name=self.table_name) def save(self): - # save to persistence file + # save to persistence file (nothing needs to be done) printd("Saving chroma") + pass @staticmethod def list_loaded_data(): - config = MemGPTConfig.load() - collections = self.client.list_collections() + client = create_chroma_client() + collections = client.list_collections() collections = [c for c in collections if c.name.startswith("memgpt_") and not c.name.startswith("memgpt_agent_")] return collections @@ -98,3 +121,6 @@ def generate_table_name_agent(self, agent_config: AgentConfig): def generate_table_name(self, name: str): return f"memgpt_{self.sanitize_table_name(name)}" + + def size(self) -> int: + return self.collection.count() diff --git a/tests/test_storage.py b/tests/test_storage.py index c878a3d5f1..fc941fa13b 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -11,6 +11,7 @@ import pgvector # Try to import again after installing from memgpt.connectors.storage import StorageConnector, Passage +from memgpt.connectors.chroma import ChromaStorageConnector from memgpt.connectors.db import PostgresStorageConnector, LanceDBConnector from memgpt.embeddings import embedding_model from memgpt.config import MemGPTConfig, AgentConfig @@ -59,12 +60,21 @@ def test_postgres_openai(): # print("...finished") -@pytest.mark.skipif(os.getenv("OPENAI_API_KEY"), reason="Missing OpenAI API key") +@pytest.mark.skipif(not os.getenv("OPENAI_API_KEY"), reason="Missing OpenAI API key") def test_chroma_openai(): if not os.getenv("OPENAI_API_KEY"): return # soft pass - config = MemGPTConfig.load() + config = MemGPTConfig( + archival_storage_type="chroma", + archival_storage_path="./test_chroma", + embedding_endpoint_type="openai", + embedding_dim=1536, + model="gpt4", + model_endpoint_type="openai", + 
model_endpoint="https://api.openai.com/v1", + ) + config.save() embed_model = embedding_model() passage = ["This is a test passage", "This is another test passage", "Cinderella wept"] From 1edb6b6285172afb94b07e86d3953777e427341f Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 21 Nov 2023 15:48:46 -0800 Subject: [PATCH 39/48] fix list error --- memgpt/connectors/chroma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/memgpt/connectors/chroma.py b/memgpt/connectors/chroma.py index ef2ab8a37a..eff0608fb6 100644 --- a/memgpt/connectors/chroma.py +++ b/memgpt/connectors/chroma.py @@ -96,7 +96,7 @@ def save(self): def list_loaded_data(): client = create_chroma_client() collections = client.list_collections() - collections = [c for c in collections if c.name.startswith("memgpt_") and not c.name.startswith("memgpt_agent_")] + collections = [c.name for c in collections if c.name.startswith("memgpt_") and not c.name.startswith("memgpt_agent_")] return collections def sanitize_table_name(self, name: str) -> str: From 53792e3916bb540e5dd7b4c7a637ba8f3249edf9 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 21 Nov 2023 15:51:02 -0800 Subject: [PATCH 40/48] update dependencies --- .github/workflows/tests.yml | 2 +- poetry.lock | 40 ++++++++++++++++++++----------------- pyproject.toml | 2 ++ 3 files changed, 25 insertions(+), 19 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ce02993ecd..d13450a630 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -42,7 +42,7 @@ jobs: PGVECTOR_TEST_DB_URL: ${{ secrets.PGVECTOR_TEST_DB_URL }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | - poetry install -E dev -E postgres -E local -E legacy + poetry install -E dev -E postgres -E local -E legacy -E chroma -E lancedb - name: Set Poetry config env: diff --git a/poetry.lock b/poetry.lock index 9e2e39979d..6cff1af885 100644 --- a/poetry.lock +++ b/poetry.lock @@ -254,7 +254,7 @@ uvloop = ["uvloop (>=0.15.2)"] name = "cachetools" version = "5.3.2" description = "Extensible memoizing collections and decorators" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "cachetools-5.3.2-py3-none-any.whl", hash = "sha256:861f35a13a451f94e301ce2bec7cac63e881232ccce7ed67fab9b5df4d3beaa1"}, @@ -382,6 +382,16 @@ files = [ {file = "charset_normalizer-3.3.2-py3-none-any.whl", hash = "sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc"}, ] +[[package]] +name = "chroma" +version = "0.2.0" +description = "Color handling made simple." 
+optional = false +python-versions = "*" +files = [ + {file = "Chroma-0.2.0.tar.gz", hash = "sha256:e265bcd503e2b35c4448b83257467166c252ecf3ab610492432780691cdfb286"}, +] + [[package]] name = "click" version = "8.1.7" @@ -497,7 +507,7 @@ vision = ["Pillow (>=6.2.1)"] name = "decorator" version = "5.1.1" description = "Decorators for Humans" -optional = false +optional = true python-versions = ">=3.5" files = [ {file = "decorator-5.1.1-py3-none-any.whl", hash = "sha256:b8c3f85900b9dc423225913c5aace94729fe1fa9763b38939a95226f02d37186"}, @@ -535,7 +545,7 @@ dev = ["PyTest", "PyTest-Cov", "bump2version (<1)", "sphinx (<2)", "tox"] name = "deprecation" version = "2.1.0" description = "A library to handle automated deprecations" -optional = false +optional = true python-versions = "*" files = [ {file = "deprecation-2.1.0-py2.py3-none-any.whl", hash = "sha256:a10811591210e1fb0e768a8c25517cabeabcba6f0bf96564f8ff45189f90b14a"}, @@ -950,7 +960,7 @@ files = [ name = "lancedb" version = "0.3.3" description = "lancedb" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "lancedb-0.3.3-py3-none-any.whl", hash = "sha256:67ccea22a6cb39c688041f7469be778a2e64b141db80866f6f0dec25a3122aff"}, @@ -2009,7 +2019,7 @@ files = [ name = "py" version = "1.11.0" description = "library with cross-python path, ini-parsing, io, code, log facilities" -optional = false +optional = true python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" files = [ {file = "py-1.11.0-py2.py3-none-any.whl", hash = "sha256:607c53218732647dff4acdfcd50cb62615cedf612e72d1724fb1a0cc6405b378"}, @@ -2020,7 +2030,7 @@ files = [ name = "pyarrow" version = "14.0.1" description = "Python library for Apache Arrow" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "pyarrow-14.0.1-cp310-cp310-macosx_10_14_x86_64.whl", hash = "sha256:96d64e5ba7dceb519a955e5eeb5c9adcfd63f73a56aea4722e2cc81364fc567a"}, @@ -2219,7 +2229,7 @@ plugins = ["importlib-metadata"] name = "pylance" version = "0.8.10" description = "python wrapper for Lance columnar format" -optional = false +optional = true python-versions = ">=3.8" files = [ {file = "pylance-0.8.10-cp38-abi3-macosx_10_15_x86_64.whl", hash = "sha256:aecf053e12f13a1810a70c786c1e73bcf3ffe7287c0bfe2cc5df77a91f0a084c"}, @@ -2246,31 +2256,26 @@ python-versions = ">=3.8" files = [ {file = "PyMuPDF-1.23.6-cp310-none-macosx_10_9_x86_64.whl", hash = "sha256:c4eb71b88a22c1008f764b3121b36a9d25340f9920b870508356050a365d9ca1"}, {file = "PyMuPDF-1.23.6-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:3ce2d3678dbf822cff213b1902f2e59756313e543efd516a2b4f15bb0353bd6c"}, - {file = "PyMuPDF-1.23.6-cp310-none-manylinux2014_aarch64.whl", hash = "sha256:2e27857a15c8a810d0b66455b8c8a79013640b6267a9b4ea808a5fe1f47711f2"}, {file = "PyMuPDF-1.23.6-cp310-none-manylinux2014_x86_64.whl", hash = "sha256:5cd05700c8f18c9dafef63ac2ed3b1099ca06017ca0c32deea13093cea1b8671"}, {file = "PyMuPDF-1.23.6-cp310-none-win32.whl", hash = "sha256:951d280c1daafac2fd6a664b031f7f98b27eb2def55d39c92a19087bd8041c5d"}, {file = "PyMuPDF-1.23.6-cp310-none-win_amd64.whl", hash = "sha256:19d1711d5908c4527ad2deef5af2d066649f3f9a12950faf30be5f7251d18abc"}, {file = "PyMuPDF-1.23.6-cp311-none-macosx_10_9_x86_64.whl", hash = "sha256:3f0f9b76bc4f039e7587003cbd40684d93a98441549dd033cab38ca07d61988d"}, {file = "PyMuPDF-1.23.6-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:e047571d799b30459ad7ee0bc6e68900a7f6b928876f956c976f279808814e72"}, - {file = "PyMuPDF-1.23.6-cp311-none-manylinux2014_aarch64.whl", 
hash = "sha256:1cbcf05c06f314fdf3042ceee674e9a0ac7fae598347d5442e2138c6046d4e82"}, {file = "PyMuPDF-1.23.6-cp311-none-manylinux2014_x86_64.whl", hash = "sha256:e33f8ec5ba7265fe78b30332840b8f454184addfa79f9c27f160f19789aa5ffd"}, {file = "PyMuPDF-1.23.6-cp311-none-win32.whl", hash = "sha256:2c141f33e2733e48de8524dfd2de56d889feef0c7773b20a8cd216c03ab24793"}, {file = "PyMuPDF-1.23.6-cp311-none-win_amd64.whl", hash = "sha256:8fd9c4ee1dd4744a515b9190d8ba9133348b0d94c362293ed77726aa1c13b0a6"}, {file = "PyMuPDF-1.23.6-cp312-none-macosx_10_9_x86_64.whl", hash = "sha256:4d06751d5cd213e96f84f2faaa71a51cf4d641851e07579247ca1190121f173b"}, {file = "PyMuPDF-1.23.6-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:526b26a5207e923aab65877ad305644402851823a352cb92d362053426899354"}, - {file = "PyMuPDF-1.23.6-cp312-none-manylinux2014_aarch64.whl", hash = "sha256:0f852d125defc26716878b1796f4d68870e9065041d00cf46bde317fd8d30e68"}, {file = "PyMuPDF-1.23.6-cp312-none-manylinux2014_x86_64.whl", hash = "sha256:5bdf7020b90987412381acc42427dd1b7a03d771ee9ec273de003e570164ec1a"}, {file = "PyMuPDF-1.23.6-cp312-none-win32.whl", hash = "sha256:e2d64799c6d9a3735be9e162a5d11061c0b7fbcb1e5fc7446e0993d0f815a93a"}, {file = "PyMuPDF-1.23.6-cp312-none-win_amd64.whl", hash = "sha256:c8ea81964c1433ea163ad4b53c56053a87a9ef6e1bd7a879d4d368a3988b60d1"}, {file = "PyMuPDF-1.23.6-cp38-none-macosx_10_9_x86_64.whl", hash = "sha256:761501a4965264e81acdd8f2224f993020bf24474e9b34fcdb5805a6826eda1c"}, {file = "PyMuPDF-1.23.6-cp38-none-macosx_11_0_arm64.whl", hash = "sha256:fd8388e82b6045807d19addf310d8119d32908e89f76cc8bbf8cf1ec36fce947"}, - {file = "PyMuPDF-1.23.6-cp38-none-manylinux2014_aarch64.whl", hash = "sha256:4ac9673a6d6ee7e80cb242dacb43f9ca097b502d9c5e44687dbdffc2bce7961a"}, {file = "PyMuPDF-1.23.6-cp38-none-manylinux2014_x86_64.whl", hash = "sha256:6e319c1f49476e07b9a12017c2d031687617713f8a46b7adcec03c636ed04607"}, {file = "PyMuPDF-1.23.6-cp38-none-win32.whl", hash = "sha256:1103eea4ab727e32b9cb93347b35f71562033018c333a7f3a17d115e980fea4a"}, {file = "PyMuPDF-1.23.6-cp38-none-win_amd64.whl", hash = "sha256:991a37e1cba43775ce094da87cf0bf72172a5532a09644003276bc8bfdfe9f1a"}, {file = "PyMuPDF-1.23.6-cp39-none-macosx_10_9_x86_64.whl", hash = "sha256:57725e15872f7ab67a9fb3e06e5384d1047b2121e85755c93a6d4266d3ca8983"}, {file = "PyMuPDF-1.23.6-cp39-none-macosx_11_0_arm64.whl", hash = "sha256:224c341fe254adda97c8f06a4c5838cdbcf609fa89e70b1fb179752533378f2f"}, - {file = "PyMuPDF-1.23.6-cp39-none-manylinux2014_aarch64.whl", hash = "sha256:271bdf6059bb8347f9c9c6b721329bd353a933681b1fc62f43241b410e7ab7ae"}, {file = "PyMuPDF-1.23.6-cp39-none-manylinux2014_x86_64.whl", hash = "sha256:57e22bea69690450197b34dcde16bd9fe0265ac4425b4033535ccc5c044246fb"}, {file = "PyMuPDF-1.23.6-cp39-none-win32.whl", hash = "sha256:2885a26220a32fb45ea443443b72194bb7107d6862d8d546b59e4ad0c8a1f2c9"}, {file = "PyMuPDF-1.23.6-cp39-none-win_amd64.whl", hash = "sha256:361cab1be45481bd3dc4e00ec82628ebc189b4f4b6fd9bd78a00cfeed54e0034"}, @@ -2289,7 +2294,6 @@ python-versions = ">=3.8" files = [ {file = "PyMuPDFb-1.23.6-py3-none-macosx_10_9_x86_64.whl", hash = "sha256:e5af77580aad3d1103aeec57009d156bfca429cecda14a17c573fcbe97bafb30"}, {file = "PyMuPDFb-1.23.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:9925816cbe3e05e920f9be925e5752c2eef42b793885b62075bb0f6a69178598"}, - {file = "PyMuPDFb-1.23.6-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:009e2cff166059e13bf71f93919e688f46b8fc11d122433574cfb0cc9134690e"}, {file = 
"PyMuPDFb-1.23.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7132b30e6ad6ff2013344e3a481b2287fe0be3710d80694807dd6e0d8635f085"}, {file = "PyMuPDFb-1.23.6-py3-none-win32.whl", hash = "sha256:9d24ddadc204e895bee5000ddc7507c801643548e59f5a56aad6d32981d17eeb"}, {file = "PyMuPDFb-1.23.6-py3-none-win_amd64.whl", hash = "sha256:7bef75988e6979b10ca804cf9487f817aae43b0fff1c6e315b3b9ee0cf1cc32f"}, @@ -2434,7 +2438,7 @@ prompt_toolkit = ">=2.0,<=3.0.36" name = "ratelimiter" version = "1.2.0.post0" description = "Simple python rate limiting object" -optional = false +optional = true python-versions = "*" files = [ {file = "ratelimiter-1.2.0.post0-py3-none-any.whl", hash = "sha256:a52be07bc0bb0b3674b4b304550f10c769bbb00fead3072e035904474259809f"}, @@ -2566,7 +2570,7 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] name = "retry" version = "0.9.2" description = "Easy to use retry decorator." -optional = false +optional = true python-versions = "*" files = [ {file = "retry-0.9.2-py2.py3-none-any.whl", hash = "sha256:ccddf89761fa2c726ab29391837d4327f819ea14d244c232a1d24c67a2f98606"}, @@ -2732,7 +2736,7 @@ asn1crypto = ">=1.5.1" name = "semver" version = "3.0.2" description = "Python helper for Semantic Versioning (https://semver.org)" -optional = false +optional = true python-versions = ">=3.7" files = [ {file = "semver-3.0.2-py3-none-any.whl", hash = "sha256:b1ea4686fe70b981f85359eda33199d60c53964284e0cfb4977d243e37cf4bf4"}, @@ -3743,7 +3747,7 @@ multidict = ">=4.0" [extras] dev = ["black", "datasets", "pre-commit", "pytest"] -lancedb = [] +lancedb = ["lancedb"] legacy = ["faiss-cpu", "numpy"] local = ["huggingface-hub", "torch", "transformers"] postgres = ["pg8000", "pgvector", "psycopg", "psycopg-binary", "psycopg2-binary"] @@ -3751,4 +3755,4 @@ postgres = ["pg8000", "pgvector", "psycopg", "psycopg-binary", "psycopg2-binary" [metadata] lock-version = "2.0" python-versions = "<3.12,>=3.9" -content-hash = "130c4da6c4b59aeb80aecf9549f75bed28123c275e30f159232e491d726034d5" +content-hash = "822f99f883ba47babfb44f62e368ebba158789a57a82615cfb851b015bbe1bac" diff --git a/pyproject.toml b/pyproject.toml index 803a569db3..d3faf9192a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -48,12 +48,14 @@ torch = {version = ">=2.0.0, !=2.0.1, !=2.1.0", optional = true} websockets = "^12.0" docstring-parser = "^0.15" lancedb = {version = "^0.3.3", optional = true} +chroma = {version = "^0.2.0", optional = true} [tool.poetry.extras] legacy = ["faiss-cpu", "numpy"] local = ["torch", "huggingface-hub", "transformers"] lancedb = ["lancedb"] postgres = ["pgvector", "psycopg", "psycopg-binary", "psycopg2-binary", "pg8000"] +chroma = ["chroma"] dev = ["pytest", "black", "pre-commit", "datasets"] [build-system] From 9ff32a81d5b55ee60fb85b44530db1fa951c0761 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 21 Nov 2023 15:56:23 -0800 Subject: [PATCH 41/48] update docs --- docs/storage.md | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/docs/storage.md b/docs/storage.md index 72bfbac8ff..5d28c28ae5 100644 --- a/docs/storage.md +++ b/docs/storage.md @@ -17,23 +17,34 @@ pip install 'pymemgpt[postgres]' ### Running Postgres You will need to have a URI to a Postgres database which support [pgvector](https://github.com/pgvector/pgvector). You can either use a [hosted provider](https://github.com/pgvector/pgvector/issues/54) or [install pgvector](https://github.com/pgvector/pgvector#installation). 
+## Chroma +To enable the Chroma storage backend, install the dependencies with: +``` +pip install 'pymemgpt[chroma]' +``` +You can configure Chroma to use either the HTTP client or the persistent storage client via `memgpt configure`. You will need to specify either a persistent storage path or a host/port, depending on your client choice. The example below shows how to configure Chroma with local persistent storage: +``` +? Select LLM inference provider: openai +? Override default endpoint: https://api.openai.com/v1 +? Select default model (recommended: gpt-4): gpt-4 +? Select embedding provider: openai +? Select default preset: memgpt_chat +? Select default persona: sam_pov +? Select default human: cs_phd +? Select storage backend for archival data: chroma +? Select chroma backend: persistent +? Enter persistent storage location: /Users/sarahwooders/.memgpt/config/chroma +``` ## LanceDB -In order to use the LanceDB backend. - - You have to enable the LanceDB backend by running - - ``` - memgpt configure - ``` - and selecting `lancedb` for archival storage, and database URI (e.g. `./.lancedb`"), Empty archival uri is also handled and default uri is set at `./.lancedb`. - To enable the LanceDB backend, make sure to install the required dependencies with: ``` pip install 'pymemgpt[lancedb]' ``` -for more checkout [lancedb docs](https://lancedb.github.io/lancedb/) +You have to enable the LanceDB backend by running + ``` + memgpt configure + ``` +and selecting `lancedb` for archival storage and a database URI (e.g. `./.lancedb`). An empty archival URI is also handled, with the default URI set to `./.lancedb`. For more, check out the [lancedb docs](https://lancedb.github.io/lancedb/) -## Chroma -(Coming soon) From 3368d5cd84f7b7f2a2e15e3907f73efa3776dd10 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 21 Nov 2023 15:58:10 -0800 Subject: [PATCH 42/48] format --- memgpt/connectors/chroma.py | 1 - tests/test_load_archival.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/memgpt/connectors/chroma.py b/memgpt/connectors/chroma.py index eff0608fb6..8db7aa2ee2 100644 --- a/memgpt/connectors/chroma.py +++ b/memgpt/connectors/chroma.py @@ -26,7 +26,6 @@ class ChromaStorageConnector(StorageConnector): # WARNING: This is not thread safe. Do NOT do concurrent access to the same collection.
    def __init__(self, name: Optional[str] = None, agent_config: Optional[AgentConfig] = None):
-
        # determine table name
        if agent_config:
            assert name is None, f"Cannot specify both agent config and name {name}"
diff --git a/tests/test_load_archival.py b/tests/test_load_archival.py
index c01ddd0d34..c6bc3a4dbc 100644
--- a/tests/test_load_archival.py
+++ b/tests/test_load_archival.py
@@ -126,7 +126,6 @@ def test_chroma():


def test_postgres():
-
    # override config path with enviornment variable
    # TODO: make into temporary file
    os.environ["MEMGPT_CONFIG_PATH"] = "/Users/sarahwooders/repos/MemGPT/test_config.cfg"
@@ -153,7 +152,6 @@


def test_chroma():
-
    import chromadb

    # override config path with enviornment variable

From 4453fd4b6aed7a81e012758315e4eadbddaf981b Mon Sep 17 00:00:00 2001
From: Sarah Wooders <sarahwooders@gmail.com>
Date: Tue, 21 Nov 2023 16:03:12 -0800
Subject: [PATCH 43/48] cleanup

---
 memgpt/config.py            |  1 -
 tests/test_load_archival.py | 13 -------------
 2 files changed, 14 deletions(-)

diff --git a/memgpt/config.py b/memgpt/config.py
index e4408b3ddd..34073da851 100644
--- a/memgpt/config.py
+++ b/memgpt/config.py
@@ -208,7 +208,6 @@ def save(self):
        # archival storage
        set_field(config, "archival_storage", "type", self.archival_storage_type)
-        print(self.archival_storage_path)
        set_field(config, "archival_storage", "path", self.archival_storage_path)
        set_field(config, "archival_storage", "uri", self.archival_storage_uri)

diff --git a/tests/test_load_archival.py b/tests/test_load_archival.py
index c6bc3a4dbc..d639bb5599 100644
--- a/tests/test_load_archival.py
+++ b/tests/test_load_archival.py
@@ -111,19 +111,6 @@ def test_chroma():
        recursive=True,
    )

-    # index = memgpt.embeddings.Index(name)
-
-    ## query chroma
-    ##chroma_client = chromadb.Client()
-    # chroma_client = chromadb.PersistentClient(path="/Users/sarahwooders/repos/MemGPT/chromadb")
-    # collection = chroma_client.get_collection(name=name)
-    # results = collection.query(
-    #    query_texts=["cinderella be getting sick"],
-    #    n_results=2
-    # )
-    # print(results)
-    # assert len(results) == 2, f"Expected 2 results, but got {len(results)}"
-

def test_postgres():
    # override config path with enviornment variable

From 36d41bf904dc804118ccc544eec4cc91472848d6 Mon Sep 17 00:00:00 2001
From: Sarah Wooders <sarahwooders@gmail.com>
Date: Fri, 1 Dec 2023 16:29:51 -0800
Subject: [PATCH 44/48] forgot to add embedding file

---
 docs/embedding_endpoints.md | 68 +++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 docs/embedding_endpoints.md

diff --git a/docs/embedding_endpoints.md b/docs/embedding_endpoints.md
new file mode 100644
index 0000000000..7727087e71
--- /dev/null
+++ b/docs/embedding_endpoints.md
@@ -0,0 +1,68 @@
+MemGPT uses embedding models for retrieval over archival memory. You can use embeddings provided by OpenAI, Azure, or any model on Hugging Face.
+
+## OpenAI
+To use OpenAI, make sure your `OPENAI_API_KEY` environment variable is set.
+```sh
+export OPENAI_API_KEY=YOUR_API_KEY # on Linux/Mac
+```
+Then, configure MemGPT and select `openai` as the embedding provider:
+```
+> memgpt configure
+...
+? Select embedding provider: openai
+...
+```
+
+## Azure
+To use Azure, set environment variables for Azure and an additional variable specifying your embedding deployment:
+```sh
+# see https://github.com/openai/openai-python#microsoft-azure-endpoints
+export AZURE_OPENAI_KEY = ...
+export AZURE_OPENAI_ENDPOINT = ...
+export AZURE_OPENAI_VERSION = ...
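+# for example, with purely illustrative placeholder values:
+#   export AZURE_OPENAI_KEY=<your-azure-openai-key>
+#   export AZURE_OPENAI_ENDPOINT=https://<your-resource>.openai.azure.com
+#   export AZURE_OPENAI_VERSION=2023-05-15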
+
+# set the below if you are using deployment ids
+export AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT = ...
+```
+Then, configure MemGPT and select `azure` as the embedding provider:
+```
+> memgpt configure
+...
+? Select embedding provider: azure
+...
+```
+
+## Custom Endpoint
+MemGPT supports running embeddings with any Hugging Face model using the [Text Embeddings Inference](https://github.com/huggingface/text-embeddings-inference) (TEI) library. First, follow TEI's [instructions](https://github.com/huggingface/text-embeddings-inference#get-started) to stand up an embedding endpoint. Once the endpoint is running, you can configure MemGPT to use it:
+```
+> memgpt configure
+...
+? Select embedding provider: hugging-face
+? Enter default endpoint: http://localhost:8080
+? Enter HuggingFace model tag (e.g. BAAI/bge-large-en-v1.5): BAAI/bge-large-en-v1.5
+? Enter embedding model dimensions (e.g. 1024): 1024
+...
+```
+
+## Local Embeddings
+
+MemGPT can compute embeddings locally using the lightweight embedding model [`BAAI/bge-small-en-v1.5`](https://huggingface.co/BAAI/bge-small-en-v1.5).
+!!! warning "Local embedding performance"
+
+    The `BAAI/bge-small-en-v1.5` model was chosen for being lightweight, so you may notice degraded performance with embedding-based retrieval when using this option.
+
+To compute embeddings locally, install the dependencies with:
+```
+pip install 'pymemgpt[local]'
+```
+Then, select the `local` option during configuration:
+```
+> memgpt configure
+...
+? Select embedding provider: local
+...
+```

From c982b555a1ea41a50e1602e45e02ced624199ae7 Mon Sep 17 00:00:00 2001
From: Sarah Wooders <sarahwooders@gmail.com>
Date: Fri, 1 Dec 2023 17:41:49 -0800
Subject: [PATCH 45/48] upgrade llama index

---
 poetry.lock    | 186 ++++++++++++++++++-------------------------------
 pyproject.toml |   2 +-
 2 files changed, 67 insertions(+), 121 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 4aea385f21..c6ff3f56b4 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.7.0 and should not be changed by hand.
[[package]] name = "aiohttp" @@ -196,6 +196,24 @@ docs = ["furo", "myst-parser", "sphinx", "sphinx-notfound-page", "sphinxcontrib- tests = ["attrs[tests-no-zope]", "zope-interface"] tests-no-zope = ["cloudpickle", "hypothesis", "mypy (>=1.1.1)", "pympler", "pytest (>=4.3.0)", "pytest-mypy-plugins", "pytest-xdist[psutil]"] +[[package]] +name = "beautifulsoup4" +version = "4.12.2" +description = "Screen-scraping library" +optional = false +python-versions = ">=3.6.0" +files = [ + {file = "beautifulsoup4-4.12.2-py3-none-any.whl", hash = "sha256:bd2520ca0d9d7d12694a53d44ac482d181b4ec1888909b035a3dbf40d0f57d4a"}, + {file = "beautifulsoup4-4.12.2.tar.gz", hash = "sha256:492bbc69dca35d12daac71c4db1bfff0c876c00ef4a2ffacce226d4638eb72da"}, +] + +[package.dependencies] +soupsieve = ">1.2" + +[package.extras] +html5lib = ["html5lib"] +lxml = ["lxml"] + [[package]] name = "black" version = "23.11.0" @@ -530,6 +548,17 @@ files = [ {file = "distlib-0.3.7.tar.gz", hash = "sha256:9dafe54b34a028eafd95039d5e5d4851a13734540f1331060d31c9916e7147a8"}, ] +[[package]] +name = "distro" +version = "1.8.0" +description = "Distro - an OS platform information API" +optional = false +python-versions = ">=3.6" +files = [ + {file = "distro-1.8.0-py3-none-any.whl", hash = "sha256:99522ca3e365cac527b44bde033f64c6945d90eb9f769703caaec52b09bbd3ff"}, + {file = "distro-1.8.0.tar.gz", hash = "sha256:02e111d1dc6a50abb8eed6bf31c3e48ed8b0830d1ea2a1b78c61765c2513fdd8"}, +] + [[package]] name = "docstring-parser" version = "0.15" @@ -903,31 +932,6 @@ files = [ {file = "joblib-1.3.2.tar.gz", hash = "sha256:92f865e621e17784e7955080b6d042489e3b8e294949cc44c6eac304f59772b1"}, ] -[[package]] -name = "jsonpatch" -version = "1.33" -description = "Apply JSON-Patches (RFC 6902)" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" -files = [ - {file = "jsonpatch-1.33-py2.py3-none-any.whl", hash = "sha256:0ae28c0cd062bbd8b8ecc26d7d164fbbea9652a1a3693f3b956c1eae5145dade"}, - {file = "jsonpatch-1.33.tar.gz", hash = "sha256:9fcd4009c41e6d12348b4a0ff2563ba56a2923a7dfee731d004e212e1ee5030c"}, -] - -[package.dependencies] -jsonpointer = ">=1.9" - -[[package]] -name = "jsonpointer" -version = "2.4" -description = "Identify specific nodes in a JSON document (RFC 6901)" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*, !=3.5.*, !=3.6.*" -files = [ - {file = "jsonpointer-2.4-py2.py3-none-any.whl", hash = "sha256:15d51bba20eea3165644553647711d150376234112651b4f1811022aecad7d7a"}, - {file = "jsonpointer-2.4.tar.gz", hash = "sha256:585cee82b70211fa9e6043b7bb89db6e1aa49524340dde8ad6b63206ea689d88"}, -] - [[package]] name = "lancedb" version = "0.3.4" @@ -962,112 +966,42 @@ docs = ["mkdocs", "mkdocs-jupyter", "mkdocs-material", "mkdocstrings[python]"] embeddings = ["InstructorEmbedding", "cohere", "open-clip-torch", "openai", "pillow", "sentence-transformers", "torch"] tests = ["pandas (>=1.4)", "pytest", "pytest-asyncio", "pytest-mock", "requests"] -[[package]] -name = "langchain" -version = "0.0.343" -description = "Building applications with LLMs through composability" -optional = false -python-versions = ">=3.8.1,<4.0" -files = [ - {file = "langchain-0.0.343-py3-none-any.whl", hash = "sha256:1959336b6076066bf233dd99dce44be2e9adccb53d799bff92c653098178b347"}, - {file = "langchain-0.0.343.tar.gz", hash = "sha256:166924d771a463009277f688f6dfc829a3af2d9cd5b41a64a7a6bd7860280e81"}, -] - -[package.dependencies] -aiohttp = ">=3.8.3,<4.0.0" -anyio = "<4.0" 
-async-timeout = {version = ">=4.0.0,<5.0.0", markers = "python_version < \"3.11\""} -dataclasses-json = ">=0.5.7,<0.7" -jsonpatch = ">=1.33,<2.0" -langchain-core = ">=0.0.7,<0.1" -langsmith = ">=0.0.63,<0.1.0" -numpy = ">=1,<2" -pydantic = ">=1,<3" -PyYAML = ">=5.3" -requests = ">=2,<3" -SQLAlchemy = ">=1.4,<3" -tenacity = ">=8.1.0,<9.0.0" - -[package.extras] -all = ["O365 (>=2.0.26,<3.0.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "amadeus (>=8.1.0)", "arxiv (>=1.4,<2.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "awadb (>=0.3.9,<0.4.0)", "azure-ai-formrecognizer (>=3.2.1,<4.0.0)", "azure-ai-textanalytics (>=5.3.0,<6.0.0)", "azure-ai-vision (>=0.11.1b1,<0.12.0)", "azure-cognitiveservices-speech (>=1.28.0,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "beautifulsoup4 (>=4,<5)", "clarifai (>=9.1.0)", "clickhouse-connect (>=0.5.14,<0.6.0)", "cohere (>=4,<5)", "deeplake (>=3.8.3,<4.0.0)", "dgml-utils (>=0.3.0,<0.4.0)", "docarray[hnswlib] (>=0.32.0,<0.33.0)", "duckduckgo-search (>=3.8.3,<4.0.0)", "elasticsearch (>=8,<9)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "google-api-python-client (==2.70.0)", "google-auth (>=2.18.1,<3.0.0)", "google-search-results (>=2,<3)", "gptcache (>=0.1.7)", "html2text (>=2020.1.16,<2021.0.0)", "huggingface_hub (>=0,<1)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "lancedb (>=0.1,<0.2)", "langkit (>=0.0.6,<0.1.0)", "lark (>=1.1.5,<2.0.0)", "librosa (>=0.10.0.post2,<0.11.0)", "lxml (>=4.9.2,<5.0.0)", "manifest-ml (>=0.0.1,<0.0.2)", "marqo (>=1.2.4,<2.0.0)", "momento (>=1.13.0,<2.0.0)", "nebula3-python (>=3.4.0,<4.0.0)", "neo4j (>=5.8.1,<6.0.0)", "networkx (>=2.6.3,<4)", "nlpcloud (>=1,<2)", "nltk (>=3,<4)", "nomic (>=1.0.43,<2.0.0)", "openai (<2)", "openlm (>=0.0.5,<0.0.6)", "opensearch-py (>=2.0.0,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pexpect (>=4.8.0,<5.0.0)", "pgvector (>=0.1.6,<0.2.0)", "pinecone-client (>=2,<3)", "pinecone-text (>=0.4.2,<0.5.0)", "psycopg2-binary (>=2.9.5,<3.0.0)", "pymongo (>=4.3.3,<5.0.0)", "pyowm (>=3.3.0,<4.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pytesseract (>=0.3.10,<0.4.0)", "python-arango (>=7.5.9,<8.0.0)", "pyvespa (>=0.33.0,<0.34.0)", "qdrant-client (>=1.3.1,<2.0.0)", "rdflib (>=6.3.2,<7.0.0)", "redis (>=4,<5)", "requests-toolbelt (>=1.0.0,<2.0.0)", "sentence-transformers (>=2,<3)", "singlestoredb (>=0.7.1,<0.8.0)", "tensorflow-text (>=2.11.0,<3.0.0)", "tigrisdb (>=1.0.0b6,<2.0.0)", "tiktoken (>=0.3.2,<0.6.0)", "torch (>=1,<3)", "transformers (>=4,<5)", "weaviate-client (>=3,<4)", "wikipedia (>=1,<2)", "wolframalpha (==5.0.0)"] -azure = ["azure-ai-formrecognizer (>=3.2.1,<4.0.0)", "azure-ai-textanalytics (>=5.3.0,<6.0.0)", "azure-ai-vision (>=0.11.1b1,<0.12.0)", "azure-cognitiveservices-speech (>=1.28.0,<2.0.0)", "azure-core (>=1.26.4,<2.0.0)", "azure-cosmos (>=4.4.0b1,<5.0.0)", "azure-identity (>=1.12.0,<2.0.0)", "azure-search-documents (==11.4.0b8)", "openai (<2)"] -clarifai = ["clarifai (>=9.1.0)"] -cli = ["typer (>=0.9.0,<0.10.0)"] -cohere = ["cohere (>=4,<5)"] -docarray = ["docarray[hnswlib] (>=0.32.0,<0.33.0)"] -embeddings = ["sentence-transformers (>=2,<3)"] -extended-testing = ["aiosqlite (>=0.19.0,<0.20.0)", "aleph-alpha-client (>=2.15.0,<3.0.0)", "anthropic (>=0.3.11,<0.4.0)", "arxiv (>=1.4,<2.0)", "assemblyai (>=0.17.0,<0.18.0)", "atlassian-python-api (>=3.36.0,<4.0.0)", "beautifulsoup4 (>=4,<5)", "bibtexparser (>=1.4.0,<2.0.0)", "cassio (>=0.1.0,<0.2.0)", "chardet (>=5.1.0,<6.0.0)", "dashvector (>=1.0.1,<2.0.0)", "databricks-vectorsearch (>=0.21,<0.22)", "dgml-utils 
(>=0.3.0,<0.4.0)", "esprima (>=4.0.1,<5.0.0)", "faiss-cpu (>=1,<2)", "feedparser (>=6.0.10,<7.0.0)", "fireworks-ai (>=0.6.0,<0.7.0)", "geopandas (>=0.13.1,<0.14.0)", "gitpython (>=3.1.32,<4.0.0)", "google-cloud-documentai (>=2.20.1,<3.0.0)", "gql (>=3.4.1,<4.0.0)", "html2text (>=2020.1.16,<2021.0.0)", "javelin-sdk (>=0.1.8,<0.2.0)", "jinja2 (>=3,<4)", "jq (>=1.4.1,<2.0.0)", "jsonschema (>1)", "lxml (>=4.9.2,<5.0.0)", "markdownify (>=0.11.6,<0.12.0)", "motor (>=3.3.1,<4.0.0)", "msal (>=1.25.0,<2.0.0)", "mwparserfromhell (>=0.6.4,<0.7.0)", "mwxml (>=0.3.3,<0.4.0)", "newspaper3k (>=0.2.8,<0.3.0)", "numexpr (>=2.8.6,<3.0.0)", "openai (<2)", "openapi-pydantic (>=0.3.2,<0.4.0)", "pandas (>=2.0.1,<3.0.0)", "pdfminer-six (>=20221105,<20221106)", "pgvector (>=0.1.6,<0.2.0)", "psychicapi (>=0.8.0,<0.9.0)", "py-trello (>=0.19.0,<0.20.0)", "pymupdf (>=1.22.3,<2.0.0)", "pypdf (>=3.4.0,<4.0.0)", "pypdfium2 (>=4.10.0,<5.0.0)", "pyspark (>=3.4.0,<4.0.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "rapidfuzz (>=3.1.1,<4.0.0)", "rapidocr-onnxruntime (>=1.3.2,<2.0.0)", "requests-toolbelt (>=1.0.0,<2.0.0)", "rspace_client (>=2.5.0,<3.0.0)", "scikit-learn (>=1.2.2,<2.0.0)", "sqlite-vss (>=0.1.2,<0.2.0)", "streamlit (>=1.18.0,<2.0.0)", "sympy (>=1.12,<2.0)", "telethon (>=1.28.5,<2.0.0)", "timescale-vector (>=0.0.1,<0.0.2)", "tqdm (>=4.48.0)", "upstash-redis (>=0.15.0,<0.16.0)", "xata (>=1.0.0a7,<2.0.0)", "xmltodict (>=0.13.0,<0.14.0)"] -javascript = ["esprima (>=4.0.1,<5.0.0)"] -llms = ["clarifai (>=9.1.0)", "cohere (>=4,<5)", "huggingface_hub (>=0,<1)", "manifest-ml (>=0.0.1,<0.0.2)", "nlpcloud (>=1,<2)", "openai (<2)", "openlm (>=0.0.5,<0.0.6)", "torch (>=1,<3)", "transformers (>=4,<5)"] -openai = ["openai (<2)", "tiktoken (>=0.3.2,<0.6.0)"] -qdrant = ["qdrant-client (>=1.3.1,<2.0.0)"] -text-helpers = ["chardet (>=5.1.0,<6.0.0)"] - -[[package]] -name = "langchain-core" -version = "0.0.7" -description = "Building applications with LLMs through composability" -optional = false -python-versions = ">=3.8.1,<4.0" -files = [ - {file = "langchain_core-0.0.7-py3-none-any.whl", hash = "sha256:368ae70a1da56971642df0a9ede5f480d762224238ba84d0f9b2cd7c776150de"}, - {file = "langchain_core-0.0.7.tar.gz", hash = "sha256:2310df8b783194ec2dfe01c2864bd8b3ccb4adecb02b17cf1d63cc773c252b4a"}, -] - -[package.dependencies] -jsonpatch = ">=1.33,<2.0" -langsmith = ">=0.0.63,<0.1.0" -pydantic = ">=1,<3" -tenacity = ">=8.1.0,<9.0.0" - -[[package]] -name = "langsmith" -version = "0.0.67" -description = "Client library to connect to the LangSmith LLM Tracing and Evaluation Platform." 
-optional = false -python-versions = ">=3.8.1,<4.0" -files = [ - {file = "langsmith-0.0.67-py3-none-any.whl", hash = "sha256:66a257b97dabd43a7e62af271b2ddb7566167ce4e446fd7b7760e97d6ce84a5e"}, - {file = "langsmith-0.0.67.tar.gz", hash = "sha256:cef00bac2e7455a5943f3afaea91c032db1a1f2adb83003159a71e884fb5a9a2"}, -] - -[package.dependencies] -pydantic = ">=1,<3" -requests = ">=2,<3" - [[package]] name = "llama-index" -version = "0.8.62" +version = "0.9.10" description = "Interface between LLMs and your data" optional = false python-versions = ">=3.8.1,<3.12" files = [ - {file = "llama_index-0.8.62-py3-none-any.whl", hash = "sha256:5ea95e1a1ec0f759e29093c92cdfd3f1c780d3c638a306b86aa22993ab15ce80"}, - {file = "llama_index-0.8.62.tar.gz", hash = "sha256:c0db90f49ca8a11777b14e2a72921bef1edbf21ac5564651911999a9913f14ae"}, + {file = "llama_index-0.9.10-py3-none-any.whl", hash = "sha256:475678eea433b2e209a4faee768c67f1e7a58ba3ffd441a82c3387585e79b24e"}, + {file = "llama_index-0.9.10.tar.gz", hash = "sha256:d42f035caa206f3110c5c8e908f3c6e2dd3a1bd59c8ba5afe5466d338d230109"}, ] [package.dependencies] +aiohttp = ">=3.8.6,<4.0.0" aiostream = ">=0.5.2,<0.6.0" -dataclasses-json = ">=0.5.7,<0.6.0" +beautifulsoup4 = ">=4.12.2,<5.0.0" +dataclasses-json = "*" deprecated = ">=1.2.9.3" fsspec = ">=2023.5.0" -langchain = ">=0.0.303" +httpx = "*" nest-asyncio = ">=1.5.8,<2.0.0" nltk = ">=3.8.1,<4.0.0" numpy = "*" -openai = "<1" +openai = ">=1.1.0" pandas = "*" +requests = ">=2.31.0" SQLAlchemy = {version = ">=1.4.49", extras = ["asyncio"]} tenacity = ">=8.2.0,<9.0.0" tiktoken = ">=0.3.3" typing-extensions = ">=4.5.0" typing-inspect = ">=0.8.0" -urllib3 = "<2" [package.extras] +langchain = ["langchain (>=0.0.303)"] local-models = ["optimum[onnxruntime] (>=1.13.2,<2.0.0)", "sentencepiece (>=0.1.99,<0.2.0)", "transformers[torch] (>=4.34.0,<5.0.0)"] postgres = ["asyncpg (>=0.28.0,<0.29.0)", "pgvector (>=0.1.0,<0.2.0)", "psycopg-binary (>=3.1.12,<4.0.0)"] -query-tools = ["guidance (>=0.0.64,<0.0.65)", "jsonpath-ng (>=1.6.0,<2.0.0)", "lm-format-enforcer (>=0.4.3,<0.5.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "scikit-learn (<1.3.0)", "spacy (>=3.7.1,<4.0.0)"] +query-tools = ["guidance (>=0.0.64,<0.0.65)", "jsonpath-ng (>=1.6.0,<2.0.0)", "lm-format-enforcer (>=0.4.3,<0.5.0)", "rank-bm25 (>=0.2.2,<0.3.0)", "scikit-learn", "spacy (>=3.7.1,<4.0.0)"] [[package]] name = "markdown-it-py" @@ -1588,25 +1522,26 @@ files = [ [[package]] name = "openai" -version = "0.28.1" -description = "Python client library for the OpenAI API" +version = "1.3.7" +description = "The official Python library for the openai API" optional = false python-versions = ">=3.7.1" files = [ - {file = "openai-0.28.1-py3-none-any.whl", hash = "sha256:d18690f9e3d31eedb66b57b88c2165d760b24ea0a01f150dd3f068155088ce68"}, - {file = "openai-0.28.1.tar.gz", hash = "sha256:4be1dad329a65b4ce1a660fe6d5431b438f429b5855c883435f0f7fcb6d2dcc8"}, + {file = "openai-1.3.7-py3-none-any.whl", hash = "sha256:e5c51367a910297e4d1cd33d2298fb87d7edf681edbe012873925ac16f95bee0"}, + {file = "openai-1.3.7.tar.gz", hash = "sha256:18074a0f51f9b49d1ae268c7abc36f7f33212a0c0d08ce11b7053ab2d17798de"}, ] [package.dependencies] -aiohttp = "*" -requests = ">=2.20" -tqdm = "*" +anyio = ">=3.5.0,<4" +distro = ">=1.7.0,<2" +httpx = ">=0.23.0,<1" +pydantic = ">=1.9.0,<3" +sniffio = "*" +tqdm = ">4" +typing-extensions = ">=4.5,<5" [package.extras] -datalib = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] -dev = ["black (>=21.6b0,<22.0)", "pytest (==6.*)", "pytest-asyncio", 
"pytest-mock"] -embeddings = ["matplotlib", "numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "plotly", "scikit-learn (>=1.0.2)", "scipy", "tenacity (>=8.0.1)"] -wandb = ["numpy", "openpyxl (>=3.0.7)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)", "wandb"] +datalib = ["numpy (>=1)", "pandas (>=1.2.3)", "pandas-stubs (>=1.1.0.11)"] [[package]] name = "overrides" @@ -2341,7 +2276,7 @@ files = [ name = "pyyaml" version = "6.0.1" description = "YAML parser and emitter for Python" -optional = false +optional = true python-versions = ">=3.6" files = [ {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, @@ -2768,6 +2703,17 @@ files = [ {file = "sniffio-1.3.0.tar.gz", hash = "sha256:e60305c5e5d314f5389259b7f22aaa33d8f7dee49763119234af3755c55b9101"}, ] +[[package]] +name = "soupsieve" +version = "2.5" +description = "A modern CSS selector implementation for Beautiful Soup." +optional = false +python-versions = ">=3.8" +files = [ + {file = "soupsieve-2.5-py3-none-any.whl", hash = "sha256:eaa337ff55a1579b6549dc679565eac1e3d000563bcb1c8ab0d0fefbc0c2cdc7"}, + {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, +] + [[package]] name = "sqlalchemy" version = "2.0.23" @@ -3718,4 +3664,4 @@ postgres = ["pg8000", "pgvector", "psycopg", "psycopg-binary", "psycopg2-binary" [metadata] lock-version = "2.0" python-versions = "<3.12,>=3.9" -content-hash = "d4a3af7c9778a2ce0e66bd2f73b8d5c69fc1de473551b0ed52594678baa44d12" +content-hash = "72d6f7fb6e619b61c886bada56262d08940b1bc110fd2b1397efc2c2d5a82a68" diff --git a/pyproject.toml b/pyproject.toml index 39139fe1a3..8bd68ee24f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,7 +26,7 @@ pytz = "^2023.3.post1" tqdm = "^4.66.1" black = { version = "^23.10.1", optional = true } pytest = { version = "^7.4.3", optional = true } -llama-index = "^0.8.53.post3" +llama-index = "0.9.10" setuptools = "^68.2.2" datasets = { version = "^2.14.6", optional = true} prettytable = "^3.9.0" From 362697b507f2b02c6a2e1c40448be9cb41bb4c85 Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Fri, 1 Dec 2023 19:14:57 -0800 Subject: [PATCH 46/48] fix data source naming bug --- memgpt/connectors/db.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/memgpt/connectors/db.py b/memgpt/connectors/db.py index 65f9aacb5b..ac09e4ddcd 100644 --- a/memgpt/connectors/db.py +++ b/memgpt/connectors/db.py @@ -157,7 +157,8 @@ def list_loaded_data(): inspector = inspect(engine) tables = inspector.get_table_names() tables = [table for table in tables if table.startswith("memgpt_") and not table.startswith("memgpt_agent_")] - tables = [table.replace("memgpt_", "") for table in tables] + start_chars = len("memgpt_") + tables = [table[start_chars:] for table in tables] return tables def sanitize_table_name(self, name: str) -> str: @@ -300,7 +301,8 @@ def list_loaded_data(): tables = db.table_names() tables = [table for table in tables if table.startswith("memgpt_")] - tables = [table.replace("memgpt_", "") for table in tables] + start_chars = len("memgpt_") + tables = [table[start_chars:] for table in tables] return tables def sanitize_table_name(self, name: str) -> str: From d4dbc3312a2a9804e5933096442e3faddef98eee Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 5 Dec 2023 17:01:25 -0800 Subject: [PATCH 47/48] remove legacy --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index d13450a630..14a8a1adb7 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -42,7 +42,7 @@ jobs: PGVECTOR_TEST_DB_URL: ${{ secrets.PGVECTOR_TEST_DB_URL }} OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} run: | - poetry install -E dev -E postgres -E local -E legacy -E chroma -E lancedb + poetry install -E dev -E postgres -E local -E chroma -E lancedb - name: Set Poetry config env: From 3982e3b073f530f1ecd72c0e93fe5b2f4bcafe5d Mon Sep 17 00:00:00 2001 From: Sarah Wooders Date: Tue, 5 Dec 2023 17:27:22 -0800 Subject: [PATCH 48/48] os import --- tests/test_load_archival.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/tests/test_load_archival.py b/tests/test_load_archival.py index d639bb5599..f8265ec918 100644 --- a/tests/test_load_archival.py +++ b/tests/test_load_archival.py @@ -1,6 +1,6 @@ # import tempfile # import asyncio -# import os +import os # import asyncio # from datasets import load_dataset @@ -289,7 +289,3 @@ def test_load_database(): ) print("Successfully loaded into index") assert True - - -test_postgres() -# test_chroma()
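
With the stray module-level invocations removed, these tests now run only through the test runner. A typical local invocation (illustrative, mirroring the extras installed in CI above):
```sh
poetry install -E dev -E postgres -E local -E chroma -E lancedb
poetry run pytest tests/test_load_archival.py
```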