vectorize_pdf.py
import os
import json

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from dotenv import load_dotenv

# Load environment variables from the .env file
load_dotenv()

# Get the Hugging Face API key from the environment
inference_api_key = os.getenv("HUGGINGFACE_API_KEY")
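
# A minimal .env file next to this script is assumed to look like this
# (the token value below is a placeholder, not a real key):
#
#   HUGGINGFACE_API_KEY=hf_xxxxxxxxxxxxxxxxxxxx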

# Define paths for the data directory, the processed-files ledger, and the vector store
DATA_DIR = "data/"
PROCESSED_FILES_PATH = "processed_files.json"
VECTOR_STORE_DIR = "vectorstores/db/"


def load_processed_files():
    """Load the set of already processed files from the JSON file."""
    if os.path.exists(PROCESSED_FILES_PATH):
        with open(PROCESSED_FILES_PATH, "r") as f:
            return set(json.load(f))  # Use a set to avoid duplicates
    return set()


def save_processed_files(processed_files):
    """Save the set of processed files to the JSON file."""
    with open(PROCESSED_FILES_PATH, "w") as f:
        json.dump(list(processed_files), f)  # Convert the set to a list for JSON serialization
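
# For illustration, processed_files.json is assumed to end up as a flat JSON
# array of source paths, e.g.:
#
#   ["data/report.pdf", "data/slides.pdf"]
#
# (the filenames above are placeholders)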


def create_vector_db():
    """Create or update the vector database from new PDF documents only."""
    # Load the set of already processed files
    processed_files = load_processed_files()

    # Load documents (one per PDF page) from the data directory
    loader = PyPDFDirectoryLoader(DATA_DIR)
    all_documents = loader.load()

    # Keep only pages whose source file has not been processed yet
    new_documents = [doc for doc in all_documents if doc.metadata["source"] not in processed_files]
    if not new_documents:
        print("No new documents to process.")
        return
    print(f"Processing {len(new_documents)} new PDF pages.")

    # Split the documents into overlapping chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=5000, chunk_overlap=500)
    texts = text_splitter.split_documents(new_documents)

    # Create or update the persistent vector store with the new chunks
    vector_store = Chroma.from_documents(
        documents=texts,
        embedding=HuggingFaceInferenceAPIEmbeddings(
            api_key=inference_api_key,
            model_name="sentence-transformers/all-MiniLM-L6-v2",
        ),
        persist_directory=VECTOR_STORE_DIR,
    )
    # Chroma persists automatically when persist_directory is set;
    # the old explicit vector_store.persist() call is deprecated.
    print("Vector store updated.")

    # Record the newly processed source files
    processed_files.update(doc.metadata["source"] for doc in new_documents)
    save_processed_files(processed_files)
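

# A minimal sketch of how the persisted store could be queried later. It is
# not called by main(); it assumes the same embedding model used above, and
# the query string and k are illustrative parameters.
def query_vector_db(query, k=3):
    """Return the k chunks most similar to `query` from the persisted store."""
    embeddings = HuggingFaceInferenceAPIEmbeddings(
        api_key=inference_api_key,
        model_name="sentence-transformers/all-MiniLM-L6-v2",
    )
    # Reopen the existing store from disk instead of rebuilding it
    vector_store = Chroma(
        persist_directory=VECTOR_STORE_DIR,
        embedding_function=embeddings,
    )
    return vector_store.similarity_search(query, k=k)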


def main():
    """Main function to create or update the vector database."""
    create_vector_db()


if __name__ == "__main__":
    main()