Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP, pass one at populate-json recipe #294

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions recipes/natural_language_processing/text-to-json/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Where to fetch the model from, what to call it, and which directory holds it.
# All three can be overridden from the environment or the make command line.
MODEL_URL ?=
MODEL_NAME ?=
MODEL_DIR ?= models

# Download $(MODEL_URL) to $(MODEL_DIR)/$(MODEL_NAME).
# curl writes to a .tmp file that is moved into place only after curl succeeds,
# so an interrupted transfer never masquerades as a complete model.
# NOTE(review): any failure of curl or mv falls through to the rm, which also
# deletes a previously downloaded model - confirm this is intended.
.PHONY: download-model
download-model:
	mkdir -p $(MODEL_DIR)
	curl -H "Cache-Control: no-cache" -s -S -L -f $(MODEL_URL) -z $(MODEL_DIR)/$(MODEL_NAME) -o $(MODEL_DIR)/$(MODEL_NAME).tmp && \
	mv -f $(MODEL_DIR)/$(MODEL_NAME).tmp $(MODEL_DIR)/$(MODEL_NAME) 2>/dev/null || \
	rm -f $(MODEL_DIR)/$(MODEL_NAME).tmp $(MODEL_DIR)/$(MODEL_NAME)

# Convenience wrapper: download the default Mistral model via download-model.
# $(MAKE) is used for the recursive call so flags such as -n and -j propagate.
.PHONY: download-model-mistral # default model
download-model-mistral:
	MODEL_NAME=mistral-7b-instruct-v0.1.Q4_K_M.gguf MODEL_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf $(MAKE) -f Makefile download-model
7 changes: 7 additions & 0 deletions recipes/natural_language_processing/text-to-json/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Steps:

1. begin local dev
2. Work on datasources
- connect with SRE teams to figure out ways we could get SRE tickets normalized into a training dataset easily ingested by the model
- scrape Stack Overflow, Stack Exchange, and Medium for training data
3. deploy with langserve
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
aiohttp==3.9.5
aiosignal==1.3.1
annotated-types==0.6.0
anyio==4.3.0
attrs==23.2.0
certifi==2024.2.2
charset-normalizer==3.3.2
dataclasses-json==0.6.4
fastapi==0.110.2
frozenlist==1.4.1
h11==0.14.0
httpcore==1.0.5
httpx==0.27.0
idna==3.7
jsonpatch==1.33
jsonpointer==2.4
langchain==0.1.16
langchain-community==0.0.34
langchain-core==0.1.45
langchain-text-splitters==0.0.1
langserve==0.1.0
langsmith==0.1.49
marshmallow==3.21.1
multidict==6.0.5
mypy-extensions==1.0.0
numpy==1.26.4
orjson==3.10.1
packaging==23.2
pathlib==1.0.1
pydantic==2.7.0
pydantic_core==2.18.1
PyYAML==6.0.1
requests==2.31.0
sniffio==1.3.1
SQLAlchemy==2.0.29
starlette==0.37.2
tenacity==8.2.3
typing-inspect==0.9.0
typing_extensions==4.11.0
urllib3==2.2.1
validators==0.28.1
yarl==1.9.4
76 changes: 76 additions & 0 deletions recipes/natural_language_processing/text-to-json/source/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from langchain_core.callbacks import StreamingStdOutCallbackHandler
# from langchain.chains import LLMChain
# from langchain_core.prompts.prompt import PromptTemplate
from langchain_openai import OpenAI
from langchain_core.output_parsers import JsonOutputParser
from fastapi import FastAPI
import tiktoken
import os
import json
# import streamlit as st
from pprint import pprint
from pathlib import Path
# from langchain_text_splitters import RecursiveJsonSplitter

# FastAPI application instance; the /populate_json_schema route defined in
# this file is registered on it.
app = FastAPI()
# Model-server settings. Each value can be overridden through the environment;
# the second argument is the fallback used when the variable is unset.
model_path = os.environ.get("MODEL_PATH", "/locallm/models")
model_name = os.environ.get("MODEL_NAME", "mistral-7b-instruct-v0.1.Q4_K_M.gguf")
model_port = os.environ.get("MODEL_PORT", 8001)
model_server_ip = os.environ.get("MODEL_SERVER_ENDPOINT", "http://localhost")
revision = os.environ.get("MODEL_REVISION", "no_timm")

# Derived values: path of the model file and the model-server URLs.
model = "/".join((model_path, model_name))
# model_port is an int when the fallback is used, so stringify before joining.
base_model_service = ":".join((model_server_ip, str(model_port)))
v1_model_service = base_model_service + "/v1"

def log_template(string: str) -> None:
    """Pretty-print *string* to stdout between two banner lines."""
    opening_banner = "==================== PROMPT TEMPLATE ===================="
    closing_banner = "================== END PROMPT TEMPLATE =================="
    print(opening_banner)
    pprint(string)
    print(closing_banner)

# App function
def count_tokens(string: str) -> int:
    """Return the number of tokens in *string* under tiktoken's ``p50k_base`` encoding."""
    tokenizer = tiktoken.get_encoding("p50k_base")
    return len(tokenizer.encode(string))

# MS Function
def initialize_model_client() -> OpenAI:
    """Create the ``OpenAI`` client that talks to the configured model service.

    The client targets ``v1_model_service`` and streams tokens to stdout
    through a ``StreamingStdOutCallbackHandler``.

    Returns:
        The configured ``OpenAI`` client.
    """
    # Options tried earlier and left disabled: verbose, schema_json, streaming.
    client_settings = {
        "base_url": v1_model_service,
        "api_key": "sk-no-key-required",
        "tiktoken_model_name": "mistral",
        "temperature": 0.9,
        "callbacks": [StreamingStdOutCallbackHandler()],
        "max_tokens": 4000,
    }
    return OpenAI(**client_settings)

@app.post("/populate_json_schema")
def no_download_json_chain(model: OpenAI, file_name: str, input: str):
    """Ask the model to populate a JSON schema from free-form text.

    The schema is read from the ``schemas`` directory (relative to the
    current working directory), embedded in a prompt together with *input*,
    and the prompt is sent to *model*. Token counts and the final prompt are
    printed to stdout.

    Args:
        model: Client whose ``invoke`` method receives the finished prompt.
        file_name: Bare name of a schema file inside ``schemas``.
        input: Text describing the object to generate.

    Returns:
        Whatever ``model.invoke`` returns for the prompt.

    Raises:
        ValueError: If *file_name* is empty or contains any path component.
        OSError: If the schema file cannot be opened.
        json.JSONDecodeError: If the schema file is not valid JSON.
    """
    # file_name comes from the request, so refuse anything that could point
    # outside schemas/ ("..", absolute paths, nested directories).
    if file_name in ("", ".", "..") or Path(file_name).name != file_name:
        raise ValueError(f"invalid schema file name: {file_name!r}")

    with open(Path("schemas") / file_name, "r", encoding="utf-8") as f:
        json_schema = json.load(f)
    # Round-trip through json so the prompt gets a compact single-line schema.
    json_schema_string = json.dumps(json_schema)
    print("schema tokens: ", count_tokens(json_schema_string))
    # Chunk splitting (RecursiveJsonSplitter) was dropped in favour of a model
    # with a larger input window, so the whole schema goes into one prompt.
    template = """You are a world class engineer, who specializes in generating JSON objects. You will be provided text describing something, along with a JSON schema. Generate a JSON object from the schema based on the content of the text you are provided. Considering all possible cases, including but not limited to, text input missing the fields required in the schema, and irrelevant sections of the text input.
%JSON schema
{json_schema_string}
%User input:
{input}"""
    # The schema and user text are substituted as values, so braces inside
    # them are not interpreted as further placeholders.
    template = template.format(json_schema_string=json_schema_string, input=input)
    log_template(template)
    print("template tokens: ", count_tokens(template))
    return model.invoke(template)

# Build the shared model client when the module is loaded.
model_client = initialize_model_client()

# Manual smoke test against the employee schema; the result is pretty-printed.
# An earlier variant used ("fruit.json", "A red banana.").
# NOTE(review): this runs on every import of the module, including when the
# FastAPI app is served - confirm that is intended.
test = no_download_json_chain(
    model_client,
    "employee.json",
    "My name is Gregory Pereira. I work in the Emereging Technologies "
    "department and the Platform and Services team. I like apples, and "
    "long walks on the beach.",
)
pprint(test)
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Car",
"type": "object",
"properties": {
"make": {
"type": "string",
"description": "The make or manufacturer of the car"
},
"model": {
"type": "string",
"description": "The model of the car"
},
"year": {
"type": "integer",
"minimum": 1900,
"maximum": 2024,
"description": "The manufacturing year of the car"
},
"color": {
"type": "string",
"description": "The color of the car"
},
"mileage": {
"type": "number",
"minimum": 0,
"description": "The mileage of the car in kilometers"
},
"price": {
"type": "number",
"minimum": 0,
"description": "The price of the car in USD"
}
},
"required": ["make", "model", "year", "color", "mileage", "price"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Computer",
"type": "object",
"properties": {
"brand": {
"type": "string",
"description": "The brand or manufacturer of the computer"
},
"model": {
"type": "string",
"description": "The model of the computer"
},
"processor": {
"type": "object",
"description": "Details about the processor of the computer",
"properties": {
"manufacturer": {
"type": "string",
"description": "The manufacturer of the processor"
},
"model": {
"type": "string",
"description": "The model of the processor"
},
"cores": {
"type": "integer",
"minimum": 1,
"description": "The number of processor cores"
},
"clock_speed": {
"type": "number",
"minimum": 0,
"description": "The clock speed of the processor in GHz"
}
},
"required": ["manufacturer", "model", "cores", "clock_speed"]
},
"ram": {
"type": "object",
"description": "Details about the RAM of the computer",
"properties": {
"size_gb": {
"type": "number",
"minimum": 0,
"description": "The size of RAM in gigabytes"
},
"type": {
"type": "string",
"description": "The type of RAM (e.g., DDR4)"
}
},
"required": ["size_gb", "type"]
},
"storage": {
"type": "object",
"description": "Details about the storage of the computer",
"properties": {
"type": {
"type": "string",
"description": "The type of storage (e.g., SSD, HDD)"
},
"capacity_gb": {
"type": "number",
"minimum": 0,
"description": "The capacity of storage in gigabytes"
}
},
"required": ["type", "capacity_gb"]
},
"price": {
"type": "number",
"minimum": 0,
"description": "The price of the computer in USD"
}
},
"required": ["brand", "model", "processor", "ram", "storage", "price"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"$id": "https://github.com/containers/ai-lab-recipes/recipes/json-to-text/employee.schema.json",
"title": "employee",
"description": "Acme's Employee Information",
"type": "object",
"properties": {
"name": {
"description": "The employee's full name",
"type": "string"
},
"employeeId": {
"description": "The unique identifier for a product",
"type": "integer"
},
"title": {
"description": "An Identifier for what position the employee holds within the company",
"type": "string"
},
"manager": {
"description": "Who sits above the employee in the Org chart and is responsible for managing them.",
"type": "object",
"items": {
"$ref": "#"
}
},
"teams": {
"description": "Which products, services or other initiatives is this employee responsible for contributing to.",
"type": "array",
"items": {
"type": "object",
"properties": {
"teamName": {
"description": "A name used to refer to and distinguish between teams.",
"type": "string"
},
"teamId": {
"description": "A unique integer used to identify a team.",
"type": "integer"
},
"leader": {
"description": "The one who is responsbile for guiding the team.",
"items": {
"$ref": "#"
}
},
"description": {
"description": "A short blurb giving information on the team.",
"type": "string"
}
},
"required": ["teamName", "teamId"]
},
"minItems": 1,
"uniqueItems": true
}
},
"required": ["employeeId", "title", "teams"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Fruit",
"type": "object",
"properties": {
"name": {
"type": "string",
"description": "The name of the fruit"
},
"color": {
"type": "string",
"description": "The color of the fruit"
}
},
"required": ["name", "color"]
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import validators.url
from urllib.error import URLError, HTTPError
import urllib.request

def download_json_file(url: str, file_name: str) -> bool:
    """Download the document at *url* and save it as *file_name*.

    This is a best-effort helper: problems are reported on stdout instead of
    being raised, and the return value tells the caller whether a file was
    actually written.

    Args:
        url: Address to fetch; it must pass ``validators.url``.
        file_name: Local path the response body is written to.

    Returns:
        ``True`` if the download completed, ``False`` if the URL was rejected
        or the transfer failed.
    """
    # Guard clause: an invalid URL used to be ignored silently.
    if not validators.url(url):
        print(f"Invalid URL: {url}")
        return False

    try:
        urllib.request.urlretrieve(url, file_name)
    except HTTPError as e:
        # HTTPError subclasses URLError, so it has to be handled first.
        print(f"HTTP Error: {e.code}, {e.reason}")
    except URLError as e:
        print(f"URL Error: {e.reason}")
    except Exception as e:
        # Deliberately broad: the helper reports any failure and never raises.
        print(f"An unexpected error occurred: {e}")
    else:
        return True
    return False