containers · Gregory-Pereira · Apr 21, 2024 · Apr 26, 2024 · Apr 27, 2024
@@ -0,0 +1,13 @@
+# URL the model file is downloaded from; empty unless the caller sets it.
+MODEL_URL ?=
+# File name of the model inside MODEL_DIR; empty unless the caller sets it.
+MODEL_NAME ?=
+# Directory the downloaded model file is written to.
+MODEL_DIR ?= models
+
+.PHONY: download-model
+# Download $(MODEL_URL) to $(MODEL_DIR)/$(MODEL_NAME).
+# curl writes to a .tmp file that is moved over the final name only after the
+# transfer succeeds; -z restricts the transfer to a remote copy newer than the
+# existing local file. If any step fails, both the .tmp file and the model
+# file are removed.
+download-model:
+	mkdir -p $(MODEL_DIR) && \
+	curl -H "Cache-Control: no-cache" -s -S -L -f $(MODEL_URL) -z $(MODEL_DIR)/$(MODEL_NAME) -o $(MODEL_DIR)/$(MODEL_NAME).tmp && \
+	mv -f $(MODEL_DIR)/$(MODEL_NAME).tmp $(MODEL_DIR)/$(MODEL_NAME) 2>/dev/null || \
+	rm -f $(MODEL_DIR)/$(MODEL_NAME).tmp $(MODEL_DIR)/$(MODEL_NAME)
+
+.PHONY: download-model-mistral # default model
+# Run the generic download-model target with the Mistral 7B Instruct GGUF file
+# name and URL. $(MAKE) is used instead of a literal `make` so the recursive
+# call runs the same make program and inherits its flags.
+download-model-mistral:
+	MODEL_NAME=mistral-7b-instruct-v0.1.Q4_K_M.gguf MODEL_URL=https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_K_M.gguf $(MAKE) -f Makefile download-model
@@ -0,0 +1,7 @@
+Steps:
+
+1. begin local dev
+2. Work on datasources 
+    - connect with SRE teams to figure out ways we could get SRE tickets normalized into a training dataset easily ingested by the model
+    - scrape stackoverflow, stackexchange, and medium for training data
+3. deploy with langserve
@@ -0,0 +1,42 @@
+aiohttp==3.9.5
+aiosignal==1.3.1
+annotated-types==0.6.0
+anyio==4.3.0
+attrs==23.2.0
+certifi==2024.2.2
+charset-normalizer==3.3.2
+dataclasses-json==0.6.4
+fastapi==0.110.2
+frozenlist==1.4.1
+h11==0.14.0
+httpcore==1.0.5
+httpx==0.27.0
+idna==3.7
+jsonpatch==1.33
+jsonpointer==2.4
+langchain==0.1.16
+langchain-community==0.0.34
+langchain-core==0.1.45
+langchain-text-splitters==0.0.1
+langserve==0.1.0
+langsmith==0.1.49
+marshmallow==3.21.1
+multidict==6.0.5
+mypy-extensions==1.0.0
+numpy==1.26.4
+orjson==3.10.1
+packaging==23.2
+pathlib==1.0.1
+pydantic==2.7.0
+pydantic_core==2.18.1
+PyYAML==6.0.1
+requests==2.31.0
+sniffio==1.3.1
+SQLAlchemy==2.0.29
+starlette==0.37.2
+tenacity==8.2.3
+typing-inspect==0.9.0
+typing_extensions==4.11.0
+urllib3==2.2.1
+validators==0.28.1
+yarl==1.9.4
@@ -0,0 +1,76 @@
+from langchain_core.callbacks import StreamingStdOutCallbackHandler
+# from langchain.chains import LLMChain
+# from langchain_core.prompts.prompt import PromptTemplate
+from langchain_openai import OpenAI
+from langchain_core.output_parsers import JsonOutputParser
+from fastapi import FastAPI
+import tiktoken
+import os
+import json
+# import streamlit as st
+from pprint import pprint
+from pathlib import Path
+# from langchain_text_splitters import RecursiveJsonSplitter
+
+# FastAPI application on which the route in this module is registered.
+app = FastAPI()
+# Model location and model-server address; each value can be overridden
+# through the environment variable named in the os.getenv call.
+model_path = os.getenv("MODEL_PATH", default="/locallm/models")
+model_name = os.getenv("MODEL_NAME", default="mistral-7b-instruct-v0.1.Q4_K_M.gguf")
+# NOTE: the fallback is the int 8001 while a value read from the environment
+# is a str; it is only interpolated into the URL f-string below.
+model_port = os.getenv("MODEL_PORT", default=8001)
+# Despite the variable name, the default holds a scheme plus host (no port).
+model_server_ip = os.getenv("MODEL_SERVER_ENDPOINT", default="http://localhost")
+# Path of the model file, built from the directory and file name above.
+model = f"{model_path}/{model_name}"
+base_model_service = f"{model_server_ip}:{model_port}"
+# URL passed as base_url when the OpenAI client is constructed.
+v1_model_service = f"{base_model_service}/v1"
+
+revision = os.getenv("MODEL_REVISION", default="no_timm")
+
+def log_template(string: str) -> None:
+    print("==================== PROMPT TEMPLATE ====================")
+    pprint(string)
+    print("================== END PROMPT TEMPLATE ==================")
+
+# App function
+def count_tokens(string: str) -> int:
+    encoding_name = "p50k_base"
+    encoding = tiktoken.get_encoding(encoding_name)
+    num_tokens = len(encoding.encode(string))
+    return num_tokens
+
+# MS Function
+def initialize_model_client() -> OpenAI:
+    callbacks = [StreamingStdOutCallbackHandler()]
+    openai_client = OpenAI(
+        base_url=v1_model_service,
+        api_key = "sk-no-key-required",
+        tiktoken_model_name="mistral",
+        temperature=0.9,
+        callbacks=callbacks,
+        # verbose=True,
+        # schema_json=True,
+        max_tokens=4000,
+        # streaming=True
+    )
+    return openai_client
+
+@app.post("/populate_json_schema")
+def no_download_json_chain(model: OpenAI, file_name: str, input: str):
+    with open (f"schemas/{file_name}", "r") as f:
+        json_schema = json.load(f)
+        json_schema_string = json.dumps(json_schema)
+    print("schema tokens: ", count_tokens(json_schema_string))
+            # dropping chunk splitting --> moving to a model with bigger token input
+            # splitter = RecursiveJsonSplitter(max_chunk_size=300) 
+            # json_chunks = splitter.split_json(json_data=json_data)
+    template = """You are a world class engineer, who specializes in generating JSON objects. You will be provided text describing something, along with a JSON schema. Generate a JSON object from the schema based on the content of the text you are provided. Considering all possible cases, including but not limited to, text input missing the fields required in the schema, and irrelevant sections of the text input.
+    %JSON schema
+    {json_schema_string}
+    %User input:
+    {input}"""
+    template = template.format(json_schema_string=json_schema_string, input=input)
+    log_template(template)
+    print("template tokens: ", count_tokens(template))
+    return model.invoke(template)
+
+# NOTE(review): the statements below run whenever this module is imported, so
+# loading it builds a client and invokes the model once.
+model_client = initialize_model_client()
+# no_download_json_chain(model_client, "fruit.json", "A red banana.")
+# Smoke run: fill the employee schema from a free-text description and print the reply.
+test = no_download_json_chain(model_client, "employee.json", "My name is Gregory Pereira. I work in the Emereging Technologies department and the Platform and Services team. I like apples, and long walks on the beach.")
+pprint(test)
@@ -0,0 +1,36 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "title": "Car",
+    "type": "object",
+    "properties": {
+      "make": {
+        "type": "string",
+        "description": "The make or manufacturer of the car"
+      },
+      "model": {
+        "type": "string",
+        "description": "The model of the car"
+      },
+      "year": {
+        "type": "integer",
+        "minimum": 1900,
+        "maximum": 2024,
+        "description": "The manufacturing year of the car"
+      },
+      "color": {
+        "type": "string",
+        "description": "The color of the car"
+      },
+      "mileage": {
+        "type": "number",
+        "minimum": 0,
+        "description": "The mileage of the car in kilometers"
+      },
+      "price": {
+        "type": "number",
+        "minimum": 0,
+        "description": "The price of the car in USD"
+      }
+    },
+    "required": ["make", "model", "year", "color", "mileage", "price"]
+}
@@ -0,0 +1,78 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "title": "Computer",
+    "type": "object",
+    "properties": {
+      "brand": {
+        "type": "string",
+        "description": "The brand or manufacturer of the computer"
+      },
+      "model": {
+        "type": "string",
+        "description": "The model of the computer"
+      },
+      "processor": {
+        "type": "object",
+        "description": "Details about the processor of the computer",
+        "properties": {
+          "manufacturer": {
+            "type": "string",
+            "description": "The manufacturer of the processor"
+          },
+          "model": {
+            "type": "string",
+            "description": "The model of the processor"
+          },
+          "cores": {
+            "type": "integer",
+            "minimum": 1,
+            "description": "The number of processor cores"
+          },
+          "clock_speed": {
+            "type": "number",
+            "minimum": 0,
+            "description": "The clock speed of the processor in GHz"
+          }
+        },
+        "required": ["manufacturer", "model", "cores", "clock_speed"]
+      },
+      "ram": {
+        "type": "object",
+        "description": "Details about the RAM of the computer",
+        "properties": {
+          "size_gb": {
+            "type": "number",
+            "minimum": 0,
+            "description": "The size of RAM in gigabytes"
+          },
+          "type": {
+            "type": "string",
+            "description": "The type of RAM (e.g., DDR4)"
+          }
+        },
+        "required": ["size_gb", "type"]
+      },
+      "storage": {
+        "type": "object",
+        "description": "Details about the storage of the computer",
+        "properties": {
+          "type": {
+            "type": "string",
+            "description": "The type of storage (e.g., SSD, HDD)"
+          },
+          "capacity_gb": {
+            "type": "number",
+            "minimum": 0,
+            "description": "The capacity of storage in gigabytes"
+          }
+        },
+        "required": ["type", "capacity_gb"]
+      },
+      "price": {
+        "type": "number",
+        "minimum": 0,
+        "description": "The price of the computer in USD"
+      }
+    },
+    "required": ["brand", "model", "processor", "ram", "storage", "price"]
+  }
@@ -0,0 +1,59 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "$id": "https://github.com/containers/ai-lab-recipes/recipes/json-to-text/employee.schema.json",
+    "title": "employee",
+    "description": "Acme's Employee Information",
+    "type": "object",
+    "properties": {
+        "name": {
+            "description": "The employee's full name",
+            "type": "string"
+        },
+        "employeeId": {
+            "description": "The unique identifier for an employee",
+            "type": "integer"
+      },
+        "title": {
+            "description": "An Identifier for what position the employee holds within the company",
+            "type": "string"
+      },
+        "manager": {
+            "description": "Who sits above the employee in the Org chart and is responsible for managing them.",
+            "type": "object",
+            "items": { 
+              "$ref": "#" 
+            }
+      },
+        "teams": {
+            "description": "Which products, services or other initiatives is this employee responsible for contributing to.",
+            "type": "array",
+            "items": {
+                "type": "object",
+                "properties": {
+                    "teamName": {
+                        "description": "A name used to refer to and distinguish between teams.",
+                        "type": "string"
+                    },
+                    "teamId": {
+                        "description": "A unique integer used to identify a team.",
+                        "type": "integer"
+                    },
+                    "leader": {
+                        "description": "The one who is responsible for guiding the team.",
+                        "items": { 
+                          "$ref": "#" 
+                        }
+                    },
+                    "description": {
+                        "description": "A short blurb giving information on the team.",
+                        "type": "string"
+                    }
+                },
+                "required": ["teamName", "teamId"]
+            },
+        "minItems": 1,
+        "uniqueItems": true
+        }
+    },
+    "required": ["employeeId", "title", "teams"]
+}
@@ -0,0 +1,16 @@
+{
+    "$schema": "http://json-schema.org/draft-07/schema#",
+    "title": "Fruit",
+    "type": "object",
+    "properties": {
+      "name": {
+        "type": "string",
+        "description": "The name of the fruit"
+      },
+      "color": {
+        "type": "string",
+        "description": "The color of the fruit"
+      }
+    },
+    "required": ["name", "color"]
+}
@@ -0,0 +1,14 @@
+import validators.url
+from urllib.error import URLError, HTTPError
+import urllib.request
+
+def download_json_file(url: str, file_name: str):
+    if validators.url(url):
+        try:
+            urllib.request.urlretrieve(url, file_name)
+        except HTTPError as e:
+            print(f"HTTP Error: {e.code}, {e.reason}")
+        except URLError as e:
+            print(f"URL Error: {e.reason}")
+        except Exception as e:
+            print(f"An unexpected error occurred: {e}")