Skip to content

Commit

Permalink
Lupickup/rag/register local mlindex example (#2649)
Browse files Browse the repository at this point in the history
* Add example of registering a locally produced MLIndex.

* Update to use public azure-ai-ml and more examples of registering local MLIndex.

* Add custom crack_and_chunk with document_intelligence example.

* Apply formatting changes

* Various suggested fixes.

---------

Co-authored-by: Lucas Pickup <[email protected]>
  • Loading branch information
tot0 and Lucas Pickup authored Sep 13, 2023
1 parent 875c20b commit 110d25b
Show file tree
Hide file tree
Showing 15 changed files with 825 additions and 31 deletions.
4 changes: 2 additions & 2 deletions sdk/python/generative-ai/rag/code_first/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ Read more about their structure [here](./docs/mlindex.md).
## Pre-requisites

0. Install `azure-ai-ml` and `azureml-rag`:
- `pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/`
- `pip install -U 'azureml-rag[document_parsing,faiss,cognitive_search]>=0.2.0'`
- `pip install 'azure-ai-ml>=1.10'`
- `pip install 'azureml-rag[document_parsing,faiss,cognitive_search]>=0.2.0'`
1. You have unstructured data.
- In one of [AzureMLs supported data sources](https://learn.microsoft.com/azure/machine-learning/concept-data?view=azureml-api-2): Blob, ADLSgen2, OneLake, S3, Git
- In any of these supported file formats: md, txt, py, pdf, ppt(x), doc(x)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
# # Local Documents to Azure Cognitive Search Index

# %% Prerequisites
# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
# %pip install 'azureml-rag[faiss]>=0.2.0'
# %pip install 'promptflow[azure]' promptflow-tools promptflow-vectordb

# %% Authenticate to your AzureML Workspace; download a `config.json` from the top-right-hand corner menu of the Workspace.
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,8 @@
# %%[markdown]
# # Local Documents to Azure Cognitive Search Index

# %% Prerequisites
# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
# %pip install 'azureml-rag[cognitive_search]>=0.2.0'

# %% Authenticate to your AzureML Workspace; download a `config.json` from the top-right-hand corner menu of the Workspace.
from azureml.rag.dataindex import DataIndex
from azure.ai.ml import MLClient, load_data
from azure.identity import DefaultAzureCredential

Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
# %%[markdown]
# # S3 via OneLake to Azure Cognitive Search Index

# %% Prerequisites
# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
# %pip install 'azureml-rag[cognitive_search]>=0.2.0'

# %% Authenticate to an AzureML Workspace, you can download a `config.json` from the top-right-hand corner menu of a Workspace.
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
# %%[markdown]
# # S3 via OneLake to Azure Cognitive Search Index

# %% Prerequisites
# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
# %pip install 'azureml-rag[cognitive_search]>=0.2.0'

# %% Authenticate to an AzureML Workspace, you can download a `config.json` from the top-right-hand corner menu of a Workspace.
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
# # Build an ACS Index using langchain data loaders and MLIndex SDK

# %% Pre-requisites
# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
# %pip install 'azureml-rag[cognitive_search]>=0.2.0'
# %pip install wikipedia

# %% Get Azure Cognitive Search Connection
Expand Down Expand Up @@ -33,6 +31,7 @@
# %%
from azureml.rag.mlindex import MLIndex

mlindex_output_path = "./hunter_x_hunter_aoai_acs"
# Process data into FAISS Index using HuggingFace embeddings
mlindex = MLIndex.from_documents(
documents=split_docs,
Expand All @@ -42,9 +41,31 @@
index_type="acs",
index_connection=acs_connection,
index_config={"index_name": "hunter_x_hunter_aoai_acs"},
output_path=mlindex_output_path,
)

# %% Query documents, use with inferencing framework
index = mlindex.as_langchain_vectorstore()
docs = index.similarity_search("What is bungie gum?", k=5)
print(docs)

# %% Register local MLIndex as remote asset
from azure.ai.ml.entities import Data

asset_name = "hunter_x_hunter_aoai_acs_mlindex"
asset = ml_client.data.create_or_update(
Data(
name=asset_name,
version="1",
path=mlindex_output_path,
description="MLIndex Documentation Embedded using Azure OpenAI indexed using Azure Cognitive Search.",
properties={
"azureml.mlIndexAssetKind": "acs",
"azureml.mlIndexAsset": "true",
"azureml.mlIndexAssetSource": "Local Data",
"azureml.mlIndexAssetPipelineRunId": "Local",
},
)
)

print(asset)
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
# %%[markdown]
# # Build an ACS Index using MLIndex SDK

# %% Pre-requisites
# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
# %pip install 'azureml-rag[document_parsing,cognitive_search]>=0.2.0'

# %% Get Azure Cognitive Search Connection
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential
Expand All @@ -17,6 +13,7 @@
# %%
from azureml.rag.mlindex import MLIndex

mlindex_output_path = "./acs_open_ai_index"
# Process data into FAISS Index using HuggingFace embeddings
mlindex = MLIndex.from_files(
source_uri="../",
Expand All @@ -28,15 +25,39 @@
index_type="acs",
index_connection=acs_connection,
index_config={"index_name": "mlindex_docs_aoai_acs"},
output_path="./acs_open_ai_index",
output_path=mlindex_output_path,
)

# %% Load MLIndex from local
from azureml.rag.mlindex import MLIndex

mlindex = MLIndex("./acs_open_ai_index")
mlindex = MLIndex(mlindex_output_path)

# %% Query documents, use with inferencing framework
index = mlindex.as_langchain_vectorstore()
docs = index.similarity_search("Topic in my data.", k=5)
print(docs)

# %% Register local MLIndex as remote asset
from azure.ai.ml.entities import Data

# TODO: MLIndex should help registering FAISS as asset with all the properties.
asset_name = "mlindex_docs_aoai_acs_mlindex"
asset = ml_client.data.create_or_update(
Data(
name=asset_name,
version="1",
path=mlindex_output_path,
description="MLIndex Documentation Embedded using Azure OpenAI indexed using Azure Cognitive Search.",
properties={
"azureml.mlIndexAssetKind": "acs",
"azureml.mlIndexAsset": "true",
"azureml.mlIndexAssetSource": "Local Data",
"azureml.mlIndexAssetPipelineRunId": "Local",
},
)
)

print(asset)

# %%
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
# %%[markdown]
# # Build a Faiss Index using MLIndex SDK

# %% Pre-requisites
# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
# %pip install 'azureml-rag[document_parsing,faiss,hugging_face]>=0.2.0'

# %%
from azureml.rag.mlindex import MLIndex

Expand All @@ -13,7 +9,6 @@
source_uri="../",
source_glob="**/*",
chunk_size=200,
# embeddings_model=sentence_transformers.SentenceTransformer('sentence-transformers/all-mpnet-base-v2'),
embeddings_model="hugging_face://model/sentence-transformers/all-mpnet-base-v2",
embeddings_container="./.embeddings_cache/mlindex_docs_mpnet_faiss",
index_type="faiss",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# %%[markdown]
# # Build a Faiss Index using MLIndex SDK and use it in Promptflow

# %% Pre-requisites
# %pip install 'azure-ai-ml==1.10.0a20230825006' --extra-index-url https://pkgs.dev.azure.com/azure-sdk/public/_packaging/azure-sdk-for-python/pypi/simple/
# %pip install 'azureml-rag[document_parsing,faiss]>=0.2.0'
# %pip install -U 'promptflow[azure]' promptflow-tools promptflow-vectordb

# %% Get Azure Cognitive Search Connection
Expand Down
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import os
import traceback

from azureml.rag.documents.cracking import file_extension_loaders
from azureml.rag.tasks.crack_and_chunk import (
__main__,
crack_and_chunk_arg_parser,
str2bool,
)
from azureml.rag.tasks.crack_and_chunk import main as main_crack_and_chunk
from azureml.rag.utils.connections import get_connection_by_id_v2
from azureml.rag.utils.logging import (
get_logger,
safe_mlflow_start_run,
track_activity,
)

from azure.ai.formrecognizer import DocumentAnalysisClient
from azure.core.credentials import AzureKeyCredential
from document_intelligence_loader import DocumentIntelligencePDFLoader

logger = get_logger("crack_and_chunk_document_intelligence")


def main(args, logger, activity_logger):
    """Crack and chunk documents, using Azure AI Document Intelligence for PDFs.

    Resolves the Document Intelligence connection named by
    ``args.doc_intel_connection_id``, configures a shared
    ``DocumentAnalysisClient`` on ``DocumentIntelligencePDFLoader``, overrides
    the default ``.pdf`` loader, then delegates to the stock
    ``crack_and_chunk`` main.

    Args:
        args: Parsed CLI arguments; must carry ``doc_intel_connection_id``
            and ``use_layout``.
        logger: Logger passed through to ``main_crack_and_chunk``.
        activity_logger: Activity logger passed through to
            ``main_crack_and_chunk``.

    Raises:
        ValueError: If ``args.doc_intel_connection_id`` is falsy.
    """
    # Guard clause: without a connection there is no way to reach the
    # Document Intelligence service, so fail fast.
    if not args.doc_intel_connection_id:
        raise ValueError("doc_intel_connection_id is required")

    document_intelligence_connection = get_connection_by_id_v2(
        args.doc_intel_connection_id
    )

    # Hoist the deeply nested lookups once; both the environment variables
    # and the client below need the same endpoint and key.
    endpoint = document_intelligence_connection["properties"]["metadata"]["endpoint"]
    api_key = document_intelligence_connection["properties"]["credentials"]["keys"][
        "api_key"
    ]

    # Expose the connection via environment variables as well, for any code
    # path that reads its configuration from the environment rather than the
    # shared client configured below.
    os.environ["DOCUMENT_INTELLIGENCE_ENDPOINT"] = endpoint
    os.environ["DOCUMENT_INTELLIGENCE_KEY"] = api_key
    os.environ["AZURE_AI_DOCUMENT_INTELLIGENCE_USE_LAYOUT"] = str(args.use_layout)

    DocumentIntelligencePDFLoader.document_intelligence_client = DocumentAnalysisClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(api_key),
    )
    DocumentIntelligencePDFLoader.use_layout = args.use_layout

    # Override default `.pdf` loader to use Azure AI Document Intelligence
    file_extension_loaders[".pdf"] = DocumentIntelligencePDFLoader

    main_crack_and_chunk(args, logger, activity_logger)


def main_wrapper(args, logger):
    """Run `main` under activity tracking and an MLflow run, re-raising failures.

    Any exception is logged (with traceback) against the activity before being
    propagated so the pipeline step is marked failed.
    """
    activity_name = "crack_and_chunk_document_intelligence"
    with track_activity(logger, activity_name) as activity_logger:
        with safe_mlflow_start_run(logger=logger):
            try:
                main(args, logger, activity_logger)
            except Exception:
                activity_logger.error(
                    f"crack_and_chunk_document_intelligence failed with exception: {traceback.format_exc()}"
                )
                raise


if __name__ == "__main__":
parser = crack_and_chunk_arg_parser()

parser.add_argument(
"--doc_intel_connection_id",
type=str,
help="Custom Connection to use for Document Intelligence",
)
parser.add_argument(
"--use_layout", type=str2bool, default=False, help="Use layout for PDF cracking"
)

__main__(parser, main_wrapper)
Loading

0 comments on commit 110d25b

Please sign in to comment.