-
Notifications
You must be signed in to change notification settings - Fork 1.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Lupickup/rag/register local mlindex example (#2649)
* Add example of registering a locally produced MLIndex. * Update to use public azure-ai-ml and more examples of registering local MLIndex. * Add custom crack_and_chunk with document_intelligence example. * Apply formatting changes * Various suggested fixes. --------- Co-authored-by: Lucas Pickup <[email protected]>
- Loading branch information
Showing
15 changed files
with
825 additions
and
31 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
5 changes: 1 addition & 4 deletions
5
sdk/python/generative-ai/rag/code_first/data_index_job/local_docs_to_acs_mlindex.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
4 changes: 0 additions & 4 deletions
4
sdk/python/generative-ai/rag/code_first/data_index_job/s3_to_acs_mlindex.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
4 changes: 0 additions & 4 deletions
4
sdk/python/generative-ai/rag/code_first/data_index_job/scheduled_s3_to_asc_mlindex.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
5 changes: 3 additions & 2 deletions
5
...generative-ai/rag/code_first/mlindex_local/local_docs_to_faiss_mlindex_with_promptflow.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file added
BIN
+175 KB
...-ai/rag/notebooks/custom_crack_and_chunk/assets/custom_doc_intel_connection.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
86 changes: 86 additions & 0 deletions
86
...ai/rag/notebooks/custom_crack_and_chunk/crack_and_chunk_with_doc_intel/crack_and_chunk.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
import os | ||
import traceback | ||
|
||
from azureml.rag.documents.cracking import file_extension_loaders | ||
from azureml.rag.tasks.crack_and_chunk import ( | ||
__main__, | ||
crack_and_chunk_arg_parser, | ||
str2bool, | ||
) | ||
from azureml.rag.tasks.crack_and_chunk import main as main_crack_and_chunk | ||
from azureml.rag.utils.connections import get_connection_by_id_v2 | ||
from azureml.rag.utils.logging import ( | ||
get_logger, | ||
safe_mlflow_start_run, | ||
track_activity, | ||
) | ||
|
||
from azure.ai.formrecognizer import DocumentAnalysisClient | ||
from azure.core.credentials import AzureKeyCredential | ||
from document_intelligence_loader import DocumentIntelligencePDFLoader | ||
|
||
logger = get_logger("crack_and_chunk_document_intelligence") | ||
|
||
|
||
def main(args, logger, activity_logger):
    """Run crack-and-chunk with `.pdf` files routed to Document Intelligence.

    Resolves the connection named by ``args.doc_intel_connection_id``,
    exports its endpoint and key (plus the ``use_layout`` flag) as
    environment variables, attaches a ``DocumentAnalysisClient`` to
    ``DocumentIntelligencePDFLoader``, registers that loader for the
    ``.pdf`` extension, and then delegates to the stock crack-and-chunk
    ``main``.

    Args:
        args: Parsed command-line arguments. Must provide
            ``doc_intel_connection_id`` and ``use_layout``; the remaining
            attributes are consumed by the delegated crack-and-chunk main.
        logger: Logger forwarded unchanged to the delegated main.
        activity_logger: Activity logger forwarded unchanged to the
            delegated main.

    Raises:
        ValueError: If ``args.doc_intel_connection_id`` is missing or empty.
    """
    # Validate up front so nothing is mutated (env vars, loader class
    # attributes, loader registry) when the required argument is absent.
    if not args.doc_intel_connection_id:
        raise ValueError("doc_intel_connection_id is required")

    connection = get_connection_by_id_v2(args.doc_intel_connection_id)
    # Assumes the connection is a mapping shaped like
    # properties -> metadata -> endpoint and
    # properties -> credentials -> keys -> api_key; a missing entry
    # surfaces as the underlying lookup error.
    properties = connection["properties"]

    # Each value is read once and reused for both the environment variable
    # and the client below. The read/export order matches the original
    # sequence so a partially-formed connection fails at the same point.
    endpoint = properties["metadata"]["endpoint"]
    os.environ["DOCUMENT_INTELLIGENCE_ENDPOINT"] = endpoint
    api_key = properties["credentials"]["keys"]["api_key"]
    os.environ["DOCUMENT_INTELLIGENCE_KEY"] = api_key
    os.environ["AZURE_AI_DOCUMENT_INTELLIGENCE_USE_LAYOUT"] = str(args.use_layout)

    # The client and layout flag are stored on the loader class itself, so
    # they are shared by every use of the class.
    DocumentIntelligencePDFLoader.document_intelligence_client = DocumentAnalysisClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(api_key),
    )
    DocumentIntelligencePDFLoader.use_layout = args.use_layout

    # Override default `.pdf` loader to use Azure AI Document Intelligence
    file_extension_loaders[".pdf"] = DocumentIntelligencePDFLoader

    main_crack_and_chunk(args, logger, activity_logger)
|
||
|
||
def main_wrapper(args, logger):
    """Run ``main`` inside activity tracking and an MLflow run.

    Any exception raised by ``main`` is reported on the activity logger
    together with its formatted traceback and then re-raised unchanged.

    Args:
        args: Parsed command-line arguments, forwarded to ``main``.
        logger: Logger used for activity tracking, the MLflow run and
            ``main`` itself.
    """
    activity_name = "crack_and_chunk_document_intelligence"
    # Activity tracking is entered first and exited last, wrapping the run.
    with track_activity(logger, activity_name) as activity_logger:
        with safe_mlflow_start_run(logger=logger):
            try:
                main(args, logger, activity_logger)
            except Exception:
                # Record the failure on the activity before propagating it.
                activity_logger.error(
                    f"crack_and_chunk_document_intelligence failed with exception: {traceback.format_exc()}"
                )
                raise
|
||
|
||
if __name__ == "__main__":
    # Start from the stock crack-and-chunk parser and add the two options
    # specific to the Document Intelligence variant.
    parser = crack_and_chunk_arg_parser()

    parser.add_argument(
        "--doc_intel_connection_id",
        help="Custom Connection to use for Document Intelligence",
        type=str,
    )
    parser.add_argument(
        "--use_layout",
        help="Use layout for PDF cracking",
        default=False,
        type=str2bool,
    )

    # Hand the parser and wrapped entry point to the package's runner.
    __main__(parser, main_wrapper)
Oops, something went wrong.