sdk/python/readme.py

# imports
import contextlib
import os
import json
import glob
import argparse
import hashlib

from configparser import ConfigParser

# define constants
ENABLE_MANUAL_CALLING = True  # defines whether the workflow can be invoked or not
NOT_TESTED_NOTEBOOKS = [
    "datastore",
    "mlflow-model-local-inference-test",
    "multicloud-configuration",
    "debug-online-endpoints-locally-in-visual-studio-code",
    "train-hyperparameter-tune-with-sklearn",
    "train-hyperparameter-tune-deploy-with-keras",
    "train-hyperparameter-tune-deploy-with-tensorflow",
    "interactive_data_wrangling",
    # mlflow SDK samples notebooks
    "mlflow_sdk_online_endpoints_progresive",
    "mlflow_sdk_online_endpoints",
    "mlflow_sdk_web_service",
    "scoring_to_mlmodel",
    "track_with_databricks_deploy_aml",
    "model_management",
    "run_history",
    "keras_mnist_with_mlflow",
    "logging_and_customizing_models",
    "xgboost_classification_mlflow",
    "xgboost_nested_runs",
    "xgboost_service_principal",
    "using_mlflow_rest_api",
    "yolov5/tutorial",
    "4.Provision-feature-store",
]  # cannot automate lets exclude
NOT_SCHEDULED_NOTEBOOKS = []  # these are too expensive, lets not run everyday
# define branch where we need this
# use if running on a release candidate, else make it empty
READONLY_HEADER = "# This code is autogenerated.\
\n# Code is generated by running custom script: python3 readme.py\
\n# Any manual changes to this file may cause incorrect behavior.\
\n# Any manual changes will be overwritten if the code is regenerated.\n"
BRANCH = "main"  # default - do not change
# BRANCH = "sdk-preview"  # this should be deleted when this branch is merged to main
GITHUB_CONCURRENCY_GROUP = (
    "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
)
USE_FORECAST_REQUIREMENTS = "USE_FORECAST_REQUIREMENTS"
COMPUTE_NAMES = "COMPUTE_NAMES"


def main(args):
    # get list of notebooks
    notebooks = sorted(glob.glob("**/*.ipynb", recursive=True))

    # write workflows
    write_workflows(notebooks)

    # modify notebooks
    modify_notebooks(notebooks)

    # write readme
    write_readme(notebooks)

    # write pipeline readme
    pipeline_dir = "jobs" + os.sep + "pipelines" + os.sep
    with change_working_dir(pipeline_dir):
        pipeline_notebooks = sorted(glob.glob("**/*.ipynb", recursive=True))
    pipeline_notebooks = [
        f"{pipeline_dir}{notebook}" for notebook in pipeline_notebooks
    ]
    write_readme(pipeline_notebooks, pipeline_folder=pipeline_dir)


def write_workflows(notebooks):
    print("writing .github/workflows...")
    cfg = ConfigParser()
    cfg.read(os.path.join("notebooks_config.ini"))
    for notebook in notebooks:
        notebook_path = notebook.replace(os.sep, "/")
        if not any(excluded in notebook_path for excluded in NOT_TESTED_NOTEBOOKS):
            # get notebook name
            name = os.path.basename(notebook).replace(".ipynb", "")
            folder = os.path.dirname(notebook)
            classification = folder.replace(os.sep, "-")

            enable_scheduled_runs = True
            if any(excluded in notebook_path for excluded in NOT_SCHEDULED_NOTEBOOKS):
                enable_scheduled_runs = False

            # write workflow file
            write_notebook_workflow(
                notebook, name, classification, folder, enable_scheduled_runs, cfg
            )
    print("finished writing .github/workflows")


def get_additional_requirements(req_name, req_path):
    return f"""
    - name: pip install {req_name} reqs
      run: pip install -r {req_path}"""


def get_mlflow_import(notebook, validation_yml):
    with open(notebook, "r", encoding="utf-8") as f:
        string_file = f.read()
        if (
            validation_yml
            or "import mlflow" in string_file
            or "from mlflow" in string_file
        ):
            return get_additional_requirements(
                "mlflow", "sdk/python/mlflow-requirements.txt"
            )
        else:
            return ""


def get_forecast_reqs(notebook_name, nb_config):
    is_required = int(
        nb_config.get(
            section=notebook_name, option=USE_FORECAST_REQUIREMENTS, fallback=0
        )
    )
    if is_required:
        return get_additional_requirements(
            "forecasting", "sdk/python/forecasting-requirements.txt"
        )
    else:
        return ""


def get_validation_yml(notebook_folder, notebook_name):
    validation_yml = ""
    validation_json_file_name = os.path.join(
        "..",
        "..",
        ".github",
        "test",
        "sdk",
        notebook_name.replace(".ipynb", ".json"),
    )

    if os.path.exists(validation_json_file_name):
        with open(validation_json_file_name, "r") as json_file:
            validation_file = json.load(json_file)
            for validation in validation_file["validations"]:
                validation_yml += get_validation_check_yml(
                    notebook_folder, notebook_name, validation
                )

    return validation_yml


def get_validation_check_yml(notebook_folder, notebook_name, validation):
    validation_name = validation["name"]
    validation_file_name = validation_name.replace(" ", "_")
    notebook_output_file = (
        os.path.basename(notebook_name).replace(".", ".output.").replace(os.sep, "/")
    )
    notebook_folder = notebook_folder.replace(os.sep, "/")
    full_folder_name = f"sdk/python/{notebook_folder}"
    github_workspace = "${{ github.workspace }}"

    check_yml = f"""
    - name: {validation_name}
      run: |
         python {github_workspace}/.github/test/scripts/{validation_file_name}.py \\
                --file_name {notebook_output_file} \\
                --folder . \\"""

    for param_name, param_value in validation["params"].items():
        if type(param_value) is list:
            check_yml += f"""
                --{param_name} \\"""

            for param_item in param_value:
                param_item_value = param_item.replace("\n", "\\n")
                check_yml += f"""
                  \"{param_item_value}\" \\"""
        else:
            check_yml += f"""
                --{param_name} {param_value} \\"""

    check_yml += f"""
      working-directory: {full_folder_name} \\"""

    return check_yml[:-2]


def write_notebook_workflow(
    notebook, name, classification, folder, enable_scheduled_runs, nb_config
):
    is_pipeline_notebook = ("jobs-pipelines" in classification) or (
        "assets-component" in classification
    )
    is_spark_notebook_sample = ("jobs-spark" in classification) or ("_spark_" in name)
    is_featurestore_sample = "featurestore_sample" in classification
    # Duplicate name in working directory during checkout
    # https://github.com/actions/checkout/issues/739
    github_workspace = "${{ github.workspace }}"
    forecast_import = get_forecast_reqs(name, nb_config)
    posix_folder = folder.replace(os.sep, "/")
    posix_notebook = notebook.replace(os.sep, "/")

    # Schedule notebooks at different times to reduce maximum quota usage.
    name_hash = int(hashlib.sha512(name.encode()).hexdigest(), 16)
    schedule_minute = name_hash % 60
    hours_between_runs = 12
    schedule_hour = (name_hash // 60) % hours_between_runs

    validation_yml = get_validation_yml(folder, notebook)
    mlflow_import = get_mlflow_import(notebook, validation_yml)

    workflow_yaml = f"""{READONLY_HEADER}
name: sdk-{classification}-{name}
# This file is created by sdk/python/readme.py.
# Please do not edit directly.
on:\n"""
    if ENABLE_MANUAL_CALLING:
        workflow_yaml += f"""  workflow_dispatch:\n"""
    if enable_scheduled_runs:
        workflow_yaml += f"""  schedule:
    - cron: "{schedule_minute} {schedule_hour}/{hours_between_runs} * * *"\n"""
    workflow_yaml += f"""  pull_request:
    branches:
      - main\n"""
    if BRANCH != "main":
        workflow_yaml += f"""      - {BRANCH}\n"""
        if is_pipeline_notebook:
            workflow_yaml += "      - pipeline/*\n"
    workflow_yaml += f"""    paths:
      - sdk/python/{posix_folder}/**
      - .github/workflows/sdk-{classification}-{name}.yml
      - sdk/python/dev-requirements.txt
      - infra/bootstrapping/**
      - sdk/python/setup.sh\n"""
    if is_featurestore_sample:
        workflow_yaml += f"""      - sdk/python/featurestore_sample/**"""
    workflow_yaml += f"""
permissions:
  id-token: write
concurrency:
  group: {GITHUB_CONCURRENCY_GROUP}
  cancel-in-progress: true
jobs:
  build:
    runs-on: ubuntu-latest
    steps:
    - name: check out repo
      uses: actions/checkout@v2
    - name: setup python
      uses: actions/setup-python@v2
      with:
        python-version: "3.10"
    - name: pip install notebook reqs
      run: pip install --no-cache-dir -r sdk/python/dev-requirements.txt{mlflow_import}{forecast_import}
    - name: azure login
      uses: azure/login@v1
      with:
        client-id: ${{{{ secrets.OIDC_AZURE_CLIENT_ID }}}}
        tenant-id: ${{{{ secrets.OIDC_AZURE_TENANT_ID }}}}
        subscription-id: ${{{{ secrets.OIDC_AZURE_SUBSCRIPTION_ID }}}}
    - name: bootstrap resources
      run: |
          echo '{GITHUB_CONCURRENCY_GROUP}';
          bash bootstrap.sh
      working-directory: infra/bootstrapping
      continue-on-error: false
    - name: setup SDK
      run: |
          source "{github_workspace}/infra/bootstrapping/sdk_helpers.sh";
          source "{github_workspace}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
      working-directory: sdk/python
      continue-on-error: true
    - name: validate readme
      run: |
          python check-readme.py "{github_workspace}" "{github_workspace}/sdk/python/{posix_folder}"
      working-directory: infra/bootstrapping
      continue-on-error: false
    - name: setup-cli
      run: |
          source "{github_workspace}/infra/bootstrapping/sdk_helpers.sh";
          source "{github_workspace}/infra/bootstrapping/init_environment.sh";
          bash setup.sh
      working-directory: cli
      continue-on-error: true
    - name: Eagerly cache access tokens for required scopes
      run: |
          # Workaround for azure-cli's lack of support for ID token refresh
          # Taken from: https://github.com/Azure/login/issues/372#issuecomment-2056289617

          # Management
          az account get-access-token --scope https://management.azure.com/.default --output none
          # ML
          az account get-access-token --scope https://ml.azure.com/.default --output none\n"""
    if is_spark_notebook_sample:
        workflow_yaml += get_spark_config_workflow(posix_folder, name)
    if is_featurestore_sample:
        workflow_yaml += get_featurestore_config_workflow(posix_folder, name)
    workflow_yaml += f"""    - name: run {posix_notebook}
      run: |
          source "{github_workspace}/infra/bootstrapping/sdk_helpers.sh";
          source "{github_workspace}/infra/bootstrapping/init_environment.sh";
          bash "{github_workspace}/infra/bootstrapping/sdk_helpers.sh" generate_workspace_config "../../.azureml/config.json";
          bash "{github_workspace}/infra/bootstrapping/sdk_helpers.sh" replace_template_values "{name}.ipynb";
          [ -f "../../.azureml/config" ] && cat "../../.azureml/config";"""

    if name == "debug-online-endpoints-locally-in-visual-studio-code":
        workflow_yaml += f"""
          sed -i -e "s/<ENDPOINT_NAME>/localendpoint/g" {name}.ipynb

          # Create a dummy executable for VSCode
          mkdir -p /tmp/code
          touch /tmp/code/code
          chmod +x /tmp/code/code
          export PATH="/tmp/code:$PATH"\n"""

    papermill_option = ""
    if "endpoints-batch" in classification:
        papermill_option = " --log-output"

    if not ("automl" in folder):
        workflow_yaml += f"""
          papermill -k python {name}.ipynb {name}.output.ipynb{papermill_option}
      working-directory: sdk/python/{posix_folder}"""
    elif "nlp" in folder or "image" in folder:
        # need GPU cluster, so override the compute cluster name to dedicated
        workflow_yaml += f"""
          papermill -k python -p compute_name automl-gpu-cluster {name}.ipynb {name}.output.ipynb
      working-directory: sdk/python/{posix_folder}"""
    else:
        # need CPU cluster, so override the compute cluster name to dedicated
        workflow_yaml += f"""
          papermill -k python -p compute_name automl-cpu-cluster {name}.ipynb {name}.output.ipynb
      working-directory: sdk/python/{posix_folder}"""

    if name == "connections":
        workflow_yaml += """
      env:
        ACR_USERNAME: ${{ secrets.ACR_USERNAME }}
        ACR_PASSWORD: ${{ secrets.ACR_PASSWORD }}
        GIT_PAT: ${{ secrets.GIT_PAT }}
        PYTHON_FEED_SAS: ${{ secrets.PYTHON_FEED_SAS }}"""

    workflow_yaml += validation_yml

    workflow_yaml += f"""
    - name: upload notebook's working folder as an artifact
      if: ${{{{ always() }}}}
      uses: ./.github/actions/upload-artifact
      with:
        name: {name}
        path: sdk/python/{posix_folder}\n"""

    if nb_config.get(section=name, option=COMPUTE_NAMES, fallback=None):
        workflow_yaml += f"""
    - name: Remove the compute if notebook did not done it properly.
      run: bash "{github_workspace}/infra/bootstrapping/remove_computes.sh" {nb_config.get(section=name, option=COMPUTE_NAMES)}\n"""

    workflow_file = os.path.join(
        "..", "..", ".github", "workflows", f"sdk-{classification}-{name}.yml"
    )
    workflow_before = ""
    if os.path.exists(workflow_file):
        with open(workflow_file, "r") as f:
            workflow_before = f.read()

    if workflow_yaml != workflow_before:
        # write workflow
        with open(workflow_file, "w") as f:
            f.write(workflow_yaml)


def write_readme(notebooks, pipeline_folder=None):
    prefix = "prefix.md"
    suffix = "suffix.md"
    readme_file = "README.md"
    if pipeline_folder:
        prefix = os.path.join(pipeline_folder, prefix)
        suffix = os.path.join(pipeline_folder, suffix)
        readme_file = os.path.join(pipeline_folder, readme_file)

    if BRANCH == "":
        branch = "main"
    else:
        branch = BRANCH
        # read in prefix.md and suffix.md
        with open(prefix, "r") as f:
            prefix = f.read()
        with open(suffix, "r") as f:
            suffix = f.read()

        # define markdown tables
        notebook_table = f"Test Status is for branch - **_{branch}_**\n|Area|Sub-Area|Notebook|Description|Status|\n|--|--|--|--|--|\n"
        for notebook in notebooks:
            # get notebook name
            name = notebook.split(os.sep)[-1].replace(".ipynb", "")
            area = notebook.split(os.sep)[0]
            sub_area = notebook.split(os.sep)[1]
            folder = os.path.dirname(notebook)
            classification = folder.replace(os.sep, "-")

            try:
                # read in notebook
                with open(notebook, "r", encoding="utf-8") as f:
                    data = json.load(f)

                description = "*no description*"
                try:
                    if data["metadata"]["description"] is not None:
                        description = data["metadata"]["description"]["description"]
                except BaseException:
                    pass
            except BaseException:
                print("Could not load", notebook)
                pass

            if any(excluded in notebook for excluded in NOT_TESTED_NOTEBOOKS):
                description += " - _This sample is excluded from automated tests_"
            if any(excluded in notebook for excluded in NOT_SCHEDULED_NOTEBOOKS):
                description += " - _This sample is only tested on demand_"

            if pipeline_folder:
                notebook = os.path.relpath(notebook, pipeline_folder)

            # write workflow file
            notebook_table += (
                write_readme_row(
                    branch,
                    notebook.replace(os.sep, "/"),
                    name,
                    classification,
                    area,
                    sub_area,
                    description,
                )
                + "\n"
            )

        print("writing README.md...")
        with open(readme_file, "w") as f:
            f.write(prefix + notebook_table + suffix)
        print("finished writing README.md")


def write_readme_row(
    branch, notebook, name, classification, area, sub_area, description
):
    gh_link = "https://github.com/Azure/azureml-examples/actions/workflows"

    nb_name = f"[{name}]({notebook})"
    status = f"[![{name}]({gh_link}/sdk-{classification}-{name}.yml/badge.svg?branch={branch})]({gh_link}/sdk-{classification}-{name}.yml)"

    row = f"|{area}|{sub_area}|{nb_name}|{description}|{status}|"
    return row


def modify_notebooks(notebooks):
    print("modifying notebooks...")
    # setup variables
    kernelspec = {
        "display_name": "Python 3.10 - SDK V2",
        "language": "python",
        "name": "python310-sdkv2",
    }

    # for each notebooks
    for notebook in notebooks:
        # read in notebook
        with open(notebook, "r", encoding="utf-8") as f:
            data = json.load(f)

        # update metadata
        data["metadata"]["kernelspec"] = kernelspec

        # write notebook
        with open(notebook, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=1, ensure_ascii=False)
            f.write("\n")

    print("finished modifying notebooks...")


def get_spark_config_workflow(folder_name, file_name):
    workflow = f"""    - name: setup spark resources
      run: |
          bash -x jobs/spark/setup_spark.sh jobs/spark/ {folder_name}/{file_name}.ipynb
      working-directory: sdk/python
      continue-on-error: true\n"""

    return workflow


def get_featurestore_config_workflow(folder_name, file_name):
    is_sdk_noteobook = "_sdk_" in file_name
    is_cli_notebook = "_cli_" in file_name
    is_vnet_notebook = "_vnet_" in file_name
    workflow = f"""    - name: setup feature-store resources"""
    if is_sdk_noteobook:
        workflow += f"""
      run: |
          bash -x automation-test/setup-resources.sh automation-test/{file_name}.ipynb
      working-directory: sdk/python/featurestore_sample
      continue-on-error: true\n"""
    if is_cli_notebook:
        workflow += f"""
      run: |
          bash -x automation-test/setup-resources-cli.sh automation-test/{file_name}.ipynb
      working-directory: sdk/python/featurestore_sample
      continue-on-error: true\n"""
    if is_vnet_notebook:
        workflow += f"""
      run: |
          bash -x automation-test/setup-resources-vnet.sh automation-test/{file_name}.ipynb
      working-directory: sdk/python/featurestore_sample
      continue-on-error: true\n"""

    return workflow


@contextlib.contextmanager
def change_working_dir(path):
    """Context manager for changing the current working directory"""

    saved_path = os.getcwd()
    os.chdir(str(path))
    try:
        yield
    finally:
        os.chdir(saved_path)


# run functions
if __name__ == "__main__":
    # setup argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--check-readme", type=bool, default=False)
    args = parser.parse_args()

    # call main
    main(args)