From fd290cd54c20b24886a423097df34c97fe64f301 Mon Sep 17 00:00:00 2001 From: Christoph Kuhnke Date: Tue, 20 Aug 2024 10:16:48 +0200 Subject: [PATCH] #313 enabled fix differing versions of dependency scikit learn (#316) * #313: Enabled fixing differing versions of dependency scikit-learn * Added functionality to notebook sklearn_fix_version * Updated notebook test * Fixed sk_train to remove feature labels * [CodeBuild] --- doc/changes/changes_2.1.0.md | 1 + .../sklearn/sklearn_fix_version.ipynb | 132 ++++++++++++++++++ .../sklearn/sklearn_introduction.ipynb | 11 +- .../sklearn/sklearn_train_telescope.ipynb | 2 +- test/notebooks/nbtest_sklearn.py | 2 + 5 files changed, 142 insertions(+), 6 deletions(-) create mode 100644 exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_fix_version.ipynb diff --git a/doc/changes/changes_2.1.0.md b/doc/changes/changes_2.1.0.md index 27864642..93de0e17 100644 --- a/doc/changes/changes_2.1.0.md +++ b/doc/changes/changes_2.1.0.md @@ -27,6 +27,7 @@ Version: 2.1.0 ## Bug Fixes * #303: Fixed AWS Codebuild +* #313: Fixed differing versions of dependency `scikit-learn` ## Documentation diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_fix_version.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_fix_version.ipynb new file mode 100644 index 00000000..cce1d24f --- /dev/null +++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_fix_version.ipynb @@ -0,0 +1,132 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "289d2a8c-953d-46e5-8c73-ad810c29b20f", + "metadata": {}, + "source": [ + "# Fix the Version of Python Library Scikit-learn\n", + "\n", + "This notebook ensures the AI-Lab is using the same version of the python library `scikit-learn` as the one used by the built-in [Script Language Container 
(SLC)](https://docs.exasol.com/db/latest/database_concepts/udf_scripts/adding_new_packages_script_languages.htm#ScriptLanguageContainer) inside the Exasol database.\n", + "\n", + "## Rationale\n", + "\n", + "Using identical versions is required when transferring the Scikit-learn model from the AI-Lab to the database SLC.\n", + "\n", + "The AI-Lab serializes the Scikit-learn model with [pickle](https://docs.python.org/3/library/pickle.html) and uploads it into the BucketFS of the database. The UDF using the built-in SLC can only _deserialize_ the model if it is using the same version of Scikit-learn as was used for serializing it. The specific version of the library available in the built-in SLC depends on the release version of the database and cannot be controlled by the AI-Lab.\n", + "\n", + "Running the following script will update the version of the library used in the AI-Lab, if required.\n", + "\n", + "## Open Secure Configuration Storage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d86ca808-044e-4fbd-be30-5ba8324f501e", + "metadata": {}, + "outputs": [], + "source": [ + "%run ../utils/access_store_ui.ipynb\n", + "display(get_access_store_ui('../'))" + ] + }, + { + "cell_type": "markdown", + "id": "055ed302-69aa-426c-b5ec-861c63b82d33", + "metadata": {}, + "source": [ + "## Detect the Version of Scikit-learn Used in the SLC\n", + "\n", + "The following cell creates a User Defined Function (UDF) called `detect_scikit_learn_version()` and then executes the UDF using the built-in SLC via an SQL statement.\n", + "\n", + "The UDF queries and returns the version of Scikit-learn available in the built-in SLC, which is then stored in the variable `slc_scikit_learn_version`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fa6c628f-853e-4850-8bab-46f7f645856e", + "metadata": {}, + "outputs": [], + "source": [ + "import textwrap\n", + "from exasol.nb_connector.connections import open_pyexasol_connection\n", + "\n", + "sql = textwrap.dedent(\"\"\"\n", + "CREATE OR REPLACE PYTHON3 SCALAR SCRIPT {schema!q}.detect_scikit_learn_version() RETURNS VARCHAR(100) AS\n", + "import sklearn\n", + "def run(ctx):\n", + " return sklearn.__version__ \n", + "/\n", + "\"\"\")\n", + "\n", + "with open_pyexasol_connection(ai_lab_config, compression=True) as conn:\n", + " query_params={'schema': ai_lab_config.db_schema}\n", + " conn.execute(sql, query_params)\n", + " result = conn.execute(\"select {schema!q}.detect_scikit_learn_version()\", query_params).fetchone()\n", + " slc_scikit_learn_version = result[0]" + ] + }, + { + "cell_type": "markdown", + "id": "e4b0dc24-6e02-4305-8fa1-15f68afac360", + "metadata": {}, + "source": [ + "## Compare the Scikit-learn Version and Update the AI-Lab if Required\n", + "\n", + "The next cell compares the Scikit-learn version returned by the UDF with the Scikit-learn version in the AI-Lab environment. If they differ, then the cell installs the UDF's Scikit-learn version in the AI-Lab environment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "50b88871-4c37-4cc1-ac85-841a22e98153", + "metadata": {}, + "outputs": [], + "source": [ + "import sklearn\n", + "from importlib import reload\n", + "\n", + "my_version = sklearn.__version__\n", + "\n", + "if slc_scikit_learn_version == my_version:\n", + " print(f\"AI-Lab scikit-learn version {my_version} is identical to that of the SLC.\\nNothing to do.\")\n", + "else:\n", + " print(f\"AI-Lab scikit-learn version {my_version} differs from SLC.\\nInstalling version {slc_scikit_learn_version} ...\")\n", + " %pip install \"scikit_learn=={slc_scikit_learn_version}\"\n", + " sklearn = reload(sklearn)\n", + " print(f\"Updated AI-Lab scikit-learn to version {sklearn.__version__}.\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b0ea6891-d171-4841-a2d8-edf8ac252d86", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_introduction.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_introduction.ipynb index b95f5ea8..e6042862 100644 --- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_introduction.ipynb +++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_introduction.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "markdown", - "id": "c2aa3433-eb40-495e-a80e-38ab0bee10cf", + "id": "c7b3a9b5-1b5f-4613-bd8d-6c14f00af33e", "metadata": {}, "source": [ "# 
Introduction\n", @@ -16,9 +16,10 @@ "## Prerequisites\n", "\n", "Before using this set of notebooks the following steps need to be completed:\n", - "1. [Configure the AI-Lab](../main_config.ipynb).\n", - "2. [Load the MAGIC Gamma Telescope data](../data/data_telescope.ipynb).\n", - "3. [Load the Abalone data](../data/data_abalone.ipynb).\n", + "1. [Configure the AI-Lab](../main_config.ipynb)\n", + "2. [Fix the Version of Python Library Scikit-learn](./sklearn_fix_version.ipynb)\n", + "3. [Load the MAGIC Gamma Telescope data](../data/data_telescope.ipynb)\n", + "4. [Load the Abalone data](../data/data_abalone.ipynb)\n", "\n", "## Content\n", "\n", @@ -58,7 +59,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.12" } }, "nbformat": 4, diff --git a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_telescope.ipynb b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_telescope.ipynb index 19cf9998..636eaac0 100644 --- a/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_telescope.ipynb +++ b/exasol/ds/sandbox/runtime/ansible/roles/jupyter/files/notebook/sklearn/sklearn_train_telescope.ipynb @@ -91,7 +91,7 @@ "\n", "# Create and train the model.\n", "model = tree.DecisionTreeClassifier()\n", - "model.fit(X_train, y_train)\n", + "model.fit(X_train.values, y_train)\n", "\n", "print(f\"Training took: {stopwatch}\")" ] diff --git a/test/notebooks/nbtest_sklearn.py b/test/notebooks/nbtest_sklearn.py index c2ce925b..76149778 100644 --- a/test/notebooks/nbtest_sklearn.py +++ b/test/notebooks/nbtest_sklearn.py @@ -22,6 +22,7 @@ def test_regression(notebook_runner) -> None: os.chdir('./data') notebook_runner('data_abalone.ipynb') os.chdir('../sklearn') + notebook_runner('sklearn_fix_version.ipynb') notebook_runner('sklearn_predict_udf.ipynb') notebook_runner('sklearn_train_abalone.ipynb') 
notebook_runner('sklearn_predict_abalone.ipynb') @@ -38,6 +39,7 @@ def test_classification(notebook_runner) -> None: os.chdir('./data') notebook_runner('data_telescope.ipynb') os.chdir('../sklearn') + notebook_runner('sklearn_fix_version.ipynb') notebook_runner('sklearn_predict_udf.ipynb') notebook_runner('sklearn_train_telescope.ipynb') notebook_runner('sklearn_predict_telescope.ipynb')