Migration to azure-ai-evaluation package (#43)
* Convert code to the azure-ai-evaluation SDK
* Update column_mapping to the new nested format (sketched below)
* Update metrics table with new namespace and metrics
* Use new Eval SDK metrics instead of local math metrics
* Write intermediate JSON metrics file
cedricvidal authored Nov 4, 2024
1 parent 33ee9d1 commit abe2e86
Showing 9 changed files with 573 additions and 295 deletions.
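A note on the call shape before the diff: with azure-ai-evaluation, the evaluate() column mapping is nested under a per-evaluator "column_mapping" key, and the quality evaluators take query/response instead of question/answer. A minimal sketch of the migrated call, assuming illustrative file paths, a single relevance evaluator, and placeholder endpoint values (not the exact evaluator set used in this notebook):

# Sketch only: endpoint, deployment, and data path below are placeholders.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, RelevanceEvaluator, evaluate

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    azure_deployment="<judge-deployment>",                      # placeholder
    api_version="2024-06-01",                                   # placeholder
    api_key="<api-key>",                                        # optional when using Entra ID auth
)

result = evaluate(
    data="experiment-eval.answer.student.jsonl",  # illustrative path
    evaluators={"relevance": RelevanceEvaluator(model_config)},
    evaluator_config={
        "default": {
            # The mapping is now nested under "column_mapping" and keyed by query/response.
            "column_mapping": {
                "query": "${data.question}",
                "response": "${data.final_answer}",
                "ground_truth": "${data.gold_final_answer}",
                "context": "${data.context}",
            }
        }
    },
)
print(result["metrics"])  # flat dict of aggregate metrics; this commit dumps it to a JSON file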
141 changes: 83 additions & 58 deletions 4_eval.ipynb
@@ -48,12 +48,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "31e99e68",
"metadata": {},
"outputs": [],
"source": [
"#! pip install promptflow-evals"
"#! pip install openai azure-ai-evaluation azure-identity promptflow-azure"
]
},
{
@@ -96,7 +96,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "31004ebd",
"metadata": {},
"outputs": [],
@@ -132,8 +132,8 @@
"dataset_path_eval_answer_score_baseline = f\"{experiment_dir}/{experiment_name}-eval.answer.score.baseline.jsonl\"\n",
"\n",
"# Scored answer metrics files\n",
"dataset_path_eval_answer_score_metrics_student = f\"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.student.jsonl\"\n",
"dataset_path_eval_answer_score_metrics_baseline = f\"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.baseline.jsonl\"\n",
"dataset_path_eval_answer_score_metrics_student = f\"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.student.json\"\n",
"dataset_path_eval_answer_score_metrics_baseline = f\"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.baseline.json\"\n",
"\n",
"BASELINE_OPENAI_DEPLOYMENT = os.getenv(\"BASELINE_OPENAI_DEPLOYMENT\")\n",
"BASELINE_MODEL_API = os.getenv(\"BASELINE_MODEL_API\")\n",
@@ -154,7 +154,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "83e5f47e",
"metadata": {},
"outputs": [],
@@ -180,7 +180,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "6bdda3d9",
"metadata": {},
"outputs": [],
@@ -194,7 +194,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"id": "fed06cae",
"metadata": {},
"outputs": [],
@@ -214,7 +214,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "85194f3c",
"metadata": {},
"outputs": [],
@@ -230,7 +230,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "76b0827d",
"metadata": {},
"outputs": [],
@@ -251,7 +251,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "8b4a21af",
"metadata": {},
"outputs": [],
@@ -273,7 +273,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "ff092e37",
"metadata": {},
"outputs": [],
@@ -309,22 +309,23 @@
"\n",
"The table below lists all the built-in evaluators we support. In the following sections, we will select a few of these evaluators to demonstrate how to use them.\n",
"\n",
"| Category | Namespace | Evaluator Class | Notes |\n",
"|----------------|--------------------------------------------------|---------------------------|---------------------------------------------------|\n",
"| Quality | promptflow.evals.evaluators | GroundednessEvaluator | Measures how well the answer is entailed by the context and is not hallucinated |\n",
"| | | RelevanceEvaluator | How well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. |\n",
"| | | CoherenceEvaluator | How well all the sentences fit together and sound naturally as a whole. |\n",
"| | | FluencyEvaluator | Quality of individual sentences in the answer, and whether they are well-written and grammatically correct. |\n",
"| | | SimilarityEvaluator | Measures the similarity between the predicted answer and the correct answer |\n",
"| | | F1ScoreEvaluator | F1 score |\n",
"| Content Safety | promptflow.evals.evaluators.content_safety | ViolenceEvaluator | |\n",
"| | | SexualEvaluator | |\n",
"| | | SelfHarmEvaluator | |\n",
"| | | HateUnfairnessEvaluator | |\n",
"| Composite | promptflow.evals.evaluators | QAEvaluator | Built on top of individual quality evaluators. |\n",
"| | | ChatEvaluator | Similar to QAEvaluator but designed for evaluating chat messages. |\n",
"| | | ContentSafetyEvaluator | Built on top of individual content safety evaluators. |\n",
"\n"
"| Category | Evaluator Class | Notes |\n",
"|----------------|---------------------------|---------------------------------------------------|\n",
"| Quality | GroundednessEvaluator | Measures how well the answer is entailed by the context and is not hallucinated |\n",
"| | RelevanceEvaluator | How well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. |\n",
"| | CoherenceEvaluator | How well all the sentences fit together and sound naturally as a whole. |\n",
"| | FluencyEvaluator | Quality of individual sentences in the answer, and whether they are well-written and grammatically correct. |\n",
"| | SimilarityEvaluator | Measures the similarity between the predicted answer and the correct answer |\n",
"| Content Safety | ViolenceEvaluator | |\n",
"| | SexualEvaluator | |\n",
"| | SelfHarmEvaluator | |\n",
"| | HateUnfairnessEvaluator | |\n",
"| Composite | QAEvaluator | Built on top of individual quality evaluators. |\n",
"| | ChatEvaluator | Similar to QAEvaluator but designed for evaluating chat messages. |\n",
"| | ContentSafetyEvaluator | Built on top of individual content safety evaluators. |\n",
"| Math | BleuScoreEvaluator | BLEU Score |\n",
"| | RougeScoreEvaluator | ROUGE Score |\n",
"| | F1ScoreEvaluator | F1 score |\n"
]
},
{
@@ -337,13 +338,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "a219acb2",
"execution_count": 10,
"id": "1a3fa9eb",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from promptflow.core import OpenAIModelConfiguration, AzureOpenAIModelConfiguration\n",
"from azure.ai.evaluation import OpenAIModelConfiguration, AzureOpenAIModelConfiguration\n",
"\n",
"openai_base_url = os.environ.get(\"JUDGE_OPENAI_BASE_URL\")\n",
"azure_endpoint = os.environ.get(\"JUDGE_AZURE_OPENAI_ENDPOINT\")\n",
@@ -372,24 +373,29 @@
" print(f\"azure_deployment={azure_deployment}\")\n",
" print(f\"api_version={api_version}\")\n",
"\n",
" args = {\n",
" 'azure_endpoint': azure_endpoint,\n",
" 'azure_deployment': azure_deployment,\n",
" 'api_version': api_version,\n",
" }\n",
" if api_key:\n",
" args['api_key'] = api_key\n",
"\n",
" # Initialize Azure OpenAI Connection\n",
" model_config = AzureOpenAIModelConfiguration(\n",
" azure_endpoint=azure_endpoint,\n",
" azure_deployment=azure_deployment,\n",
" api_version=api_version,\n",
" api_key=api_key\n",
" )"
" model_config = AzureOpenAIModelConfiguration(args)\n",
"\n",
"else:\n",
" print(\"Couldn't find a judge endpoint environment variable\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"id": "8965ed9d",
"metadata": {},
"outputs": [],
"source": [
"from promptflow.evals.evaluators import *\n",
"from lib.evaluators import *\n",
"from azure.ai.evaluation import CoherenceEvaluator, F1ScoreEvaluator, FluencyEvaluator, GroundednessEvaluator, RelevanceEvaluator, SimilarityEvaluator, BleuScoreEvaluator, RougeScoreEvaluator, RougeType\n",
"\n",
"explanations = {\n",
" \"groundedness\": \"Measures how well the answer is entailed by the context and is not hallucinated\",\n",
@@ -439,7 +445,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"id": "6247d8cb",
"metadata": {},
"outputs": [],
@@ -450,7 +456,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"id": "43d90565",
"metadata": {},
"outputs": [],
@@ -459,8 +465,8 @@
"\n",
"# Running similarity Evaluator on single input row\n",
"similarity_score = evaluators[\"similarity\"](\n",
" question=sample[\"question\"],\n",
" answer=sample[\"final_answer\"],\n",
" query=sample[\"question\"],\n",
" response=sample[\"final_answer\"],\n",
" context=sample[\"context\"],\n",
" ground_truth=sample[\"gold_final_answer\"],\n",
")\n",
@@ -508,7 +514,7 @@
"metadata": {},
"outputs": [],
"source": [
"from promptflow.evals.evaluate import evaluate\n",
"from azure.ai.evaluation import evaluate\n",
"\n",
"def score_dataset(dataset, rows_output_path=None, metrics_output_path=None):\n",
" result = evaluate(\n",
@@ -517,20 +523,23 @@
" # column mapping\n",
" evaluator_config={\n",
" \"default\": {\n",
" \"question\": \"${data.question}\",\n",
" \"answer\": \"${data.final_answer}\",\n",
" \"ground_truth\": \"${data.gold_final_answer}\",\n",
" \"context\": \"${data.context}\",\n",
" \"column_mapping\": {\n",
" \"query\": \"${data.question}\",\n",
" \"response\": \"${data.final_answer}\",\n",
" \"ground_truth\": \"${data.gold_final_answer}\",\n",
" \"context\": \"${data.context}\",\n",
" }\n",
" }\n",
" },\n",
" )\n",
"\n",
" if rows_output_path:\n",
" pd.DataFrame.from_dict(result[\"rows\"]).to_json(rows_output_path, orient=\"records\", lines=True)\n",
"\n",
" #if metrics_output_path:\n",
" # pd.DataFrame.from_dict(result[\"metrics\"]).to_json(metrics_output_path, orient=\"records\", lines=True)\n",
"\n",
" if metrics_output_path:\n",
" import json\n",
" with open(metrics_output_path, \"w\") as f:\n",
" json.dump(result['metrics'], f)\n",
" return result"
]
},
@@ -544,7 +553,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"id": "b4059934",
"metadata": {},
"outputs": [],
@@ -567,7 +576,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"id": "0dd3c12d",
"metadata": {},
"outputs": [],
@@ -619,7 +628,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"id": "5cb28f9a-1f9c-4c2e-8b32-e7da51585f92",
"metadata": {},
"outputs": [],
@@ -661,12 +670,28 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"id": "c42af516-f4b7-4f25-b15b-791d0a9c93b6",
"metadata": {},
"outputs": [],
"source": [
"def read_json(path):\n",
" import json\n",
" with open(path, \"r\") as f:\n",
" return json.load(f)\n",
"\n",
"student_metrics = read_json(dataset_path_eval_answer_score_metrics_student)\n",
"baseline_metrics = read_json(dataset_path_eval_answer_score_metrics_baseline)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "325762f2",
"metadata": {},
"outputs": [],
"source": [
"metrics = pd.DataFrame.from_dict({\"baseline\": baseline_result[\"metrics\"], \"student\": student_result[\"metrics\"]})\n",
"metrics = pd.DataFrame.from_dict({\"baseline\": baseline_metrics, \"student\": student_metrics})\n",
"metrics[\"improvement\"] = (metrics[\"student\"] - metrics[\"baseline\"]) / metrics[\"baseline\"]\n",
"gpt_metric_names = set(filter(lambda e: 'gpt' in e, metrics.index.values))\n",
"gpt_mask = metrics.index.isin(gpt_metric_names)\n",
Expand Down Expand Up @@ -736,7 +761,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": null,
"id": "f3ee94f1",
"metadata": {},
"outputs": [],
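Beyond the changes shown above, the per-row call signature also changes: individual evaluators now take query/response (plus ground_truth) rather than question/answer. A short sketch under that assumption, using an illustrative sample row and the judge model_config from the sketch near the top:

# Sketch: single-row scoring with the renamed keyword arguments.
from azure.ai.evaluation import SimilarityEvaluator, F1ScoreEvaluator

sample = {  # illustrative row shaped like the eval dataset
    "question": "What is the capital of France?",
    "final_answer": "Paris is the capital of France.",
    "gold_final_answer": "Paris",
}

similarity = SimilarityEvaluator(model_config)  # model_config: judge model configuration, as above
similarity_score = similarity(
    query=sample["question"],
    response=sample["final_answer"],
    ground_truth=sample["gold_final_answer"],
)

f1 = F1ScoreEvaluator()  # no judge model needed
f1_score = f1(
    response=sample["final_answer"],
    ground_truth=sample["gold_final_answer"],
)
print(similarity_score, f1_score)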
12 changes: 0 additions & 12 deletions lib/evaluators/__init__.py

This file was deleted.

9 changes: 0 additions & 9 deletions lib/evaluators/_bleu/__init__.py

This file was deleted.
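The deleted lib/evaluators package held the local BLEU/ROUGE math metrics; per the updated table and imports in the notebook diff, those now come from the SDK. A minimal sketch, assuming the SDK's reference-based call signature and illustrative strings:

# Sketch: SDK-provided math metrics replacing the deleted local implementations.
from azure.ai.evaluation import BleuScoreEvaluator, RougeScoreEvaluator, RougeType

bleu = BleuScoreEvaluator()
rouge = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L)

response = "Paris is the capital of France."      # illustrative candidate answer
ground_truth = "The capital of France is Paris."  # illustrative reference

print(bleu(response=response, ground_truth=ground_truth))
print(rouge(response=response, ground_truth=ground_truth))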

