From abe2e867133f7bf597058cee6b123c97bb113de3 Mon Sep 17 00:00:00 2001 From: Cedric Vidal Date: Mon, 4 Nov 2024 09:31:22 -0800 Subject: [PATCH] Migration to azure-ai-evaluation package (#43) * Converting code * Updated column_mapping * Update metrics table with new namespace and metrics * Using new Eval SDK metrics instead of local math metrics * Intermediate JSON metrics file --- 4_eval.ipynb | 141 +++++---- lib/evaluators/__init__.py | 12 - lib/evaluators/_bleu/__init__.py | 9 - lib/evaluators/_bleu/_bleu.py | 72 ----- lib/evaluators/_common/utils.py | 32 -- lib/evaluators/_rouge/__init__.py | 10 - lib/evaluators/_rouge/_rouge.py | 99 ------ output.md | 484 ++++++++++++++++++++++++++++++ utils.py | 9 +- 9 files changed, 573 insertions(+), 295 deletions(-) delete mode 100644 lib/evaluators/__init__.py delete mode 100644 lib/evaluators/_bleu/__init__.py delete mode 100644 lib/evaluators/_bleu/_bleu.py delete mode 100644 lib/evaluators/_common/utils.py delete mode 100644 lib/evaluators/_rouge/__init__.py delete mode 100644 lib/evaluators/_rouge/_rouge.py create mode 100644 output.md diff --git a/4_eval.ipynb b/4_eval.ipynb index d7690dc..7897550 100644 --- a/4_eval.ipynb +++ b/4_eval.ipynb @@ -48,12 +48,12 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "31e99e68", "metadata": {}, "outputs": [], "source": [ - "#! pip install promptflow-evals" + "#! pip install openai azure-ai-evaluation azure-identity promptflow-azure" ] }, { @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "31004ebd", "metadata": {}, "outputs": [], @@ -132,8 +132,8 @@ "dataset_path_eval_answer_score_baseline = f\"{experiment_dir}/{experiment_name}-eval.answer.score.baseline.jsonl\"\n", "\n", "# Scored answer metrics files\n", - "dataset_path_eval_answer_score_metrics_student = f\"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.student.jsonl\"\n", - "dataset_path_eval_answer_score_metrics_baseline = f\"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.baseline.jsonl\"\n", + "dataset_path_eval_answer_score_metrics_student = f\"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.student.json\"\n", + "dataset_path_eval_answer_score_metrics_baseline = f\"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.baseline.json\"\n", "\n", "BASELINE_OPENAI_DEPLOYMENT = os.getenv(\"BASELINE_OPENAI_DEPLOYMENT\")\n", "BASELINE_MODEL_API = os.getenv(\"BASELINE_MODEL_API\")\n", @@ -154,7 +154,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "83e5f47e", "metadata": {}, "outputs": [], @@ -180,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "6bdda3d9", "metadata": {}, "outputs": [], @@ -194,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "fed06cae", "metadata": {}, "outputs": [], @@ -214,7 +214,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "85194f3c", "metadata": {}, "outputs": [], @@ -230,7 +230,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "76b0827d", "metadata": {}, "outputs": [], @@ -251,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "8b4a21af", "metadata": {}, "outputs": [], @@ -273,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "ff092e37", "metadata": {}, "outputs": [], @@ -309,22 +309,23 @@ "\n", "The 
table below lists all the built-in evaluators we support. In the following sections, we will select a few of these evaluators to demonstrate how to use them.\n", "\n", - "| Category | Namespace | Evaluator Class | Notes |\n", - "|----------------|--------------------------------------------------|---------------------------|---------------------------------------------------|\n", - "| Quality | promptflow.evals.evaluators | GroundednessEvaluator | Measures how well the answer is entailed by the context and is not hallucinated |\n", - "| | | RelevanceEvaluator | How well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. |\n", - "| | | CoherenceEvaluator | How well all the sentences fit together and sound naturally as a whole. |\n", - "| | | FluencyEvaluator | Quality of individual sentences in the answer, and whether they are well-written and grammatically correct. |\n", - "| | | SimilarityEvaluator | Measures the similarity between the predicted answer and the correct answer |\n", - "| | | F1ScoreEvaluator | F1 score |\n", - "| Content Safety | promptflow.evals.evaluators.content_safety | ViolenceEvaluator | |\n", - "| | | SexualEvaluator | |\n", - "| | | SelfHarmEvaluator | |\n", - "| | | HateUnfairnessEvaluator | |\n", - "| Composite | promptflow.evals.evaluators | QAEvaluator | Built on top of individual quality evaluators. |\n", - "| | | ChatEvaluator | Similar to QAEvaluator but designed for evaluating chat messages. |\n", - "| | | ContentSafetyEvaluator | Built on top of individual content safety evaluators. |\n", - "\n" + "| Category | Evaluator Class | Notes |\n", + "|----------------|---------------------------|---------------------------------------------------|\n", + "| Quality | GroundednessEvaluator | Measures how well the answer is entailed by the context and is not hallucinated |\n", + "| | RelevanceEvaluator | How well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. |\n", + "| | CoherenceEvaluator | How well all the sentences fit together and sound naturally as a whole. |\n", + "| | FluencyEvaluator | Quality of individual sentences in the answer, and whether they are well-written and grammatically correct. |\n", + "| | SimilarityEvaluator | Measures the similarity between the predicted answer and the correct answer |\n", + "| Content Safety | ViolenceEvaluator | |\n", + "| | SexualEvaluator | |\n", + "| | SelfHarmEvaluator | |\n", + "| | HateUnfairnessEvaluator | |\n", + "| Composite | QAEvaluator | Built on top of individual quality evaluators. |\n", + "| | ChatEvaluator | Similar to QAEvaluator but designed for evaluating chat messages. |\n", + "| | ContentSafetyEvaluator | Built on top of individual content safety evaluators. 
|\n", + "| Math | BleuScoreEvaluator | BLEU Score |\n", + "| | RougeScoreEvaluator | ROUGE Score |\n", + "| | F1ScoreEvaluator | F1 score |\n" ] }, { @@ -337,13 +338,13 @@ }, { "cell_type": "code", - "execution_count": null, - "id": "a219acb2", + "execution_count": 10, + "id": "1a3fa9eb", "metadata": {}, "outputs": [], "source": [ "import os\n", - "from promptflow.core import OpenAIModelConfiguration, AzureOpenAIModelConfiguration\n", + "from azure.ai.evaluation import OpenAIModelConfiguration, AzureOpenAIModelConfiguration\n", "\n", "openai_base_url = os.environ.get(\"JUDGE_OPENAI_BASE_URL\")\n", "azure_endpoint = os.environ.get(\"JUDGE_AZURE_OPENAI_ENDPOINT\")\n", @@ -372,24 +373,29 @@ " print(f\"azure_deployment={azure_deployment}\")\n", " print(f\"api_version={api_version}\")\n", "\n", + " args = {\n", + " 'azure_endpoint': azure_endpoint,\n", + " 'azure_deployment': azure_deployment,\n", + " 'api_version': api_version,\n", + " }\n", + " if api_key:\n", + " args['api_key'] = api_key\n", + "\n", " # Initialize Azure OpenAI Connection\n", - " model_config = AzureOpenAIModelConfiguration(\n", - " azure_endpoint=azure_endpoint,\n", - " azure_deployment=azure_deployment,\n", - " api_version=api_version,\n", - " api_key=api_key\n", - " )" + " model_config = AzureOpenAIModelConfiguration(args)\n", + "\n", + "else:\n", + " print(\"Couldn't find a judge endpoint environment variable\")" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "8965ed9d", "metadata": {}, "outputs": [], "source": [ - "from promptflow.evals.evaluators import *\n", - "from lib.evaluators import *\n", + "from azure.ai.evaluation import CoherenceEvaluator, F1ScoreEvaluator, FluencyEvaluator, GroundednessEvaluator, RelevanceEvaluator, SimilarityEvaluator, BleuScoreEvaluator, RougeScoreEvaluator, RougeType\n", "\n", "explanations = {\n", " \"groundedness\": \"Measures how well the answer is entailed by the context and is not hallucinated\",\n", @@ -439,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "6247d8cb", "metadata": {}, "outputs": [], @@ -450,7 +456,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "43d90565", "metadata": {}, "outputs": [], @@ -459,8 +465,8 @@ "\n", "# Running similarity Evaluator on single input row\n", "similarity_score = evaluators[\"similarity\"](\n", - " question=sample[\"question\"],\n", - " answer=sample[\"final_answer\"],\n", + " query=sample[\"question\"],\n", + " response=sample[\"final_answer\"],\n", " context=sample[\"context\"],\n", " ground_truth=sample[\"gold_final_answer\"],\n", ")\n", @@ -508,7 +514,7 @@ "metadata": {}, "outputs": [], "source": [ - "from promptflow.evals.evaluate import evaluate\n", + "from azure.ai.evaluation import evaluate\n", "\n", "def score_dataset(dataset, rows_output_path=None, metrics_output_path=None):\n", " result = evaluate(\n", @@ -517,10 +523,12 @@ " # column mapping\n", " evaluator_config={\n", " \"default\": {\n", - " \"question\": \"${data.question}\",\n", - " \"answer\": \"${data.final_answer}\",\n", - " \"ground_truth\": \"${data.gold_final_answer}\",\n", - " \"context\": \"${data.context}\",\n", + " \"column_mapping\": {\n", + " \"query\": \"${data.question}\",\n", + " \"response\": \"${data.final_answer}\",\n", + " \"ground_truth\": \"${data.gold_final_answer}\",\n", + " \"context\": \"${data.context}\",\n", + " }\n", " }\n", " },\n", " )\n", @@ -528,9 +536,10 @@ " if rows_output_path:\n", " 
pd.DataFrame.from_dict(result[\"rows\"]).to_json(rows_output_path, orient=\"records\", lines=True)\n", "\n", - " #if metrics_output_path:\n", - " # pd.DataFrame.from_dict(result[\"metrics\"]).to_json(metrics_output_path, orient=\"records\", lines=True)\n", - "\n", + " if metrics_output_path:\n", + " import json\n", + " with open(metrics_output_path, \"w\") as f:\n", + " json.dump(result['metrics'], f)\n", " return result" ] }, @@ -544,7 +553,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "id": "b4059934", "metadata": {}, "outputs": [], @@ -567,7 +576,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": null, "id": "0dd3c12d", "metadata": {}, "outputs": [], @@ -619,7 +628,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": null, "id": "5cb28f9a-1f9c-4c2e-8b32-e7da51585f92", "metadata": {}, "outputs": [], @@ -661,12 +670,28 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": null, + "id": "c42af516-f4b7-4f25-b15b-791d0a9c93b6", + "metadata": {}, + "outputs": [], + "source": [ + "def read_json(path):\n", + " import json\n", + " with open(path, \"r\") as f:\n", + " return json.load(f)\n", + "\n", + "student_metrics = read_json(dataset_path_eval_answer_score_metrics_student)\n", + "baseline_metrics = read_json(dataset_path_eval_answer_score_metrics_baseline)" + ] + }, + { + "cell_type": "code", + "execution_count": null, "id": "325762f2", "metadata": {}, "outputs": [], "source": [ - "metrics = pd.DataFrame.from_dict({\"baseline\": baseline_result[\"metrics\"], \"student\": student_result[\"metrics\"]})\n", + "metrics = pd.DataFrame.from_dict({\"baseline\": baseline_metrics, \"student\": student_metrics})\n", "metrics[\"improvement\"] = (metrics[\"student\"] - metrics[\"baseline\"]) / metrics[\"baseline\"]\n", "gpt_metric_names = set(filter(lambda e: 'gpt' in e, metrics.index.values))\n", "gpt_mask = metrics.index.isin(gpt_metric_names)\n", @@ -736,7 +761,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "id": "f3ee94f1", "metadata": {}, "outputs": [], diff --git a/lib/evaluators/__init__.py b/lib/evaluators/__init__.py deleted file mode 100644 index 57e06b8..0000000 --- a/lib/evaluators/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- - -from ._bleu import BleuScoreEvaluator -from ._rouge import RougeScoreEvaluator, RougeType - -__all__ = [ - "BleuScoreEvaluator", - "RougeScoreEvaluator", - "RougeType", -] \ No newline at end of file diff --git a/lib/evaluators/_bleu/__init__.py b/lib/evaluators/_bleu/__init__.py deleted file mode 100644 index 0505c2d..0000000 --- a/lib/evaluators/_bleu/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- - -from ._bleu import BleuScoreEvaluator - -__all__ = [ - "BleuScoreEvaluator", -] \ No newline at end of file diff --git a/lib/evaluators/_bleu/_bleu.py b/lib/evaluators/_bleu/_bleu.py deleted file mode 100644 index ec7a8de..0000000 --- a/lib/evaluators/_bleu/_bleu.py +++ /dev/null @@ -1,72 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. 
-# --------------------------------------------------------- -from nltk.translate.bleu_score import SmoothingFunction, sentence_bleu - -from promptflow._utils.async_utils import async_run_allowing_running_loop -from .._common.utils import nltk_tokenize - - -class _AsyncBleuScoreEvaluator: - def __init__(self): - pass - - async def __call__(self, *, answer: str, ground_truth: str, **kwargs): - reference_tokens = nltk_tokenize(ground_truth) - hypothesis_tokens = nltk_tokenize(answer) - - smoothing_function = SmoothingFunction().method4 - score = sentence_bleu([reference_tokens], hypothesis_tokens, smoothing_function=smoothing_function) - - return { - "bleu_score": score, - } - - -class BleuScoreEvaluator: - """ - Evaluator that computes the BLEU Score between two strings. - - BLEU (Bilingual Evaluation Understudy) score is commonly used in natural language processing (NLP) and machine - translation. It is widely used in text summarization and text generation use cases. It evaluates how closely the - generated text matches the reference text. The BLEU score ranges from 0 to 1, with higher scores indicating - better quality. - - **Usage** - - .. code-block:: python - - eval_fn = BleuScoreEvaluator() - result = eval_fn( - answer="Tokyo is the capital of Japan.", - ground_truth="The capital of Japan is Tokyo.") - - **Output format** - - .. code-block:: python - - { - "bleu_score": 0.22 - } - """ - - def __init__(self): - self._async_evaluator = _AsyncBleuScoreEvaluator() - - def __call__(self, *, answer: str, ground_truth: str, **kwargs): - """ - Evaluate the BLEU score between the answer and the ground truth. - - :keyword answer: The answer to be evaluated. - :paramtype answer: str - :keyword ground_truth: The ground truth to be compared against. - :paramtype ground_truth: str - :return: The BLEU score. - :rtype: dict - """ - return async_run_allowing_running_loop( - self._async_evaluator, answer=answer, ground_truth=ground_truth, **kwargs - ) - - def _to_async(self): - return self._async_evaluator \ No newline at end of file diff --git a/lib/evaluators/_common/utils.py b/lib/evaluators/_common/utils.py deleted file mode 100644 index cabf7ac..0000000 --- a/lib/evaluators/_common/utils.py +++ /dev/null @@ -1,32 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- - -from typing import List, cast - -import nltk -import numpy as np - -try: - from nltk.tokenize.nist import NISTTokenizer -except LookupError: - nltk.download("perluniprops") - nltk.download("punkt") - nltk.download("punkt_tab") - from nltk.tokenize.nist import NISTTokenizer - - -def nltk_tokenize(text: str) -> List[str]: - """Tokenize the input text using the NLTK tokenizer.""" - - is_latin_or_numeric = all( - ("\u0020" <= c <= "\u007E") # Basic Latin - or ("\u00A0" <= c <= "\u00FF") # Latin-1 Supplement - or ("0" <= c <= "9") # Digits - for c in text - ) - - if is_latin_or_numeric: - return cast(List[str], nltk.word_tokenize(text)) - - return list(NISTTokenizer().international_tokenize(text)) \ No newline at end of file diff --git a/lib/evaluators/_rouge/__init__.py b/lib/evaluators/_rouge/__init__.py deleted file mode 100644 index ff6658f..0000000 --- a/lib/evaluators/_rouge/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. 
-# --------------------------------------------------------- - -from ._rouge import RougeScoreEvaluator, RougeType - -__all__ = [ - "RougeScoreEvaluator", - "RougeType", -] \ No newline at end of file diff --git a/lib/evaluators/_rouge/_rouge.py b/lib/evaluators/_rouge/_rouge.py deleted file mode 100644 index 3cbd3a8..0000000 --- a/lib/evaluators/_rouge/_rouge.py +++ /dev/null @@ -1,99 +0,0 @@ -# --------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# --------------------------------------------------------- -from enum import Enum - -from rouge_score import rouge_scorer - -from promptflow._utils.async_utils import async_run_allowing_running_loop - - -class RougeType(str, Enum): - """ - Enumeration of ROUGE (Recall-Oriented Understudy for Gisting Evaluation) types. - """ - - ROUGE_1 = "rouge1" - """Overlap of unigrams (single words) between generated and reference text.""" - - ROUGE_2 = "rouge2" - """Overlap of bigrams (two consecutive words) between generated and reference text.""" - - ROUGE_3 = "rouge3" - """Overlap of trigrams (three consecutive words) between generated and reference text.""" - - ROUGE_4 = "rouge4" - """Overlap of four-grams (four consecutive words) between generated and reference text.""" - - ROUGE_5 = "rouge5" - """Overlap of five-grams (five consecutive words) between generated and reference text.""" - - ROUGE_L = "rougeL" - """Overlap of L-grams (L consecutive words) between generated and reference text.""" - - -class _AsyncRougeScoreEvaluator: - def __init__(self, rouge_type: RougeType): - self._rouge_type = rouge_type - - async def __call__(self, *, ground_truth: str, answer: str, **kwargs): - scorer = rouge_scorer.RougeScorer(rouge_types=[self._rouge_type]) - metrics = scorer.score(ground_truth, answer)[self._rouge_type] - return { - "rouge_precision": metrics.precision, - "rouge_recall": metrics.recall, - "rouge_f1_score": metrics.fmeasure, - } - - -class RougeScoreEvaluator: - """ - Evaluator for computes the ROUGE scores between two strings. - - ROUGE (Recall-Oriented Understudy for Gisting Evaluation) is a set of metrics used to evaluate automatic - summarization and machine translation. It measures the overlap between generated text and reference summaries. - ROUGE focuses on recall-oriented measures to assess how well the generated text covers the reference text. Text - summarization and document comparison are among optimal use cases for ROUGE, particularly in scenarios where text - coherence and relevance are critical. - - **Usage** - - .. code-block:: python - - eval_fn = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_1) - result = eval_fn( - answer="Tokyo is the capital of Japan.", - ground_truth="The capital of Japan is Tokyo.") - - **Output format** - - .. code-block:: python - - { - "rouge_precision": 1.0, - "rouge_recall": 1.0, - "rouge_f1_score": 1.0 - } - """ - - def __init__(self, rouge_type: RougeType): - self._rouge_type = rouge_type - self._async_evaluator = _AsyncRougeScoreEvaluator(rouge_type) - - def __call__(self, *, ground_truth: str, answer: str, **kwargs): - """ - Evaluate the ROUGE score between the answer and the ground truth. - - :keyword answer: The answer to be evaluated. - :paramtype answer: str - :keyword ground_truth: The ground truth to be compared against. - :paramtype ground_truth: str - :return: The ROUGE score. 
- :rtype: dict - """ - return async_run_allowing_running_loop( - self._async_evaluator, ground_truth=ground_truth, answer=answer, **kwargs - ) - - def _to_async(self): - return self._async_evaluator \ No newline at end of file diff --git a/output.md b/output.md new file mode 100644 index 0000000..0117102 --- /dev/null +++ b/output.md @@ -0,0 +1,484 @@ +# Evaluation of the student and baseline models with the RAFT generated eval dataset split + +In this notebook, we will use the evaluation dataset synthetically generated in the [](./1_gen.ipynb) notebook using the RAFT method to evaluate both the baseline model and the student model, then compare the two to analyse the impact of the fine-tuning. + +We introduce the `promptflow-evals` package and built-in evaluators. Then, we'll demonstrate how to use the `evaluate` API to assess data using these evaluators. + +Finally, we'll draw a diagram showing the performance of the student model against the baseline. + +## Overview + +- Testing + - Run the baseline model on the evaluation split to get its predictions. + - Run the student model on the evaluation split to get its predictions. +- Answers formatting + - Convert the baseline model answers to a format suitable for testing + - Convert the student model answers to a format suitable for testing +- Evaluation + - Calculate the metrics (such as accuracy, precision, recall, etc.) based on the predictions from the baseline model. + - Calculate the metrics based on the predictions from the student model. +- Compare metrics + +## Overview +![](./doc/raft-process-eval.png) + +## Installing requirements + +The requirements should have been automatically installed if you opened the project in Dev Container or Codespaces, but if not, uncomment the following cell to install the requirements + + +```python +#! pip install openai azure-ai-evaluation azure-identity promptflow-azure +``` + +## Running time and cost + +The RAFT evaluation script usually takes a few minutes on the default sample document but can take days on bigger domains depending on the number and size of documents and the number of questions being generated for each chunk. + +The cost of running this RAFT script on the sample document should be a few dollars. But beware, running it on bigger domains can cost hundreds of dollars if not more. 
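+
+Since these runs can be slow and costly, it can help to fail fast on missing configuration before starting them. This is a minimal, optional sketch; the variable names are the ones this notebook reads in the cells below, so adjust it to your setup:
+
+```python
+import os
+from dotenv import load_dotenv
+
+load_dotenv(".env")
+load_dotenv(".env.state")
+
+# Variables read later in this notebook.
+required = ["DATASET_NAME", "BASELINE_OPENAI_DEPLOYMENT", "BASELINE_MODEL_API",
+            "STUDENT_DEPLOYMENT_NAME", "STUDENT_MODEL_API"]
+missing = [name for name in required if not os.getenv(name)]
+if missing:
+    print(f"Missing environment variables: {missing}")
+
+# Either judge variable is enough; the judge model scores the answers during evaluation.
+if not (os.getenv("JUDGE_OPENAI_BASE_URL") or os.getenv("JUDGE_AZURE_OPENAI_ENDPOINT")):
+    print("No judge endpoint configured (JUDGE_OPENAI_BASE_URL or JUDGE_AZURE_OPENAI_ENDPOINT).")
+```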
+ +## Testing + +### Overview +![](./doc/raft-process-eval-test.png) + + +### Define variables we will need + + +```python +import os +from dotenv import load_dotenv + +# User provided values +load_dotenv(".env") + +# Variables passed by previous notebooks +load_dotenv(".env.state") + +# Let's capture the initial working directory because the evaluate function will change it +dir = os.getcwd() + +experiment_name = os.getenv("DATASET_NAME") +experiment_dir = f"{dir}/dataset/{experiment_name}-files" + +# Dataset generated by the gen notebook that we will evaluate the baseline and student models on +dataset_path_hf_eval = f"{experiment_dir}/{experiment_name}-hf.eval.jsonl" + +# Evaluated answer files +dataset_path_hf_eval_answer = f"{experiment_dir}/{experiment_name}-hf.eval.answer.jsonl" +dataset_path_hf_eval_answer_baseline = f"{experiment_dir}/{experiment_name}-hf.eval.answer.baseline.jsonl" + +# Formatted answer evaluation files +dataset_path_eval_answer_student = f"{experiment_dir}/{experiment_name}-eval.answer.student.jsonl" +dataset_path_eval_answer_baseline = f"{experiment_dir}/{experiment_name}-eval.answer.baseline.jsonl" + +# Scored answer files +dataset_path_eval_answer_score_student = f"{experiment_dir}/{experiment_name}-eval.answer.score.student.jsonl" +dataset_path_eval_answer_score_baseline = f"{experiment_dir}/{experiment_name}-eval.answer.score.baseline.jsonl" + +# Scored answer metrics files +dataset_path_eval_answer_score_metrics_student = f"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.student.jsonl" +dataset_path_eval_answer_score_metrics_baseline = f"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.baseline.jsonl" + +BASELINE_OPENAI_DEPLOYMENT = os.getenv("BASELINE_OPENAI_DEPLOYMENT") +BASELINE_MODEL_API = os.getenv("BASELINE_MODEL_API") + +STUDENT_DEPLOYMENT_NAME = os.getenv("STUDENT_DEPLOYMENT_NAME") +STUDENT_MODEL_API = os.getenv("STUDENT_MODEL_API") + +print(f"Evaluating the student {STUDENT_MODEL_API} model {STUDENT_DEPLOYMENT_NAME} against the baseline {BASELINE_MODEL_API} model {BASELINE_OPENAI_DEPLOYMENT}") +``` + +### Run the baseline model on the evaluation split + + +```python +! [ ! -f $dataset_path_hf_eval_answer_baseline ] && env $(cat .env .env.state) python .gorilla/raft/eval.py \ + --question-file $dataset_path_hf_eval \ + --answer-file $dataset_path_hf_eval_answer_baseline \ + --model $BASELINE_OPENAI_DEPLOYMENT \ + --env-prefix BASELINE \ + --mode $BASELINE_MODEL_API \ + || echo "Baseline answers file already exists, skipping." +``` + +### Format baseline answers + +Convert the baseline model answers to a format suitable for testing + + +```python +! python .gorilla/raft/format.py \ + --input $dataset_path_hf_eval_answer_baseline \ + --input-type jsonl \ + --output $dataset_path_eval_answer_baseline \ + --output-format eval +``` + + +```python +from utils import pretty_print_row +import pandas as pd +pretty_print_row(pd.read_json(dataset_path_eval_answer_baseline, lines=True), 0) +``` + +### Run the student model on the evaluation split + + +```python +! [ ! -f $dataset_path_hf_eval_answer ] && env $(cat .env .env.state) python .gorilla/raft/eval.py \ + --question-file $dataset_path_hf_eval \ + --answer-file $dataset_path_hf_eval_answer \ + --model $STUDENT_DEPLOYMENT_NAME \ + --env-prefix STUDENT \ + --mode $STUDENT_MODEL_API \ + || echo "Student answers file already exists, skipping." 
+``` + + +```python +import pandas as pd +pd.read_json(dataset_path_hf_eval_answer, lines=True).head(2) +``` + +### Format student model answers + +Convert the student model answers to a format suitable for testing + + +```python +! python .gorilla/raft/format.py \ + --input $dataset_path_hf_eval_answer \ + --input-type jsonl \ + --output $dataset_path_eval_answer_student \ + --output-format eval +``` + +### Student model answers + + +```python +from utils import pretty_print_row +import pandas as pd +pretty_print_row(pd.read_json(dataset_path_eval_answer_student, lines=True), 0) +``` + +## Evaluation + +### Overview +![](./doc/raft-process-eval-score.png) + +### Built-in Evaluators + +The table below lists all the built-in evaluators we support. In the following sections, we will select a few of these evaluators to demonstrate how to use them. + +| Category | Namespace | Evaluator Class | Notes | +|----------------|--------------------------------------------------|---------------------------|---------------------------------------------------| +| Quality | promptflow.evals.evaluators | GroundednessEvaluator | Measures how well the answer is entailed by the context and is not hallucinated | +| | | RelevanceEvaluator | How well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. | +| | | CoherenceEvaluator | How well all the sentences fit together and sound naturally as a whole. | +| | | FluencyEvaluator | Quality of individual sentences in the answer, and whether they are well-written and grammatically correct. | +| | | SimilarityEvaluator | Measures the similarity between the predicted answer and the correct answer | +| | | F1ScoreEvaluator | F1 score | +| Content Safety | promptflow.evals.evaluators.content_safety | ViolenceEvaluator | | +| | | SexualEvaluator | | +| | | SelfHarmEvaluator | | +| | | HateUnfairnessEvaluator | | +| Composite | promptflow.evals.evaluators | QAEvaluator | Built on top of individual quality evaluators. | +| | | ChatEvaluator | Similar to QAEvaluator but designed for evaluating chat messages. | +| | | ContentSafetyEvaluator | Built on top of individual content safety evaluators. 
| + + + +#### Quality Evaluator + + +```python +import os +from azure.ai.evaluation import OpenAIModelConfiguration, AzureOpenAIModelConfiguration + +openai_base_url = os.environ.get("JUDGE_OPENAI_BASE_URL") +azure_endpoint = os.environ.get("JUDGE_AZURE_OPENAI_ENDPOINT") + +if openai_base_url: + openai_api_key = os.environ.get("JUDGE_OPENAI_API_KEY") + model = os.environ.get(f"JUDGE_OPENAI_DEPLOYMENT") + + print(f"openai_base_url={openai_base_url}") + print(f"model={model}") + + # Initialize OpenAI Connection + model_config = OpenAIModelConfiguration( + base_url=openai_base_url, + api_key=openai_api_key, + model=model + ) + model_config.api_version = None + +elif azure_endpoint: + azure_deployment = os.environ.get("JUDGE_AZURE_OPENAI_DEPLOYMENT") + api_key = os.environ.get("JUDGE_AZURE_OPENAI_API_KEY") + api_version = os.environ.get("JUDGE_OPENAI_API_VERSION") + + print(f"azure_endpoint={azure_endpoint}") + print(f"azure_deployment={azure_deployment}") + print(f"api_version={api_version}") + + args = { + 'azure_endpoint': azure_endpoint, + 'azure_deployment': azure_deployment, + 'api_version': api_version, + } + if api_key: + args['api_key'] = api_key + + # Initialize Azure OpenAI Connection + model_config = AzureOpenAIModelConfiguration(args) + +else: + print("Couldn't find a judge endpoint environment variable") +``` + + +```python +from azure.ai.evaluation import CoherenceEvaluator, F1ScoreEvaluator, FluencyEvaluator, GroundednessEvaluator, RelevanceEvaluator, SimilarityEvaluator, BleuScoreEvaluator, RougeScoreEvaluator, RougeType + +explanations = { + "groundedness": "Measures how well the answer is entailed by the context and is not hallucinated", + "relevance": "How well the answer addresses the main aspects of the question, based on the context. 
Consider whether all and only the important aspects are contained in the answer when evaluating relevance.", + "coherence": "How well all the sentences fit together and sound naturally as a whole.", + "fluency": "Quality of individual sentences in the answer, and whether they are well-written and grammatically correct.", + "similarity": "Measures the similarity between the predicted answer and the correct answer", + "f1_score": "Measures the overlap between the predicted answer and the correct answer", +} + +# Initializing evaluators +evaluators = { + + # GPT based metrics + "coherence" : CoherenceEvaluator(model_config), + "f1_score" : F1ScoreEvaluator(), + "fluency" : FluencyEvaluator(model_config), + "groundedness" : GroundednessEvaluator(model_config), + "relevance" : RelevanceEvaluator(model_config), + "similarity" : SimilarityEvaluator(model_config), + + # Math metrics + "bleu" : BleuScoreEvaluator(), + "rouge_1" : RougeScoreEvaluator(RougeType.ROUGE_1), + "rouge_2" : RougeScoreEvaluator(RougeType.ROUGE_2), + +# "qa" : QAEvaluator(model_config), +# "chat" : ChatEvaluator(model_config), + +# "violence" : ViolenceEvaluator(model_config), +# "sexual" : SexualEvaluator(model_config), +# "self_harm" : SelfHarmEvaluator(model_config), +# "hate_unfairness" : HateUnfairnessEvaluator(model_config), + +# "content_safety" : ContentSafetyEvaluator(model_config), +# "content_safety_chat" : ContentSafetyChatEvaluator(model_config), +} +``` + +### Run metrics on a student model answer + + +```python +df = pd.read_json(dataset_path_eval_answer_student, lines=True) +pretty_print_row(df, 1) +``` + + +```python +sample = df.iloc[1] + +# Running similarity Evaluator on single input row +similarity_score = evaluators["similarity"]( + query=sample["question"], + response=sample["final_answer"], + context=sample["context"], + ground_truth=sample["gold_final_answer"], +) +print(similarity_score) +``` + +### Using the Evaluate API to calculate the metrics in bulk + +In previous sections, we walked you through how to use built-in evaluators to evaluate a single row and how to define your own custom evaluators. Now, we will show you how to use these evaluators with the powerful `evaluate` API to assess an entire dataset. + +### Running the metrics + +Now, we will invoke the `evaluate` API using a few evaluators that we already initialized + +Additionally, we have a column mapping to map the `truth` column from the dataset to `ground_truth`, which is accepted by the evaluator. 
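+
+Note that with `azure-ai-evaluation` the per-evaluator configuration nests these mappings under a `column_mapping` key, and the single-row inputs are named `query` and `response`; with `promptflow-evals` the mappings sat directly under the evaluator entry and used `question` and `answer`. The cell below uses the new layout, mapping the dataset's `gold_final_answer` column to the `ground_truth` input expected by the evaluators. For comparison, an illustrative sketch of the two shapes (shown only to highlight the migration, not meant to be passed to the old SDK):
+
+```python
+# promptflow-evals (old, flat) -- for comparison only
+old_config = {"default": {"question": "${data.question}", "answer": "${data.final_answer}"}}
+
+# azure-ai-evaluation (new, nested under "column_mapping")
+new_config = {"default": {"column_mapping": {"query": "${data.question}", "response": "${data.final_answer}"}}}
+```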
+ + +```python +from azure.ai.evaluation import evaluate + +def score_dataset(dataset, rows_output_path=None, metrics_output_path=None): + result = evaluate( + data=dataset, + evaluators=evaluators, + # column mapping + evaluator_config={ + "default": { + "column_mapping": { + "query": "${data.question}", + "response": "${data.final_answer}", + "ground_truth": "${data.gold_final_answer}", + "context": "${data.context}", + } + } + }, + ) + + if rows_output_path: + pd.DataFrame.from_dict(result["rows"]).to_json(rows_output_path, orient="records", lines=True) + + #if metrics_output_path: + # pd.DataFrame.from_dict(result["metrics"]).to_json(metrics_output_path, orient="records", lines=True) + + return result +``` + +#### Baseline model evaluation metrics + + +```python +pd.read_json(dataset_path_eval_answer_baseline, lines=True).head(2) +``` + + +```python +baseline_result = score_dataset(dataset_path_eval_answer_baseline, dataset_path_eval_answer_score_baseline, dataset_path_eval_answer_score_metrics_baseline) +from IPython.display import display, JSON + +display(JSON(baseline_result["metrics"])) +``` + + +```python +# Check the results using Azure AI Studio UI +studio_url = baseline_result["studio_url"] or "http://127.0.0.1:23333" +print(f"Results available at {studio_url}") +``` + +#### Student model evaluation metrics + + +```python +pd.read_json(dataset_path_eval_answer_student, lines=True).head(2) +``` + + +```python +student_result = score_dataset(dataset_path_eval_answer_student, dataset_path_eval_answer_score_student, dataset_path_eval_answer_score_metrics_student) +from IPython.display import display, JSON + +display(JSON(student_result["metrics"])) +``` + + +Finally, let's check the results produced by the evaluate API. + + +```python +# Check the results using Azure AI Studio UI +studio_url = student_result["studio_url"] or "http://127.0.0.1:23333" +print(f"Results available at {studio_url}") +``` + +## Let's look at examples + + +```python +df_baseline=pd.read_json(dataset_path_eval_answer_score_baseline, lines=True) +df_student=pd.read_json(dataset_path_eval_answer_score_student, lines=True) +df_merged=pd.merge(df_baseline, df_student, on="inputs.question", suffixes=('_baseline', '_student')) +df_merged.insert(0, "id", df_merged.index) +df_merged.head(2) +``` + +## Compare the metrics of the student model against the baseline + + +```python +metrics = pd.DataFrame.from_dict({"baseline": baseline_result["metrics"], "student": student_result["metrics"]}) +metrics["improvement"] = (metrics["student"] - metrics["baseline"]) / metrics["baseline"] +gpt_metric_names = set(filter(lambda e: 'gpt' in e, metrics.index.values)) +gpt_mask = metrics.index.isin(gpt_metric_names) +metrics_gpt = metrics[gpt_mask] # between 1 and 5 +metrics_math = metrics[~gpt_mask] # between 0 and 1 +``` + + +```python +metrics_math +``` + + +```python +import matplotlib.pyplot as plt + +#define subplot layout +fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(7, 7)) +axes[0].set_title('Math metrics') +metrics_math.drop("improvement", axis=1).plot.barh(rot=0, colormap='Dark2', ax=axes[0]) +axes[1].set_title('GPT metrics') +metrics_gpt.drop("improvement", axis=1).plot.barh(rot=0, colormap='Dark2', ax=axes[1]) + +``` + + +```python +import matplotlib.pyplot as plt + +#define subplot layout +fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(7, 7)) +axes[0].set_title('Math metrics') +metrics_math["improvement"].plot.barh(rot=0, ax=axes[0], color=(metrics_math["improvement"] > 0).map({True: 'g', False: 'r'})) 
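+# Same color convention as above: green where the student improves on the baseline, red where it regresses.
+# GPT metrics are plotted separately because they sit on a 1-5 scale rather than 0-1.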
+axes[1].set_title('GPT metrics') +metrics_gpt["improvement"].plot.barh(rot=0, ax=axes[1], color=(metrics_gpt["improvement"] > 0).map({True: 'g', False: 'r'})) + +``` + +## Let's look at outliers + +### Compute improvement for each sample and metric + + +```python +suffixes=[column.replace("_baseline", "").replace("outputs.", "") for column in df_merged.columns if column.startswith("outputs.") and column.endswith("_baseline")] +``` + + +```python +df_improvements = df_merged.copy() +for suffixe in suffixes: + df_improvements[f"improvement_outputs.{suffixe}"] = (df_improvements[f"outputs.{suffixe}_student"] - df_improvements[f"outputs.{suffixe}_baseline"]) / df_improvements[f"outputs.{suffixe}_baseline"] +df_improvements.head() +``` + +### Find samples for the worst GPT Fluency + + +```python +sort_columns=['improvement_outputs.fluency.gpt_fluency'] +display_columns=["id", "inputs.question", "inputs.final_answer_baseline", "inputs.final_answer_student", "improvement_outputs.fluency.gpt_fluency", "inputs.gold_final_answer_student"] +df_improvements.sort_values(by=sort_columns, ascending=True)[display_columns].head(3) +``` + +### Find samples for the best GPT Fluency + + +```python +df_improvements.sort_values(by=sort_columns, ascending=False)[display_columns].head(3) +``` + + +```python + +``` diff --git a/utils.py b/utils.py index 43590d6..9253789 100644 --- a/utils.py +++ b/utils.py @@ -60,8 +60,7 @@ def file_sha256(filename): with open(filename, "rb", buffering=0) as f: return hashlib.file_digest(f, "sha256").hexdigest() -def pretty_print_row(df, idx): - from IPython.display import display, Markdown +def row_to_markdown(df, idx): sample = df.iloc[idx] md = "" for name in df.columns.values: @@ -69,4 +68,8 @@ def pretty_print_row(df, idx): value = value.replace("", "``").replace("", "``") value = value.replace("", "``").replace("##begin_quote##", "`##begin_quote##`").replace("##end_quote##", "`##end_quote##`") md += "### " + name + "\n" + value + "\n" - display(Markdown(md)) + return md + +def pretty_print_row(df, idx): + from IPython.display import display, Markdown + display(Markdown(row_to_markdown(df, idx)))