Migration to azure-ai-evaluation package (#43)
* Convert code to the azure-ai-evaluation SDK
* Update column_mapping to the new nested format (sketched below)
* Update metrics table with new namespace and metrics
* Use new Eval SDK metrics instead of local math metrics
* Write intermediate JSON metrics file
cedricvidal authored Nov 4, 2024
1 parent 33ee9d1 commit abe2e86
Showing 9 changed files with 573 additions and 295 deletions.
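A note on the call shape before the diff: with azure-ai-evaluation, the evaluate() column mapping is nested under a per-evaluator "column_mapping" key, and the quality evaluators take query/response instead of question/answer. A minimal sketch of the migrated call, assuming illustrative file paths, a single relevance evaluator, and placeholder endpoint values (not the exact evaluator set used in this notebook):

# Sketch only: endpoint, deployment, and data path below are placeholders.
from azure.ai.evaluation import AzureOpenAIModelConfiguration, RelevanceEvaluator, evaluate

model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com",  # placeholder
    azure_deployment="<judge-deployment>",                      # placeholder
    api_version="2024-06-01",                                   # placeholder
    api_key="<api-key>",                                        # optional when using Entra ID auth
)

result = evaluate(
    data="experiment-eval.answer.student.jsonl",  # illustrative path
    evaluators={"relevance": RelevanceEvaluator(model_config)},
    evaluator_config={
        "default": {
            # The mapping is now nested under "column_mapping" and keyed by query/response.
            "column_mapping": {
                "query": "${data.question}",
                "response": "${data.final_answer}",
                "ground_truth": "${data.gold_final_answer}",
                "context": "${data.context}",
            }
        }
    },
)
print(result["metrics"])  # flat dict of aggregate metrics; this commit dumps it to a JSON file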
141 changes: 83 additions & 58 deletions 4_eval.ipynb
@@ -48,12 +48,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"id": "31e99e68",
"metadata": {},
"outputs": [],
"source": [
"#! pip install promptflow-evals"
"#! pip install openai azure-ai-evaluation azure-identity promptflow-azure"
]
},
{
@@ -96,7 +96,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"id": "31004ebd",
"metadata": {},
"outputs": [],
@@ -132,8 +132,8 @@
"dataset_path_eval_answer_score_baseline = f\"{experiment_dir}/{experiment_name}-eval.answer.score.baseline.jsonl\"\n",
"\n",
"# Scored answer metrics files\n",
"dataset_path_eval_answer_score_metrics_student = f\"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.student.jsonl\"\n",
"dataset_path_eval_answer_score_metrics_baseline = f\"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.baseline.jsonl\"\n",
"dataset_path_eval_answer_score_metrics_student = f\"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.student.json\"\n",
"dataset_path_eval_answer_score_metrics_baseline = f\"{experiment_dir}/{experiment_name}-eval.answer.score.metrics.baseline.json\"\n",
"\n",
"BASELINE_OPENAI_DEPLOYMENT = os.getenv(\"BASELINE_OPENAI_DEPLOYMENT\")\n",
"BASELINE_MODEL_API = os.getenv(\"BASELINE_MODEL_API\")\n",
@@ -154,7 +154,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"id": "83e5f47e",
"metadata": {},
"outputs": [],
@@ -180,7 +180,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"id": "6bdda3d9",
"metadata": {},
"outputs": [],
@@ -194,7 +194,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"id": "fed06cae",
"metadata": {},
"outputs": [],
@@ -214,7 +214,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"id": "85194f3c",
"metadata": {},
"outputs": [],
@@ -230,7 +230,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"id": "76b0827d",
"metadata": {},
"outputs": [],
@@ -251,7 +251,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"id": "8b4a21af",
"metadata": {},
"outputs": [],
@@ -273,7 +273,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 9,
"id": "ff092e37",
"metadata": {},
"outputs": [],
@@ -309,22 +309,23 @@
"\n",
"The table below lists all the built-in evaluators we support. In the following sections, we will select a few of these evaluators to demonstrate how to use them.\n",
"\n",
"| Category | Namespace | Evaluator Class | Notes |\n",
"|----------------|--------------------------------------------------|---------------------------|---------------------------------------------------|\n",
"| Quality | promptflow.evals.evaluators | GroundednessEvaluator | Measures how well the answer is entailed by the context and is not hallucinated |\n",
"| | | RelevanceEvaluator | How well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. |\n",
"| | | CoherenceEvaluator | How well all the sentences fit together and sound naturally as a whole. |\n",
"| | | FluencyEvaluator | Quality of individual sentences in the answer, and whether they are well-written and grammatically correct. |\n",
"| | | SimilarityEvaluator | Measures the similarity between the predicted answer and the correct answer |\n",
"| | | F1ScoreEvaluator | F1 score |\n",
"| Content Safety | promptflow.evals.evaluators.content_safety | ViolenceEvaluator | |\n",
"| | | SexualEvaluator | |\n",
"| | | SelfHarmEvaluator | |\n",
"| | | HateUnfairnessEvaluator | |\n",
"| Composite | promptflow.evals.evaluators | QAEvaluator | Built on top of individual quality evaluators. |\n",
"| | | ChatEvaluator | Similar to QAEvaluator but designed for evaluating chat messages. |\n",
"| | | ContentSafetyEvaluator | Built on top of individual content safety evaluators. |\n",
"\n"
"| Category | Evaluator Class | Notes |\n",
"|----------------|---------------------------|---------------------------------------------------|\n",
"| Quality | GroundednessEvaluator | Measures how well the answer is entailed by the context and is not hallucinated |\n",
"| | RelevanceEvaluator | How well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. |\n",
"| | CoherenceEvaluator | How well all the sentences fit together and sound naturally as a whole. |\n",
"| | FluencyEvaluator | Quality of individual sentences in the answer, and whether they are well-written and grammatically correct. |\n",
"| | SimilarityEvaluator | Measures the similarity between the predicted answer and the correct answer |\n",
"| Content Safety | ViolenceEvaluator | |\n",
"| | SexualEvaluator | |\n",
"| | SelfHarmEvaluator | |\n",
"| | HateUnfairnessEvaluator | |\n",
"| Composite | QAEvaluator | Built on top of individual quality evaluators. |\n",
"| | ChatEvaluator | Similar to QAEvaluator but designed for evaluating chat messages. |\n",
"| | ContentSafetyEvaluator | Built on top of individual content safety evaluators. |\n",
"| Math | BleuScoreEvaluator | BLEU Score |\n",
"| | RougeScoreEvaluator | ROUGE Score |\n",
"| | F1ScoreEvaluator | F1 score |\n"
]
},
{
@@ -337,13 +338,13 @@
},
{
"cell_type": "code",
"execution_count": null,
"id": "a219acb2",
"execution_count": 10,
"id": "1a3fa9eb",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from promptflow.core import OpenAIModelConfiguration, AzureOpenAIModelConfiguration\n",
"from azure.ai.evaluation import OpenAIModelConfiguration, AzureOpenAIModelConfiguration\n",
"\n",
"openai_base_url = os.environ.get(\"JUDGE_OPENAI_BASE_URL\")\n",
"azure_endpoint = os.environ.get(\"JUDGE_AZURE_OPENAI_ENDPOINT\")\n",
@@ -372,24 +373,29 @@
" print(f\"azure_deployment={azure_deployment}\")\n",
" print(f\"api_version={api_version}\")\n",
"\n",
" args = {\n",
" 'azure_endpoint': azure_endpoint,\n",
" 'azure_deployment': azure_deployment,\n",
" 'api_version': api_version,\n",
" }\n",
" if api_key:\n",
" args['api_key'] = api_key\n",
"\n",
" # Initialize Azure OpenAI Connection\n",
" model_config = AzureOpenAIModelConfiguration(\n",
" azure_endpoint=azure_endpoint,\n",
" azure_deployment=azure_deployment,\n",
" api_version=api_version,\n",
" api_key=api_key\n",
" )"
" model_config = AzureOpenAIModelConfiguration(args)\n",
"\n",
"else:\n",
" print(\"Couldn't find a judge endpoint environment variable\")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 11,
"id": "8965ed9d",
"metadata": {},
"outputs": [],
"source": [
"from promptflow.evals.evaluators import *\n",
"from lib.evaluators import *\n",
"from azure.ai.evaluation import CoherenceEvaluator, F1ScoreEvaluator, FluencyEvaluator, GroundednessEvaluator, RelevanceEvaluator, SimilarityEvaluator, BleuScoreEvaluator, RougeScoreEvaluator, RougeType\n",
"\n",
"explanations = {\n",
" \"groundedness\": \"Measures how well the answer is entailed by the context and is not hallucinated\",\n",
@@ -439,7 +445,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"id": "6247d8cb",
"metadata": {},
"outputs": [],
@@ -450,7 +456,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 13,
"id": "43d90565",
"metadata": {},
"outputs": [],
@@ -459,8 +465,8 @@
"\n",
"# Running similarity Evaluator on single input row\n",
"similarity_score = evaluators[\"similarity\"](\n",
" question=sample[\"question\"],\n",
" answer=sample[\"final_answer\"],\n",
" query=sample[\"question\"],\n",
" response=sample[\"final_answer\"],\n",
" context=sample[\"context\"],\n",
" ground_truth=sample[\"gold_final_answer\"],\n",
")\n",
@@ -508,7 +514,7 @@
"metadata": {},
"outputs": [],
"source": [
"from promptflow.evals.evaluate import evaluate\n",
"from azure.ai.evaluation import evaluate\n",
"\n",
"def score_dataset(dataset, rows_output_path=None, metrics_output_path=None):\n",
" result = evaluate(\n",
@@ -517,20 +523,23 @@
" # column mapping\n",
" evaluator_config={\n",
" \"default\": {\n",
" \"question\": \"${data.question}\",\n",
" \"answer\": \"${data.final_answer}\",\n",
" \"ground_truth\": \"${data.gold_final_answer}\",\n",
" \"context\": \"${data.context}\",\n",
" \"column_mapping\": {\n",
" \"query\": \"${data.question}\",\n",
" \"response\": \"${data.final_answer}\",\n",
" \"ground_truth\": \"${data.gold_final_answer}\",\n",
" \"context\": \"${data.context}\",\n",
" }\n",
" }\n",
" },\n",
" )\n",
"\n",
" if rows_output_path:\n",
" pd.DataFrame.from_dict(result[\"rows\"]).to_json(rows_output_path, orient=\"records\", lines=True)\n",
"\n",
" #if metrics_output_path:\n",
" # pd.DataFrame.from_dict(result[\"metrics\"]).to_json(metrics_output_path, orient=\"records\", lines=True)\n",
"\n",
" if metrics_output_path:\n",
" import json\n",
" with open(metrics_output_path, \"w\") as f:\n",
" json.dump(result['metrics'], f)\n",
" return result"
]
},
@@ -544,7 +553,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"id": "b4059934",
"metadata": {},
"outputs": [],
@@ -567,7 +576,7 @@
},
{
"cell_type": "code",
"execution_count": 17,
"execution_count": null,
"id": "0dd3c12d",
"metadata": {},
"outputs": [],
@@ -619,7 +628,7 @@
},
{
"cell_type": "code",
"execution_count": 21,
"execution_count": null,
"id": "5cb28f9a-1f9c-4c2e-8b32-e7da51585f92",
"metadata": {},
"outputs": [],
@@ -661,12 +670,28 @@
},
{
"cell_type": "code",
"execution_count": 23,
"execution_count": null,
"id": "c42af516-f4b7-4f25-b15b-791d0a9c93b6",
"metadata": {},
"outputs": [],
"source": [
"def read_json(path):\n",
" import json\n",
" with open(path, \"r\") as f:\n",
" return json.load(f)\n",
"\n",
"student_metrics = read_json(dataset_path_eval_answer_score_metrics_student)\n",
"baseline_metrics = read_json(dataset_path_eval_answer_score_metrics_baseline)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "325762f2",
"metadata": {},
"outputs": [],
"source": [
"metrics = pd.DataFrame.from_dict({\"baseline\": baseline_result[\"metrics\"], \"student\": student_result[\"metrics\"]})\n",
"metrics = pd.DataFrame.from_dict({\"baseline\": baseline_metrics, \"student\": student_metrics})\n",
"metrics[\"improvement\"] = (metrics[\"student\"] - metrics[\"baseline\"]) / metrics[\"baseline\"]\n",
"gpt_metric_names = set(filter(lambda e: 'gpt' in e, metrics.index.values))\n",
"gpt_mask = metrics.index.isin(gpt_metric_names)\n",
Expand Down Expand Up @@ -736,7 +761,7 @@
},
{
"cell_type": "code",
"execution_count": 27,
"execution_count": null,
"id": "f3ee94f1",
"metadata": {},
"outputs": [],
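Beyond the changes shown above, the per-row call signature also changes: individual evaluators now take query/response (plus ground_truth) rather than question/answer. A short sketch under that assumption, using an illustrative sample row and the judge model_config from the sketch near the top:

# Sketch: single-row scoring with the renamed keyword arguments.
from azure.ai.evaluation import SimilarityEvaluator, F1ScoreEvaluator

sample = {  # illustrative row shaped like the eval dataset
    "question": "What is the capital of France?",
    "final_answer": "Paris is the capital of France.",
    "gold_final_answer": "Paris",
}

similarity = SimilarityEvaluator(model_config)  # model_config: judge model configuration, as above
similarity_score = similarity(
    query=sample["question"],
    response=sample["final_answer"],
    ground_truth=sample["gold_final_answer"],
)

f1 = F1ScoreEvaluator()  # no judge model needed
f1_score = f1(
    response=sample["final_answer"],
    ground_truth=sample["gold_final_answer"],
)
print(similarity_score, f1_score)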
12 changes: 0 additions & 12 deletions lib/evaluators/__init__.py

This file was deleted.

9 changes: 0 additions & 9 deletions lib/evaluators/_bleu/__init__.py

This file was deleted.
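The deleted lib/evaluators package held the local BLEU/ROUGE math metrics; per the updated table and imports in the notebook diff, those now come from the SDK. A minimal sketch, assuming the SDK's reference-based call signature and illustrative strings:

# Sketch: SDK-provided math metrics replacing the deleted local implementations.
from azure.ai.evaluation import BleuScoreEvaluator, RougeScoreEvaluator, RougeType

bleu = BleuScoreEvaluator()
rouge = RougeScoreEvaluator(rouge_type=RougeType.ROUGE_L)

response = "Paris is the capital of France."      # illustrative candidate answer
ground_truth = "The capital of France is Paris."  # illustrative reference

print(bleu(response=response, ground_truth=ground_truth))
print(rouge(response=response, ground_truth=ground_truth))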

