Update evals pipeline to accommodate official multilingual eval set (#12)

* Specify python3 in Makefile

* Copy over the rewardbench CLI to repo

* Fix how custom preference dataset is loaded

* Update rewardbench script to include subsets

* Update experiment script

* Run isort on scripts

* Fix run_generative to use new dataset

* Update experiment script

* Fix API for loading the dataset

* Fix import name

* Fix incorrect args name

* Keep ID in the columns

* Fix minor changes from previous preferences

* Add Aya models for the API

* Update README

* Comprehensive saving of results

* Use dashes as separators
ljvmiranda921 authored Jul 31, 2024
1 parent 8555da7 commit a6638f4
Showing 8 changed files with 699 additions and 122 deletions.
Makefile: 6 changes (3 additions, 3 deletions)
@@ -6,8 +6,8 @@ export PYTHONPATH = src
check_dirs := scripts tests

style:
-	python -m black --target-version py310 --line-length 119 $(check_dirs)
-	python -m isort $(check_dirs) --profile black -m 9
+	python3 -m black --target-version py310 --line-length 119 $(check_dirs)
+	python3 -m isort $(check_dirs) --profile black -m 9

quality:
-	python -m flake8 --max-line-length 119 $(check_dirs)
+	python3 -m flake8 --max-line-length 119 $(check_dirs)
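With this change the formatting and lint targets are still driven through `make`; only the interpreter invocation differs. A minimal usage sketch, assuming GNU Make plus `black`, `isort`, and `flake8` are installed in the active environment:

```sh
# Format scripts/ and tests/, then run the lint check
make style
make quality
```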
README.md: 16 changes (10 additions, 6 deletions)
@@ -48,16 +48,17 @@ You can also translate a specifc preference dataset from huggingface to a specif

### Getting rewards from a Reward Model (RM) on a HuggingFace dataset

-Here, we use the `rewardbench` command-line interface and pass a HuggingFace dataset.
+Here, we use the `scripts/run_rewardbench.py` command-line interface and pass a HuggingFace dataset.
This is useful if the reward model is trained as a Custom classifier (🛠️), Sequence classifier (🔢), or via DPO (🎯).
For example, if we want to get the reward score of the UltraRM-13b reward model on a preference dataset, we run:

```sh
-rewardbench \
+python -m scripts.run_rewardbench \
    --model openbmb/UltraRM-13b \
    --chat_template openbmb \
    --dataset $DATASET \
-    --split $SPLIT \
+    --lang_code $LANG_CODE \
+    --split "filtered" \
    --output_dir $OUTDIR \
    --batch_size 8 \
    --trust_remote_code \
@@ -91,10 +92,11 @@ Say we want to obtain the preferences of `gpt-4-2024-04-09`:

```sh
export OPENAI_API_KEY=<your openai token>
-python -m scripts/run_generative.py \
+python -m scripts.run_generative \
    --dataset_name $DATASET \
-    --split $SPLIT \
    --model gpt-4-turbo-2024-04-09 \
+    --split "filtered" \
+    --lang_code $LANG_CODE \
    --output_dir $OUTDIR
```

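In the commands above, `$DATASET`, `$LANG_CODE`, and `$OUTDIR` are plain shell variables. A hypothetical setup (the dataset ID is the default used by the experiment scripts in this commit; the language code is only an example):

```sh
# Illustrative values; pick the subset you actually want to score
export DATASET="aya-rm-multilingual/multilingual-reward-bench"
export LANG_CODE="deu_Latn"   # FLORES-200 code for German
export OUTDIR="output/"
```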
@@ -105,7 +107,8 @@ Here's an example using `meta-llama/Meta-Llama-3-70B-Instruct`:
```sh
python -m scripts/run_generative.py \
    --dataset_name $DATASET \
-    --split $SPLIT \
+    --lang_code $LANG_CODE \
+    --split "filtered" \
    --model "meta-llama/Meta-Llama-3-70B-Instruct" \
    --num_gpus 4 \
    --output_dir $OUTDIR
Expand All @@ -117,6 +120,7 @@ The first value should be the language a prompt was written in, and the second v
```diff
python -m scripts/run_generative.py \
--dataset_name $DATASET \
--lang_code deu_Latn \
--split $SPLIT \
--model "meta-llama/Meta-Llama-3-70B-Instruct" \
--num_gpus 4 \
experiments/run_llm_evals.sh: 84 changes (76 additions, 8 deletions)
@@ -1,8 +1,76 @@
python3 -m scripts/run_generative.py \
--dataset_name ljvmiranda921/ultrafeedback-multilingual-dpo-test \
--model gpt-4-turbo-2024-04-09 \
--split test
python3 -m scripts/run_generative.py \
--dataset_name ljvmiranda921/ultrafeedback-english-dpo-test \
--model gpt-4-turbo-2024-04-09 \
--split test
#!/bin/bash

# Function to display usage information
usage() {
echo "Usage: $0 [MODEL] [DATASET] [OUTDIR]"
echo " MODEL - The model to evaluate (required)"
echo " DATASET - The dataset to use (optional, default is 'aya-rm-multilingual/multilingual-reward-bench')"
echo " OUTDIR - The output directory (optional, default is 'output/')"
exit 1
}

# Default values for arguments
MODEL=""
DATASET="aya-rm-multilingual/multilingual-reward-bench"
OUTDIR="output/"

# Check and assign arguments if provided
if [ $# -gt 3 ]; then
echo "Error: Too many arguments."
usage
elif [ $# -ge 1 ]; then
MODEL=$1
fi

if [ $# -ge 2 ]; then
DATASET=$2
fi

if [ $# -ge 3 ]; then
OUTDIR=$3
fi

# Ensure the model is provided
if [ -z "$MODEL" ]; then
echo "Error: MODEL is required."
usage
fi

# Define the languages and their FLORES-200 codes
declare -A languages=(
["arb_Arab"]="Arabic"
["zho_Hans"]="Chinese_Simplified"
["zho_Hant"]="Chinese_Traditional"
["ces_Latn"]="Czech"
["nld_Latn"]="Dutch"
["fra_Latn"]="French"
["deu_Latn"]="German"
["ell_Grek"]="Greek"
["heb_Hebr"]="Hebrew"
["hin_Deva"]="Hindi"
["ind_Latn"]="Indonesian"
["ita_Latn"]="Italian"
["jpn_Jpan"]="Japanese"
["kor_Hang"]="Korean"
["pes_Arab"]="Persian"
["pol_Latn"]="Polish"
["por_Latn"]="Portuguese"
["ron_Latn"]="Romanian"
["rus_Cyrl"]="Russian"
["spa_Latn"]="Spanish"
["tur_Latn"]="Turkish"
["ukr_Cyrl"]="Ukrainian"
["vie_Latn"]="Vietnamese"
)

for lang_code in "${!languages[@]}"; do
python3 scripts/run_generative.py \
--model "$MODEL" \
--dataset "$DATASET" \
--lang_code "$lang_code" \
--split "filtered" \
--output_dir "$OUTDIR" \
--include_languages "${languages[$lang_code]}" "English" \
--trust_remote_code \
--save_all
done
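For reference, a typical invocation of the rewritten script might look like the following; the judge model name is illustrative, the dataset and output directory fall back to the defaults above if omitted, and the relevant API key is assumed to be exported. Note that the associative array requires Bash 4 or newer.

```sh
# Judge every language subset in the FLORES-200 mapping above with one API model
bash experiments/run_llm_evals.sh gpt-4-turbo-2024-04-09 aya-rm-multilingual/multilingual-reward-bench output/
```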
experiments/run_rm_evals.sh: 152 changes (61 additions, 91 deletions)
@@ -6,121 +6,91 @@ export NCCL_P2P_DISABLE=1

# Function to display usage information
usage() {
echo "Usage: $0 [DATASET] [SPLIT] [OUTDIR]"
echo " DATASET - The dataset to use (optional, default is 'ljvmiranda921/multilingual-ultrafeedback-dpi-v0.1-test')"
echo " SPLIT - The data split to use (optional, default is 'test')"
echo "Usage: $0 [MODEL] [DATASET] [OUTDIR]"
echo " MODEL - The model to evaluate (required)"
echo " DATASET - The dataset to use (optional, default is 'aya-rm-multilingual/multilingual-reward-bench')"
echo " OUTDIR - The output directory (optional, default is 'output/')"
echo " CHAT_TEMPLATE - The chat template to use (optional, default is 'raw')"
echo " BATCH_SIZE - The batch size to use (optional, default is 8)"
exit 1
}

# Default values for arguments
DATASET="ljvmiranda921/ultrafeedback-multilingual-dpo-test"
SPLIT="test"
MODEL=""
DATASET="aya-rm-multilingual/multilingual-reward-bench"
OUTDIR="output/"
CHAT_TEMPLATE="raw"
BATCH_SIZE=8

# Check and assign arguments if provided
if [ $# -gt 3 ]; then
if [ $# -gt 5 ]; then
echo "Error: Too many arguments."
usage
elif [ $# -ge 1 ]; then
DATASET=$1
MODEL=$1
fi

if [ $# -ge 2 ]; then
SPLIT=$2
DATASET=$2
fi

if [ $# -ge 3 ]; then
OUTDIR=$3
fi

rewardbench \
--model openbmb/UltraRM-13b \
--chat_template openbmb \
--dataset $DATASET \
--split $SPLIT \
--output_dir $OUTDIR \
--batch_size 8 \
--trust_remote_code \
--force_truncation \
--save_all

rewardbench \
--model OpenAssistant/oasst-rm-2.1-pythia-1.4b-epoch-2.5 \
--chat_template oasst_pythia \
--dataset $DATASET \
--split $SPLIT \
--output_dir $OUTDIR \
--batch_size 8 \
--trust_remote_code \
--force_truncation \
--save_all

rewardbench \
--model OpenAssistant/oasst-rm-2-pythia-6.9b-epoch-1 \
--chat_template oasst_pythia \
--dataset $DATASET \
--split $SPLIT \
--output_dir $OUTDIR \
--batch_size 16 \
--trust_remote_code \
--force_truncation \
--save_all
if [ $# -ge 4 ]; then
CHAT_TEMPLATE=$4
fi

rewardbench \
--model OpenAssistant/reward-model-deberta-v3-large-v2 \
--chat_template raw \
--dataset $DATASET \
--split $SPLIT \
--output_dir $OUTDIR \
--batch_size 64 \
--trust_remote_code \
--force_truncation \
--save_all
if [ $# -ge 5 ]; then
BATCH_SIZE=$5
fi

rewardbench \
--model berkeley-nest/Starling-RM-7B-alpha \
--tokenizer meta-llama/Llama-2-7b-chat-hf \
--chat_template llama-2 \
--dataset $DATASET \
--split $SPLIT \
--output_dir $OUTDIR \
--batch_size 16 \
--trust_remote_code \
--force_truncation \
--save_all
# Ensure the model is provided
if [ -z "$MODEL" ]; then
echo "Error: MODEL is required."
usage
fi

rewardbench \
--model sfairXC/FsfairX-LLaMA3-RM-v0.1 \
--tokenizer sfairXC/FsfairX-LLaMA3-RM-v0.1 \
--dataset $DATASET \
--split $SPLIT \
--output_dir $OUTDIR \
--batch_size 4 \
--trust_remote_code \
--force_truncation \
--save_all

rewardbench \
--model openbmb/Eurus-RM-7b \
--tokenizer openbmb/Eurus-RM-7b \
--chat_template mistral \
--dataset $DATASET \
--split $SPLIT \
--output_dir $OUTDIR \
--batch_size 16 \
--trust_remote_code \
--force_truncation \
--save_all
# Define the languages and their FLORES-200 codes
declare -A languages=(
["arb_Arab"]="Arabic"
["zho_Hans"]="Chinese_Simplified"
["zho_Hant"]="Chinese_Traditional"
["ces_Latn"]="Czech"
["nld_Latn"]="Dutch"
["fra_Latn"]="French"
["deu_Latn"]="German"
["ell_Grek"]="Greek"
["heb_Hebr"]="Hebrew"
["hin_Deva"]="Hindi"
["ind_Latn"]="Indonesian"
["ita_Latn"]="Italian"
["jpn_Jpan"]="Japanese"
["kor_Hang"]="Korean"
["pes_Arab"]="Persian"
["pol_Latn"]="Polish"
["por_Latn"]="Portuguese"
["ron_Latn"]="Romanian"
["rus_Cyrl"]="Russian"
["spa_Latn"]="Spanish"
["tur_Latn"]="Turkish"
["ukr_Cyrl"]="Ukrainian"
["vie_Latn"]="Vietnamese"
)

rewardbench \
--model allenai/tulu-v2.5-13b-preference-mix-rm \
--tokenizer allenai/tulu-v2.5-13b-preference-mix-rm \
--chat_template mistral \
--dataset $DATASET \
--split $SPLIT \
--output_dir $OUTDIR \
--batch_size 4 \
# Loop through each language and run the command
for lang_code in "${!languages[@]}"; do
python3 scripts/run_rewardbench.py \
--model "$MODEL" \
--chat_template "$CHAT_TEMPLATE" \
--dataset "$DATASET" \
--lang_code "$lang_code" \
--split "filtered" \
--output_dir "$OUTDIR" \
--batch_size "$BATCH_SIZE" \
--trust_remote_code \
--force_truncation \
--save_all
--save_all
done
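Analogously, the classifier-style script now takes the model as its first positional argument, with the chat template and batch size as optional trailing arguments. A hypothetical run, reusing the UltraRM-13b settings from the README example:

```sh
# Arguments: MODEL [DATASET] [OUTDIR] [CHAT_TEMPLATE] [BATCH_SIZE]
bash experiments/run_rm_evals.sh openbmb/UltraRM-13b aya-rm-multilingual/multilingual-reward-bench output/ openbmb 8
```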
scripts/generative.py: 3 changes (2 additions, 1 deletion)
@@ -77,6 +77,8 @@
"command-nightly",
"command-light",
"command-light-nightly",
"c4ai-aya-23-35b",
"c4ai-aya-23-8b",
)

API_MODEL_LIST = OPENAI_MODEL_LIST + ANTHROPIC_MODEL_LIST + TOGETHER_MODEL_LIST + COHERE_MODEL_LIST
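With `c4ai-aya-23-8b` and `c4ai-aya-23-35b` added to the API model list, `run_generative` can also use an Aya 23 model as the judge. A sketch of such a call, assuming the Cohere API credentials are configured the way the script expects:

```sh
# Hypothetical example: judge the German subset with Aya 23 8B via the Cohere API
python -m scripts.run_generative \
    --dataset_name aya-rm-multilingual/multilingual-reward-bench \
    --lang_code deu_Latn \
    --split "filtered" \
    --model c4ai-aya-23-8b \
    --output_dir output/
```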
@@ -293,7 +295,6 @@ def format_judge_answers(
**kwargs,
)
else:
print("Using the MT-Bench prompts")
if multi_turn:
system_prompt = (
MTBENCH_MULTI_V2["system_prompt"]
