From 49109cf27ec534bf62380a99eee7d82aa8b0fcdc Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Sat, 9 Nov 2024 02:07:11 -0800 Subject: [PATCH 1/6] update last-updated date --- leaderboard.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/leaderboard.html b/leaderboard.html index 8e67f3d9f..4834b0bca 100644 --- a/leaderboard.html +++ b/leaderboard.html @@ -109,7 +109,7 @@

BFCL Leaderboard

Last Updated: - 2024-10-21 [Change Log]
From d097c1851cac29e7ea968330dab3bd7a2d733336 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Wed, 13 Nov 2024 18:32:14 -0800 Subject: [PATCH 2/6] update data.csv --- data_live.csv | 75 ++++++++++++------------ data_non_live.csv | 107 +++++++++++++++++------------------ data_overall.csv | 141 +++++++++++++++++++++++----------------------- 3 files changed, 160 insertions(+), 163 deletions(-) diff --git a/data_live.csv b/data_live.csv index 7b61686a5..447e79e95 100644 --- a/data_live.csv +++ b/data_live.csv @@ -12,28 +12,28 @@ Rank,Model,Live Overall Acc,AST Summary,Python Simple AST,Python Multiple AST,Py 11,Gemini-1.5-Pro-001 (Prompt),73.12%,69.14%,67.44%,69.24%,93.75%,66.67%,80.00%,56.10% 12,Mistral-Medium-2312 (Prompt),73.10%,71.84%,68.60%,73.00%,81.25%,50.00%,100.00%,60.98% 13,o1-preview-2024-09-12 (Prompt),73.08%,77.53%,80.62%,76.76%,75.00%,79.17%,66.29%,73.17% -14,xLAM-8x22b-r (FC),71.97%,79.40%,78.29%,80.14%,75.00%,62.50%,60.00%,85.37% -15,Functionary-Small-v3.1 (FC),70.41%,75.58%,75.19%,75.89%,81.25%,62.50%,61.83%,85.37% -16,Mistral-small-2402 (FC),70.19%,68.16%,63.57%,71.46%,12.50%,12.50%,72.69%,82.93% -17,GPT-4o-mini-2024-07-18 (FC),70.19%,74.23%,72.87%,74.45%,87.50%,70.83%,63.54%,80.49% -18,Hammer2.0-7b (FC),69.79%,76.63%,74.42%,77.15%,81.25%,75.00%,58.17%,95.12% -19,Command-R-Plus (Prompt) (Original),69.75%,69.59%,66.67%,70.30%,68.75%,70.83%,69.83%,73.17% -20,Gemma-2-27b-it (Prompt),69.48%,77.30%,79.46%,77.24%,68.75%,62.50%,56.69%,87.80% -21,Gemma-2-9b-it (Prompt),69.21%,73.11%,73.64%,73.58%,56.25%,58.33%,62.40%,87.80% +14,GoGoAgent,72.46%,72.21%,71.32%,72.42%,87.50%,62.50%,72.11%,87.80% +15,xLAM-8x22b-r (FC),71.97%,79.40%,78.29%,80.14%,75.00%,62.50%,60.00%,85.37% +16,Functionary-Small-v3.1 (FC),70.41%,75.58%,75.19%,75.89%,81.25%,62.50%,61.83%,85.37% +17,Mistral-small-2402 (FC),70.19%,68.16%,63.57%,71.46%,12.50%,12.50%,72.69%,82.93% +18,GPT-4o-mini-2024-07-18 (FC),70.19%,74.23%,72.87%,74.45%,87.50%,70.83%,63.54%,80.49% +19,Hammer2.0-7b (FC),69.79%,76.63%,74.42%,77.15%,81.25%,75.00%,58.17%,95.12% +20,Command-R-Plus (Prompt) (Original),69.75%,69.59%,66.67%,70.30%,68.75%,70.83%,69.83%,73.17% +21,Gemma-2-27b-it (Prompt),69.48%,77.30%,79.46%,77.24%,68.75%,62.50%,56.69%,87.80% 22,Gemini-1.5-Flash-001 (Prompt),69.21%,75.21%,74.42%,75.12%,93.75%,75.00%,59.43%,82.93% -23,xLAM-8x7b-r (FC),69.12%,74.53%,68.22%,76.76%,62.50%,54.17%,60.00%,87.80% -24,GPT-4-turbo-2024-04-09 (Prompt),69.04%,84.64%,85.66%,84.57%,87.50%,75.00%,44.57%,82.93% -25,Open-Mixtral-8x22b (Prompt),68.46%,63.90%,72.87%,61.33%,81.25%,66.67%,75.54%,65.85% -26,mistral-large-2407 (FC),68.37%,79.55%,81.78%,79.27%,68.75%,75.00%,50.97%,75.61% -27,xLAM-7b-r (FC),67.88%,72.28%,71.32%,73.48%,31.25%,58.33%,59.77%,97.56% -28,GPT-3.5-Turbo-0125 (Prompt),67.48%,64.27%,63.57%,64.61%,68.75%,54.17%,71.77%,80.49% -29,Gorilla-OpenFunctions-v2 (FC),67.44%,61.42%,73.64%,58.73%,68.75%,41.67%,76.34%,73.17% -30,Gemini-1.5-Flash-002 (FC),67.35%,57.98%,58.14%,57.96%,68.75%,50.00%,81.94%,60.98% -31,Open-Mixtral-8x22b (FC),66.86%,71.16%,73.26%,72.32%,6.25%,41.67%,59.54%,82.93% -32,Meta-Llama-3-70B-Instruct (Prompt),66.15%,79.10%,78.68%,79.65%,68.75%,66.67%,45.14%,92.68% -33,Qwen2.5-7B-Instruct (Prompt),65.97%,72.13%,72.48%,72.32%,62.50%,66.67%,55.31%,92.68% -34,Gemini-1.5-Pro-001 (FC),65.53%,58.05%,57.75%,58.24%,75.00%,41.67%,77.03%,63.41% -35,Claude-3-Haiku-20240307 (Prompt),65.04%,74.53%,77.13%,74.64%,68.75%,45.83%,49.71%,82.93% +23,Gemma-2-9b-it (Prompt),69.21%,73.11%,73.64%,73.58%,56.25%,58.33%,62.40%,87.80% +24,xLAM-8x7b-r (FC),69.12%,74.53%,68.22%,76.76%,62.50%,54.17%,60.00%,87.80% +25,GPT-4-turbo-2024-04-09 (Prompt),69.04%,84.64%,85.66%,84.57%,87.50%,75.00%,44.57%,82.93% +26,Open-Mixtral-8x22b (Prompt),68.46%,63.90%,72.87%,61.33%,81.25%,66.67%,75.54%,65.85% +27,mistral-large-2407 (FC),68.37%,79.55%,81.78%,79.27%,68.75%,75.00%,50.97%,75.61% +28,xLAM-7b-r (FC),67.88%,72.28%,71.32%,73.48%,31.25%,58.33%,59.77%,97.56% +29,GPT-3.5-Turbo-0125 (Prompt),67.48%,64.27%,63.57%,64.61%,68.75%,54.17%,71.77%,80.49% +30,Gorilla-OpenFunctions-v2 (FC),67.44%,61.42%,73.64%,58.73%,68.75%,41.67%,76.34%,73.17% +31,Gemini-1.5-Flash-002 (FC),67.35%,57.98%,58.14%,57.96%,68.75%,50.00%,81.94%,60.98% +32,Open-Mixtral-8x22b (FC),66.86%,71.16%,73.26%,72.32%,6.25%,41.67%,59.54%,82.93% +33,Meta-Llama-3-70B-Instruct (Prompt),66.15%,79.10%,78.68%,79.65%,68.75%,66.67%,45.14%,92.68% +34,Qwen2.5-7B-Instruct (Prompt),65.97%,72.13%,72.48%,72.32%,62.50%,66.67%,55.31%,92.68% +35,Gemini-1.5-Pro-001 (FC),65.53%,58.05%,57.75%,58.24%,75.00%,41.67%,77.03%,63.41% 36,Open-Mixtral-8x7b (Prompt),64.95%,63.30%,57.36%,65.00%,68.75%,50.00%,67.31%,68.29% 37,Gemini-1.5-Flash-001 (FC),64.90%,59.48%,58.14%,60.46%,43.75%,41.67%,73.49%,58.54% 38,Gemini-1.5-Pro-002 (FC),64.59%,61.05%,58.91%,61.33%,81.25%,58.33%,69.71%,70.73% @@ -53,20 +53,19 @@ Rank,Model,Live Overall Acc,AST Summary,Python Simple AST,Python Multiple AST,Py 52,Granite-20b-FunctionCalling (FC),57.49%,57.08%,65.12%,55.35%,43.75%,54.17%,56.34%,95.12% 53,Command-R-Plus (FC) (Original),57.26%,61.50%,66.67%,60.56%,56.25%,50.00%,49.14%,92.68% 54,Hermes-2-Pro-Mistral-7B (FC),56.46%,59.85%,64.73%,59.40%,43.75%,37.50%,50.40%,75.61% -55,Claude-3.5-Sonnet-20240620 (Prompt),54.24%,31.24%,65.12%,22.66%,37.50%,33.33%,90.97%,19.51% -56,Qwen2-7B-Instruct (Prompt),54.24%,61.57%,59.30%,62.20%,50.00%,66.67%,41.49%,87.80% -57,Mistral-Small-2402 (Prompt),53.98%,39.48%,18.22%,45.90%,12.50%,8.33%,76.69%,41.46% -58,Nexusflow-Raven-v2 (FC),53.49%,39.03%,39.92%,38.48%,56.25%,41.67%,74.97%,65.85% -59,xLAM-7b-fc-r (FC),53.44%,60.07%,75.58%,57.28%,43.75%,25.00%,42.51%,70.73% -60,mistral-large-2407 (Prompt),53.35%,67.42%,45.74%,73.10%,68.75%,54.17%,30.17%,90.24% -61,Hammer2.0-0.5b (FC),52.42%,45.17%,48.84%,44.07%,62.50%,41.67%,61.94%,85.37% -62,Llama-3.2-3B-Instruct (Prompt),50.91%,44.49%,47.67%,44.74%,0.00%,29.17%,60.11%,63.41% -63,Meta-Llama-3-8B-Instruct (Prompt),50.51%,59.78%,60.85%,60.75%,37.50%,20.83%,35.20%,75.61% -64,Open-Mistral-Nemo-2407 (Prompt),50.33%,75.06%,78.29%,74.54%,75.00%,62.50%,10.74%,90.24% -65,Gemini-1.0-Pro-002 (Prompt),45.67%,38.13%,41.47%,36.93%,68.75%,33.33%,55.54%,80.49% -66,Llama-3.1-70B-Instruct (FC),44.47%,51.01%,48.45%,52.56%,31.25%,25.00%,31.89%,100.00% -67,Gemma-2-2b-it (Prompt),41.63%,11.46%,11.24%,11.96%,0.00%,0.00%,89.03%,12.20% -68,Qwen2-1.5B-Instruct (Prompt),39.00%,41.87%,50.39%,40.50%,25.00%,20.83%,32.91%,75.61% -69,xLAM-1b-fc-r (FC),38.34%,54.31%,63.18%,54.19%,0.00%,0.00%,11.20%,97.56% -70,Llama-3.1-8B-Instruct (FC),33.23%,47.34%,48.06%,47.64%,31.25%,37.50%,8.91%,92.68% -71,Llama-3.2-1B-Instruct (Prompt),29.85%,8.91%,25.97%,4.82%,6.25%,4.17%,60.91%,48.78% \ No newline at end of file +55,Qwen2-7B-Instruct (Prompt),54.24%,61.57%,59.30%,62.20%,50.00%,66.67%,41.49%,87.80% +56,Mistral-Small-2402 (Prompt),53.98%,39.48%,18.22%,45.90%,12.50%,8.33%,76.69%,41.46% +57,Nexusflow-Raven-v2 (FC),53.49%,39.03%,39.92%,38.48%,56.25%,41.67%,74.97%,65.85% +58,xLAM-7b-fc-r (FC),53.44%,60.07%,75.58%,57.28%,43.75%,25.00%,42.51%,70.73% +59,mistral-large-2407 (Prompt),53.35%,67.42%,45.74%,73.10%,68.75%,54.17%,30.17%,90.24% +60,Hammer2.0-0.5b (FC),52.42%,45.17%,48.84%,44.07%,62.50%,41.67%,61.94%,85.37% +61,Llama-3.2-3B-Instruct (Prompt),50.91%,44.49%,47.67%,44.74%,0.00%,29.17%,60.11%,63.41% +62,Meta-Llama-3-8B-Instruct (Prompt),50.51%,59.78%,60.85%,60.75%,37.50%,20.83%,35.20%,75.61% +63,Open-Mistral-Nemo-2407 (Prompt),50.33%,75.06%,78.29%,74.54%,75.00%,62.50%,10.74%,90.24% +64,Gemini-1.0-Pro-002 (Prompt),45.67%,38.13%,41.47%,36.93%,68.75%,33.33%,55.54%,80.49% +65,Llama-3.1-70B-Instruct (FC),44.47%,51.01%,48.45%,52.56%,31.25%,25.00%,31.89%,100.00% +66,Gemma-2-2b-it (Prompt),41.63%,11.46%,11.24%,11.96%,0.00%,0.00%,89.03%,12.20% +67,Qwen2-1.5B-Instruct (Prompt),39.00%,41.87%,50.39%,40.50%,25.00%,20.83%,32.91%,75.61% +68,xLAM-1b-fc-r (FC),38.34%,54.31%,63.18%,54.19%,0.00%,0.00%,11.20%,97.56% +69,Llama-3.1-8B-Instruct (FC),33.23%,47.34%,48.06%,47.64%,31.25%,37.50%,8.91%,92.68% +70,Llama-3.2-1B-Instruct (Prompt),29.85%,8.91%,25.97%,4.82%,6.25%,4.17%,60.91%,48.78% \ No newline at end of file diff --git a/data_non_live.csv b/data_non_live.csv index 3ac3ea165..2d37967c1 100644 --- a/data_non_live.csv +++ b/data_non_live.csv @@ -6,49 +6,49 @@ Rank,Model,Non_Live Overall Acc,AST Summary,Exec Summary,Simple AST,Python Simpl 5,GPT-4o-mini-2024-07-18 (Prompt),88.69%,86.23%,91.12%,79.42%,93.25%,65.00%,80.00%,93.00%,86.50%,86.00%,100.00%,100.00%,100.00%,96.00%,86.00%,82.50%,88.75% 6,Hammer2.0-7b (FC),88.54%,90.27%,89.25%,80.58%,97.75%,66.00%,78.00%,95.00%,93.50%,92.00%,90.00%,100.00%,80.00%,94.00%,88.00%,85.00%,78.75% 7,Gemini-1.5-Flash-002 (Prompt),87.60%,86.58%,89.48%,75.33%,95.00%,63.00%,68.00%,91.50%,91.50%,88.00%,95.93%,99.00%,92.86%,96.00%,86.00%,80.00%,84.17% -8,xLAM-8x22b-r (FC),87.51%,88.15%,90.11%,81.08%,95.25%,66.00%,82.00%,93.00%,91.50%,87.00%,96.43%,100.00%,92.86%,96.00%,88.00%,80.00%,74.58% -9,Llama-3.1-70B-Instruct (Prompt),87.50%,88.90%,89.34%,76.58%,95.75%,60.00%,74.00%,95.50%,93.50%,90.00%,91.36%,97.00%,85.71%,96.00%,90.00%,80.00%,74.58% -10,Gemma-2-27b-it (Prompt),87.39%,88.52%,87.89%,81.08%,95.25%,64.00%,84.00%,92.50%,91.00%,89.50%,83.57%,100.00%,67.14%,96.00%,92.00%,80.00%,80.83% -11,o1-preview-2024-09-12 (Prompt),87.12%,86.42%,88.88%,78.17%,93.50%,67.00%,74.00%,93.00%,89.50%,85.00%,99.50%,99.00%,100.00%,92.00%,84.00%,80.00%,82.92% -12,Gemini-1.5-Pro-001 (Prompt),86.17%,83.88%,87.52%,73.00%,91.00%,60.00%,68.00%,91.50%,88.00%,83.00%,91.57%,96.00%,87.14%,94.00%,82.00%,82.50%,90.00% -13,GPT-4o-2024-08-06 (FC),86.15%,85.90%,85.64%,74.58%,91.75%,64.00%,68.00%,92.50%,92.00%,84.50%,87.07%,97.00%,77.14%,92.00%,86.00%,77.50%,89.17% -14,Open-Mixtral-8x22b (Prompt),86.08%,86.92%,88.23%,77.67%,94.00%,59.00%,80.00%,92.50%,90.00%,87.50%,91.43%,100.00%,82.86%,96.00%,88.00%,77.50%,74.17% -15,Gemini-1.5-Flash-001 (Prompt),85.74%,86.17%,87.68%,73.17%,89.50%,64.00%,66.00%,90.50%,92.00%,89.00%,84.21%,97.00%,71.43%,94.00%,90.00%,82.50%,76.25% -16,Qwen2.5-7B-Instruct (Prompt),85.58%,85.79%,88.13%,75.67%,96.00%,59.00%,72.00%,96.00%,88.50%,83.00%,94.50%,99.00%,90.00%,92.00%,86.00%,80.00%,74.58% -17,Meta-Llama-3-70B-Instruct (Prompt),85.10%,87.17%,89.21%,75.17%,95.50%,60.00%,70.00%,95.50%,90.50%,87.50%,95.86%,96.00%,95.71%,96.00%,80.00%,85.00%,60.42% -18,Functionary-Small-v3.1 (FC),84.99%,86.42%,85.95%,74.67%,96.00%,62.00%,66.00%,94.50%,89.50%,87.00%,88.79%,99.00%,78.57%,92.00%,88.00%,75.00%,75.42% -19,Gorilla-OpenFunctions-v2 (FC),84.81%,86.29%,86.09%,77.67%,95.00%,62.00%,76.00%,95.00%,89.00%,83.50%,95.86%,96.00%,95.71%,96.00%,80.00%,72.50%,73.75% -20,Granite-20b-FunctionCalling (FC),84.64%,82.33%,85.91%,72.83%,90.50%,66.00%,62.00%,91.50%,84.50%,80.50%,85.64%,97.00%,74.29%,92.00%,86.00%,80.00%,88.75% -21,GPT-4-turbo-2024-04-09 (FC),84.55%,84.67%,84.32%,69.17%,92.50%,59.00%,56.00%,91.00%,90.50%,88.00%,88.29%,98.00%,78.57%,88.00%,86.00%,75.00%,85.00% -22,Gemma-2-9b-it (Prompt),84.52%,84.38%,85.18%,74.50%,93.50%,60.00%,70.00%,92.00%,88.00%,83.00%,84.21%,97.00%,71.43%,94.00%,90.00%,72.50%,82.50% -23,Hammer2.0-1.5b (FC),84.44%,84.06%,88.95%,75.25%,94.75%,65.00%,66.00%,90.50%,88.00%,82.50%,93.29%,98.00%,88.57%,92.00%,88.00%,82.50%,67.92% -24,o1-mini-2024-09-12 (Prompt),83.84%,81.31%,84.00%,73.75%,88.25%,61.00%,72.00%,90.00%,81.00%,80.50%,88.50%,97.00%,80.00%,92.00%,78.00%,77.50%,93.33% -25,GPT-4o-mini-2024-07-18 (FC),83.72%,84.25%,84.12%,73.50%,90.50%,64.00%,66.00%,90.50%,90.00%,83.00%,83.50%,97.00%,70.00%,92.00%,86.00%,75.00%,80.00% -26,Command-R-Plus (Prompt) (Original),82.19%,80.90%,85.07%,71.08%,89.25%,60.00%,64.00%,91.50%,82.00%,79.00%,93.29%,98.00%,88.57%,90.00%,82.00%,75.00%,75.83% -27,mistral-large-2407 (FC),81.41%,86.62%,84.57%,73.00%,96.00%,57.00%,66.00%,92.00%,91.50%,90.00%,73.79%,99.00%,48.57%,94.00%,88.00%,82.50%,47.92% -28,Llama-3.1-8B-Instruct (Prompt),81.15%,83.62%,87.29%,73.00%,94.00%,59.00%,66.00%,94.50%,83.50%,83.50%,85.64%,97.00%,74.29%,96.00%,90.00%,77.50%,46.67% -29,xLAM-7b-r (FC),80.86%,81.40%,83.46%,73.08%,91.25%,56.00%,72.00%,93.50%,79.50%,79.50%,76.86%,98.00%,55.71%,92.00%,90.00%,75.00%,68.33% -30,Open-Mistral-Nemo-2407 (Prompt),79.66%,85.60%,91.23%,77.42%,92.25%,60.00%,80.00%,93.50%,87.00%,84.50%,95.93%,99.00%,92.86%,96.00%,88.00%,85.00%,9.58% -31,Mistral-Medium-2312 (Prompt),79.27%,74.02%,81.73%,70.58%,91.75%,56.00%,64.00%,91.50%,65.50%,68.50%,95.43%,98.00%,92.86%,92.00%,72.00%,67.50%,90.42% -32,Hermes-2-Pro-Llama-3-70B (FC),78.81%,78.85%,80.45%,59.92%,83.75%,54.00%,42.00%,80.00%,88.00%,87.50%,76.29%,94.00%,58.57%,82.00%,86.00%,77.50%,72.08% -33,GPT-3.5-Turbo-0125 (FC),78.52%,84.12%,84.11%,75.50%,94.50%,64.00%,68.00%,93.00%,88.00%,80.00%,95.43%,98.00%,92.86%,90.00%,86.00%,65.00%,33.75% -34,Open-Mistral-Nemo-2407 (FC),78.29%,81.21%,77.04%,63.33%,92.00%,36.00%,62.00%,92.00%,86.50%,83.00%,55.64%,97.00%,14.29%,90.00%,90.00%,72.50%,71.67% -35,Qwen2.5-1.5B-Instruct (Prompt),78.14%,75.19%,82.82%,70.25%,87.75%,55.00%,68.00%,85.50%,73.50%,71.50%,72.79%,97.00%,48.57%,94.00%,82.00%,82.50%,71.25% -36,Qwen2-7B-Instruct (Prompt),75.50%,74.85%,81.70%,67.42%,84.25%,60.00%,58.00%,87.50%,71.00%,73.50%,86.79%,95.00%,78.57%,88.00%,82.00%,70.00%,53.33% -37,Command-R-Plus (FC) (Original),75.47%,76.83%,78.61%,66.33%,87.00%,60.00%,52.00%,90.00%,82.00%,69.00%,88.93%,95.00%,82.86%,88.00%,80.00%,57.50%,57.50% -38,Hermes-2-Pro-Llama-3-8B (FC),74.14%,76.54%,75.48%,64.17%,90.50%,56.00%,46.00%,89.50%,79.50%,73.00%,69.93%,97.00%,42.86%,94.00%,78.00%,60.00%,59.17% -39,Llama-3.2-3B-Instruct (Prompt),74.03%,77.77%,69.41%,64.08%,81.25%,49.00%,62.00%,90.00%,80.50%,76.50%,78.14%,82.00%,74.29%,92.00%,50.00%,57.50%,77.50% -40,xLAM-8x7b-r (FC),73.93%,68.85%,78.43%,68.42%,79.25%,60.00%,66.00%,88.00%,63.50%,55.50%,87.71%,94.00%,81.43%,88.00%,68.00%,70.00%,76.25% -41,Hermes-2-Pro-Mistral-7B (FC),69.78%,72.83%,77.30%,61.33%,86.00%,56.00%,42.00%,87.50%,78.50%,64.00%,61.71%,92.00%,31.43%,94.00%,86.00%,67.50%,27.50% -42,Claude-3.5-Sonnet-20240620 (FC),69.19%,70.04%,66.27%,75.17%,93.50%,64.00%,68.00%,93.50%,64.50%,47.00%,97.57%,98.00%,97.14%,90.00%,40.00%,37.50%,77.50% -43,DBRX-Instruct (Prompt),68.89%,67.04%,75.04%,72.17%,92.50%,54.00%,70.00%,91.50%,56.50%,48.00%,90.14%,96.00%,84.29%,88.00%,62.00%,60.00%,51.67% -44,Hammer2.0-0.5b (FC),68.44%,66.79%,70.43%,62.17%,82.50%,52.00%,52.00%,80.00%,67.50%,57.50%,53.21%,95.00%,11.43%,86.00%,80.00%,62.50%,67.08% -45,xLAM-7b-fc-r (FC),67.87%,74.56%,65.75%,74.25%,93.75%,63.00%,66.00%,92.00%,78.00%,54.00%,84.50%,99.00%,70.00%,90.00%,66.00%,22.50%,49.58% -46,GPT-3.5-Turbo-0125 (Prompt),67.78%,65.04%,67.68%,62.67%,78.00%,48.00%,62.00%,83.00%,65.50%,49.00%,46.21%,91.00%,1.43%,90.00%,72.00%,62.50%,79.17% -47,Open-Mixtral-8x7b (Prompt),64.49%,57.94%,65.91%,68.75%,89.25%,53.00%,64.00%,86.00%,40.00%,37.00%,71.14%,88.00%,54.29%,88.00%,52.00%,52.50%,85.00% -48,GPT-4o-2024-08-06 (Prompt),63.57%,49.35%,69.93%,32.42%,66.25%,11.00%,20.00%,48.00%,74.00%,43.00%,49.71%,88.00%,11.43%,82.00%,78.00%,70.00%,95.00% -49,Claude-3-Opus-20240229 (FC tools-2024-04-04),62.73%,58.67%,62.05%,68.67%,89.00%,61.00%,56.00%,89.00%,41.00%,36.00%,88.71%,96.00%,81.43%,88.00%,44.00%,27.50%,81.67% -50,Claude-3.5-Sonnet-20240620 (Prompt),61.29%,60.58%,54.20%,50.33%,94.00%,31.00%,26.00%,88.00%,43.50%,60.50%,66.79%,75.00%,58.57%,52.00%,38.00%,60.00%,92.50% +8,GoGoAgent,87.54%,86.00%,88.05%,75.50%,92.50%,64.00%,70.00%,92.50%,92.00%,84.00%,94.71%,98.00%,91.43%,94.00%,86.00%,77.50%,91.67% +9,xLAM-8x22b-r (FC),87.51%,88.15%,90.11%,81.08%,95.25%,66.00%,82.00%,93.00%,91.50%,87.00%,96.43%,100.00%,92.86%,96.00%,88.00%,80.00%,74.58% +10,Llama-3.1-70B-Instruct (Prompt),87.50%,88.90%,89.34%,76.58%,95.75%,60.00%,74.00%,95.50%,93.50%,90.00%,91.36%,97.00%,85.71%,96.00%,90.00%,80.00%,74.58% +11,Gemma-2-27b-it (Prompt),87.39%,88.52%,87.89%,81.08%,95.25%,64.00%,84.00%,92.50%,91.00%,89.50%,83.57%,100.00%,67.14%,96.00%,92.00%,80.00%,80.83% +12,o1-preview-2024-09-12 (Prompt),87.12%,86.42%,88.88%,78.17%,93.50%,67.00%,74.00%,93.00%,89.50%,85.00%,99.50%,99.00%,100.00%,92.00%,84.00%,80.00%,82.92% +13,Gemini-1.5-Pro-001 (Prompt),86.17%,83.88%,87.52%,73.00%,91.00%,60.00%,68.00%,91.50%,88.00%,83.00%,91.57%,96.00%,87.14%,94.00%,82.00%,82.50%,90.00% +14,GPT-4o-2024-08-06 (FC),86.15%,85.90%,85.64%,74.58%,91.75%,64.00%,68.00%,92.50%,92.00%,84.50%,87.07%,97.00%,77.14%,92.00%,86.00%,77.50%,89.17% +15,Open-Mixtral-8x22b (Prompt),86.08%,86.92%,88.23%,77.67%,94.00%,59.00%,80.00%,92.50%,90.00%,87.50%,91.43%,100.00%,82.86%,96.00%,88.00%,77.50%,74.17% +16,Gemini-1.5-Flash-001 (Prompt),85.74%,86.17%,87.68%,73.17%,89.50%,64.00%,66.00%,90.50%,92.00%,89.00%,84.21%,97.00%,71.43%,94.00%,90.00%,82.50%,76.25% +17,Qwen2.5-7B-Instruct (Prompt),85.58%,85.79%,88.13%,75.67%,96.00%,59.00%,72.00%,96.00%,88.50%,83.00%,94.50%,99.00%,90.00%,92.00%,86.00%,80.00%,74.58% +18,Meta-Llama-3-70B-Instruct (Prompt),85.10%,87.17%,89.21%,75.17%,95.50%,60.00%,70.00%,95.50%,90.50%,87.50%,95.86%,96.00%,95.71%,96.00%,80.00%,85.00%,60.42% +19,Functionary-Small-v3.1 (FC),84.99%,86.42%,85.95%,74.67%,96.00%,62.00%,66.00%,94.50%,89.50%,87.00%,88.79%,99.00%,78.57%,92.00%,88.00%,75.00%,75.42% +20,Gorilla-OpenFunctions-v2 (FC),84.81%,86.29%,86.09%,77.67%,95.00%,62.00%,76.00%,95.00%,89.00%,83.50%,95.86%,96.00%,95.71%,96.00%,80.00%,72.50%,73.75% +21,Granite-20b-FunctionCalling (FC),84.64%,82.33%,85.91%,72.83%,90.50%,66.00%,62.00%,91.50%,84.50%,80.50%,85.64%,97.00%,74.29%,92.00%,86.00%,80.00%,88.75% +22,GPT-4-turbo-2024-04-09 (FC),84.55%,84.67%,84.32%,69.17%,92.50%,59.00%,56.00%,91.00%,90.50%,88.00%,88.29%,98.00%,78.57%,88.00%,86.00%,75.00%,85.00% +23,Gemma-2-9b-it (Prompt),84.52%,84.38%,85.18%,74.50%,93.50%,60.00%,70.00%,92.00%,88.00%,83.00%,84.21%,97.00%,71.43%,94.00%,90.00%,72.50%,82.50% +24,Hammer2.0-1.5b (FC),84.44%,84.06%,88.95%,75.25%,94.75%,65.00%,66.00%,90.50%,88.00%,82.50%,93.29%,98.00%,88.57%,92.00%,88.00%,82.50%,67.92% +25,o1-mini-2024-09-12 (Prompt),83.84%,81.31%,84.00%,73.75%,88.25%,61.00%,72.00%,90.00%,81.00%,80.50%,88.50%,97.00%,80.00%,92.00%,78.00%,77.50%,93.33% +26,GPT-4o-mini-2024-07-18 (FC),83.72%,84.25%,84.12%,73.50%,90.50%,64.00%,66.00%,90.50%,90.00%,83.00%,83.50%,97.00%,70.00%,92.00%,86.00%,75.00%,80.00% +27,Command-R-Plus (Prompt) (Original),82.19%,80.90%,85.07%,71.08%,89.25%,60.00%,64.00%,91.50%,82.00%,79.00%,93.29%,98.00%,88.57%,90.00%,82.00%,75.00%,75.83% +28,mistral-large-2407 (FC),81.41%,86.62%,84.57%,73.00%,96.00%,57.00%,66.00%,92.00%,91.50%,90.00%,73.79%,99.00%,48.57%,94.00%,88.00%,82.50%,47.92% +29,Llama-3.1-8B-Instruct (Prompt),81.15%,83.62%,87.29%,73.00%,94.00%,59.00%,66.00%,94.50%,83.50%,83.50%,85.64%,97.00%,74.29%,96.00%,90.00%,77.50%,46.67% +30,xLAM-7b-r (FC),80.86%,81.40%,83.46%,73.08%,91.25%,56.00%,72.00%,93.50%,79.50%,79.50%,76.86%,98.00%,55.71%,92.00%,90.00%,75.00%,68.33% +31,Open-Mistral-Nemo-2407 (Prompt),79.66%,85.60%,91.23%,77.42%,92.25%,60.00%,80.00%,93.50%,87.00%,84.50%,95.93%,99.00%,92.86%,96.00%,88.00%,85.00%,9.58% +32,Mistral-Medium-2312 (Prompt),79.27%,74.02%,81.73%,70.58%,91.75%,56.00%,64.00%,91.50%,65.50%,68.50%,95.43%,98.00%,92.86%,92.00%,72.00%,67.50%,90.42% +33,Hermes-2-Pro-Llama-3-70B (FC),78.81%,78.85%,80.45%,59.92%,83.75%,54.00%,42.00%,80.00%,88.00%,87.50%,76.29%,94.00%,58.57%,82.00%,86.00%,77.50%,72.08% +34,GPT-3.5-Turbo-0125 (FC),78.52%,84.12%,84.11%,75.50%,94.50%,64.00%,68.00%,93.00%,88.00%,80.00%,95.43%,98.00%,92.86%,90.00%,86.00%,65.00%,33.75% +35,Open-Mistral-Nemo-2407 (FC),78.29%,81.21%,77.04%,63.33%,92.00%,36.00%,62.00%,92.00%,86.50%,83.00%,55.64%,97.00%,14.29%,90.00%,90.00%,72.50%,71.67% +36,Qwen2.5-1.5B-Instruct (Prompt),78.14%,75.19%,82.82%,70.25%,87.75%,55.00%,68.00%,85.50%,73.50%,71.50%,72.79%,97.00%,48.57%,94.00%,82.00%,82.50%,71.25% +37,Qwen2-7B-Instruct (Prompt),75.50%,74.85%,81.70%,67.42%,84.25%,60.00%,58.00%,87.50%,71.00%,73.50%,86.79%,95.00%,78.57%,88.00%,82.00%,70.00%,53.33% +38,Command-R-Plus (FC) (Original),75.47%,76.83%,78.61%,66.33%,87.00%,60.00%,52.00%,90.00%,82.00%,69.00%,88.93%,95.00%,82.86%,88.00%,80.00%,57.50%,57.50% +39,Hermes-2-Pro-Llama-3-8B (FC),74.14%,76.54%,75.48%,64.17%,90.50%,56.00%,46.00%,89.50%,79.50%,73.00%,69.93%,97.00%,42.86%,94.00%,78.00%,60.00%,59.17% +40,Llama-3.2-3B-Instruct (Prompt),74.03%,77.77%,69.41%,64.08%,81.25%,49.00%,62.00%,90.00%,80.50%,76.50%,78.14%,82.00%,74.29%,92.00%,50.00%,57.50%,77.50% +41,xLAM-8x7b-r (FC),73.93%,68.85%,78.43%,68.42%,79.25%,60.00%,66.00%,88.00%,63.50%,55.50%,87.71%,94.00%,81.43%,88.00%,68.00%,70.00%,76.25% +42,Hermes-2-Pro-Mistral-7B (FC),69.78%,72.83%,77.30%,61.33%,86.00%,56.00%,42.00%,87.50%,78.50%,64.00%,61.71%,92.00%,31.43%,94.00%,86.00%,67.50%,27.50% +43,Claude-3.5-Sonnet-20240620 (FC),69.19%,70.04%,66.27%,75.17%,93.50%,64.00%,68.00%,93.50%,64.50%,47.00%,97.57%,98.00%,97.14%,90.00%,40.00%,37.50%,77.50% +44,DBRX-Instruct (Prompt),68.89%,67.04%,75.04%,72.17%,92.50%,54.00%,70.00%,91.50%,56.50%,48.00%,90.14%,96.00%,84.29%,88.00%,62.00%,60.00%,51.67% +45,Hammer2.0-0.5b (FC),68.44%,66.79%,70.43%,62.17%,82.50%,52.00%,52.00%,80.00%,67.50%,57.50%,53.21%,95.00%,11.43%,86.00%,80.00%,62.50%,67.08% +46,xLAM-7b-fc-r (FC),67.87%,74.56%,65.75%,74.25%,93.75%,63.00%,66.00%,92.00%,78.00%,54.00%,84.50%,99.00%,70.00%,90.00%,66.00%,22.50%,49.58% +47,GPT-3.5-Turbo-0125 (Prompt),67.78%,65.04%,67.68%,62.67%,78.00%,48.00%,62.00%,83.00%,65.50%,49.00%,46.21%,91.00%,1.43%,90.00%,72.00%,62.50%,79.17% +48,Open-Mixtral-8x7b (Prompt),64.49%,57.94%,65.91%,68.75%,89.25%,53.00%,64.00%,86.00%,40.00%,37.00%,71.14%,88.00%,54.29%,88.00%,52.00%,52.50%,85.00% +49,GPT-4o-2024-08-06 (Prompt),63.57%,49.35%,69.93%,32.42%,66.25%,11.00%,20.00%,48.00%,74.00%,43.00%,49.71%,88.00%,11.43%,82.00%,78.00%,70.00%,95.00% +50,Claude-3-Opus-20240229 (FC tools-2024-04-04),62.73%,58.67%,62.05%,68.67%,89.00%,61.00%,56.00%,89.00%,41.00%,36.00%,88.71%,96.00%,81.43%,88.00%,44.00%,27.50%,81.67% 51,Meta-Llama-3-8B-Instruct (Prompt),58.94%,61.02%,66.70%,63.08%,88.25%,49.00%,52.00%,85.50%,51.50%,44.00%,83.29%,88.00%,78.57%,82.00%,44.00%,57.50%,19.58% 52,Gemini-1.0-Pro-002 (Prompt),58.91%,56.29%,62.39%,42.17%,43.50%,39.00%,44.00%,51.00%,68.50%,63.50%,48.57%,70.00%,27.14%,76.00%,70.00%,55.00%,55.42% 53,Mistral-small-2402 (FC),58.70%,57.27%,53.77%,67.08%,91.25%,58.00%,52.00%,93.50%,20.00%,48.50%,87.07%,97.00%,77.14%,92.00%,16.00%,20.00%,84.17% @@ -59,14 +59,13 @@ Rank,Model,Non_Live Overall Acc,AST Summary,Exec Summary,Simple AST,Python Simpl 58,Gemini-1.5-Pro-001 (FC),54.90%,31.77%,70.39%,35.58%,40.75%,24.00%,42.00%,39.50%,26.50%,25.50%,75.07%,83.00%,67.14%,80.00%,74.00%,52.50%,85.42% 59,mistral-large-2407 (Prompt),54.60%,62.27%,56.93%,46.58%,64.75%,39.00%,36.00%,77.00%,70.00%,55.50%,40.21%,59.00%,21.43%,80.00%,70.00%,37.50%,14.58% 60,Qwen2-1.5B-Instruct (Prompt),53.99%,59.73%,58.52%,55.92%,79.75%,42.00%,46.00%,80.00%,55.50%,47.50%,51.07%,85.00%,17.14%,82.00%,56.00%,45.00%,12.92% -61,Claude-3-Haiku-20240307 (Prompt),53.93%,58.21%,57.93%,76.83%,95.50%,63.00%,72.00%,93.50%,38.00%,24.50%,89.71%,98.00%,81.43%,96.00%,26.00%,20.00%,20.83% -62,Gemini-1.5-Flash-002 (FC),53.15%,35.42%,60.84%,49.67%,39.00%,56.00%,54.00%,39.00%,24.00%,29.00%,60.86%,66.00%,55.71%,80.00%,50.00%,52.50%,93.33% -63,Gemini-1.5-Flash-001 (FC),51.40%,33.56%,62.41%,47.25%,41.75%,54.00%,46.00%,40.00%,22.50%,24.50%,53.14%,82.00%,24.29%,76.00%,68.00%,52.50%,78.75% -64,Gemini-1.0-Pro-002 (FC),45.85%,26.21%,58.11%,48.83%,42.50%,56.00%,48.00%,39.00%,7.50%,9.50%,76.43%,80.00%,72.86%,76.00%,60.00%,20.00%,75.42% -65,Claude-3-Haiku-20240307 (FC tools-2024-04-04),42.79%,41.67%,47.52%,70.67%,96.00%,60.00%,56.00%,93.50%,2.00%,0.50%,92.07%,97.00%,87.14%,92.00%,6.00%,0.00%,28.33% -66,Llama-3.1-8B-Instruct (FC),38.61%,36.52%,49.93%,56.08%,50.25%,56.00%,62.00%,55.00%,0.00%,35.00%,58.21%,65.00%,51.43%,58.00%,56.00%,27.50%,1.67% -67,xLAM-1b-fc-r (FC),35.96%,39.94%,40.23%,71.25%,82.75%,59.00%,72.00%,85.50%,1.50%,1.50%,74.93%,97.00%,52.86%,86.00%,0.00%,0.00%,2.92% -68,Llama-3.1-70B-Instruct (FC),31.26%,25.15%,31.23%,49.58%,24.75%,58.00%,66.00%,24.50%,11.50%,15.00%,53.43%,34.00%,72.86%,34.00%,30.00%,7.50%,55.83% -69,Llama-3.2-1B-Instruct (Prompt),23.94%,22.77%,19.11%,25.08%,53.25%,12.00%,10.00%,32.00%,24.00%,10.00%,27.93%,53.00%,2.86%,18.00%,28.00%,2.50%,47.92% -70,Mistral-Small-2402 (Prompt),21.01%,16.33%,9.38%,10.83%,31.50%,1.00%,0.00%,36.50%,11.50%,6.50%,13.00%,16.00%,10.00%,18.00%,4.00%,2.50%,86.25% -71,Gemma-2-2b-it (Prompt),19.01%,12.19%,12.88%,7.25%,15.75%,2.00%,4.00%,41.50%,0.00%,0.00%,5.50%,11.00%,0.00%,46.00%,0.00%,0.00%,70.83% \ No newline at end of file +61,Gemini-1.5-Flash-002 (FC),53.15%,35.42%,60.84%,49.67%,39.00%,56.00%,54.00%,39.00%,24.00%,29.00%,60.86%,66.00%,55.71%,80.00%,50.00%,52.50%,93.33% +62,Gemini-1.5-Flash-001 (FC),51.40%,33.56%,62.41%,47.25%,41.75%,54.00%,46.00%,40.00%,22.50%,24.50%,53.14%,82.00%,24.29%,76.00%,68.00%,52.50%,78.75% +63,Gemini-1.0-Pro-002 (FC),45.85%,26.21%,58.11%,48.83%,42.50%,56.00%,48.00%,39.00%,7.50%,9.50%,76.43%,80.00%,72.86%,76.00%,60.00%,20.00%,75.42% +64,Claude-3-Haiku-20240307 (FC tools-2024-04-04),42.79%,41.67%,47.52%,70.67%,96.00%,60.00%,56.00%,93.50%,2.00%,0.50%,92.07%,97.00%,87.14%,92.00%,6.00%,0.00%,28.33% +65,Llama-3.1-8B-Instruct (FC),38.61%,36.52%,49.93%,56.08%,50.25%,56.00%,62.00%,55.00%,0.00%,35.00%,58.21%,65.00%,51.43%,58.00%,56.00%,27.50%,1.67% +66,xLAM-1b-fc-r (FC),35.96%,39.94%,40.23%,71.25%,82.75%,59.00%,72.00%,85.50%,1.50%,1.50%,74.93%,97.00%,52.86%,86.00%,0.00%,0.00%,2.92% +67,Llama-3.1-70B-Instruct (FC),31.26%,25.15%,31.23%,49.58%,24.75%,58.00%,66.00%,24.50%,11.50%,15.00%,53.43%,34.00%,72.86%,34.00%,30.00%,7.50%,55.83% +68,Llama-3.2-1B-Instruct (Prompt),23.94%,22.77%,19.11%,25.08%,53.25%,12.00%,10.00%,32.00%,24.00%,10.00%,27.93%,53.00%,2.86%,18.00%,28.00%,2.50%,47.92% +69,Mistral-Small-2402 (Prompt),21.01%,16.33%,9.38%,10.83%,31.50%,1.00%,0.00%,36.50%,11.50%,6.50%,13.00%,16.00%,10.00%,18.00%,4.00%,2.50%,86.25% +70,Gemma-2-2b-it (Prompt),19.01%,12.19%,12.88%,7.25%,15.75%,2.00%,4.00%,41.50%,0.00%,0.00%,5.50%,11.00%,0.00%,46.00%,0.00%,0.00%,70.83% \ No newline at end of file diff --git a/data_overall.csv b/data_overall.csv index 66ce6633e..762584dd5 100644 --- a/data_overall.csv +++ b/data_overall.csv @@ -1,72 +1,71 @@ Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s),Latency Standard Deviation (s),Latency 95th Percentile (s),Non-Live AST Acc,Non-Live Simple AST,Non-Live Multiple AST,Non-Live Parallel AST,Non-Live Parallel Multiple AST,Non-Live Exec Acc,Non-Live Simple Exec,Non-Live Multiple Exec,Non-Live Parallel Exec,Non-Live Parallel Multiple Exec,Live Acc,Live Simple AST,Live Multiple AST,Live Parallel AST,Live Parallel Multiple AST,Multi Turn Acc,Multi Turn Base,Multi Turn Miss Func,Multi Turn Miss Param,Multi Turn Long Context,Multi Turn Composite,Relevance Detection,Irrelevance Detection,Organization,License -1,62.19%,GPT-4o-2024-08-06 (FC),https://openai.com/index/hello-gpt-4o/,4.75,1.05,1.18,2.33,85.90%,74.58%,92.50%,92.00%,84.50%,85.64%,87.07%,92.00%,86.00%,77.50%,75.43%,74.42%,75.12%,81.25%,70.83%,25.00%,41.00%,9.00%,20.00%,30.00%,N/A,63.41%,82.93%,OpenAI,Proprietary -2,62.02%,Functionary-Medium-v3.1 (FC),https://huggingface.co/meetkai/functionary-medium-v3.1,N/A,14.77,18.52,43.22,89.52%,76.08%,96.50%,95.00%,90.50%,89.77%,97.57%,94.00%,90.00%,77.50%,73.48%,79.46%,81.87%,68.75%,70.83%,23.50%,36.50%,12.50%,27.00%,18.00%,N/A,70.73%,73.32%,MeetKai,MIT -3,61.89%,GPT-4-turbo-2024-04-09 (FC),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,19.49,2.28,4.39,4.81,84.67%,69.17%,91.00%,90.50%,88.00%,84.32%,88.29%,88.00%,86.00%,75.00%,76.23%,77.52%,77.63%,81.25%,66.67%,24.88%,37.50%,7.00%,22.00%,33.00%,N/A,73.17%,79.76%,OpenAI,Proprietary -4,60.47%,GPT-4o-mini-2024-07-18 (FC),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.34,1.3,8.12,2.38,84.25%,73.50%,90.50%,90.00%,83.00%,84.12%,83.50%,92.00%,86.00%,75.00%,70.19%,72.87%,74.45%,87.50%,70.83%,27.50%,39.50%,10.50%,26.50%,33.50%,N/A,80.49%,71.77%,OpenAI,Proprietary -5,60.44%,ToolACE-8B (FC),https://huggingface.co/Team-ACE/ToolACE-8B,N/A,N/A,N/A,N/A,87.06%,76.25%,93.00%,90.00%,89.00%,89.52%,98.57%,94.00%,88.00%,77.50%,74.99%,66.67%,74.93%,81.25%,70.83%,17.38%,18.00%,12.50%,18.00%,21.00%,N/A,80.49%,85.71%,Huawei Noah & USTC,Apache-2.0 -6,59.27%,o1-preview-2024-09-12 (Prompt),https://openai.com/index/introducing-openai-o1-preview/,146.6,19.49,10.66,39.41,86.42%,78.17%,93.00%,89.50%,85.00%,88.88%,99.50%,92.00%,84.00%,80.00%,73.08%,80.62%,76.76%,75.00%,79.17%,17.62%,26.50%,5.00%,19.50%,19.50%,N/A,73.17%,74.60%,OpenAI,Proprietary -7,58.15%,GPT-4o-mini-2024-07-18 (Prompt),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.43,0.9,5.81,1.42,86.23%,79.42%,93.00%,86.50%,86.00%,91.12%,100.00%,96.00%,86.00%,82.50%,74.63%,79.46%,74.35%,93.75%,70.83%,11.13%,15.50%,4.00%,10.00%,15.00%,N/A,75.61%,81.00%,OpenAI,Proprietary -8,57.99%,xLAM-8x22b-r (FC),https://huggingface.co/Salesforce/xLAM-8x22b-r,N/A,N/A,N/A,N/A,88.15%,81.08%,93.00%,91.50%,87.00%,90.11%,96.43%,96.00%,88.00%,80.00%,71.97%,78.29%,80.14%,75.00%,62.50%,14.50%,23.50%,7.00%,13.00%,14.50%,N/A,85.37%,67.29%,Salesforce,cc-by-nc-4.0 -9,57.92%,Gemini-1.5-Flash-002 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.22,0.75,1.13,1.09,86.58%,75.33%,91.50%,91.50%,88.00%,89.48%,95.93%,96.00%,86.00%,80.00%,76.28%,77.91%,78.30%,93.75%,66.67%,9.88%,16.00%,2.00%,9.50%,12.00%,N/A,85.37%,78.54%,Google,Proprietary -10,57.69%,Hammer2.0-7b (FC),https://huggingface.co/MadeAgents/Hammer2.0-7b,N/A,N/A,N/A,N/A,90.27%,80.58%,95.00%,93.50%,92.00%,89.25%,90.00%,94.00%,88.00%,85.00%,69.79%,74.42%,77.15%,81.25%,75.00%,14.75%,17.00%,3.00%,20.50%,18.50%,N/A,95.12%,68.46%,MadeAgents,cc-by-nc-4.0 -11,57.45%,o1-mini-2024-09-12 (Prompt),https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/,19.85,6.74,8.87,14.26,81.31%,73.75%,90.00%,81.00%,80.50%,84.00%,88.50%,92.00%,78.00%,77.50%,75.39%,73.26%,71.07%,75.00%,62.50%,13.12%,23.00%,3.00%,12.00%,14.50%,N/A,48.78%,88.04%,OpenAI,Proprietary -12,57.42%,Claude-3.5-Sonnet-20240620 (FC),https://www.anthropic.com/news/claude-3-5-sonnet,14.4,4.33,4.82,7.86,70.04%,75.17%,93.50%,64.50%,47.00%,66.27%,97.57%,90.00%,40.00%,37.50%,74.68%,80.23%,76.76%,56.25%,58.33%,28.38%,43.00%,16.50%,27.00%,27.00%,N/A,68.29%,74.58%,Anthropic,Proprietary -13,56.80%,mistral-large-2407 (FC),https://mistral.ai/news/mistral-large-2407/,7.92,3.02,9.89,5.67,86.62%,73.00%,92.00%,91.50%,90.00%,84.57%,73.79%,94.00%,88.00%,82.50%,68.37%,81.78%,79.27%,68.75%,75.00%,20.62%,25.50%,12.00%,24.00%,21.00%,N/A,75.61%,49.44%,Mistral AI,Proprietary -14,56.67%,Functionary-Small-v3.1 (FC),https://huggingface.co/meetkai/functionary-small-v3.1,N/A,23.1,21.13,64.91,86.42%,74.67%,94.50%,89.50%,87.00%,85.95%,88.79%,92.00%,88.00%,75.00%,70.41%,75.19%,75.89%,81.25%,62.50%,14.62%,24.00%,2.50%,21.00%,11.00%,N/A,85.37%,68.62%,MeetKai,MIT -15,56.51%,Gemini-1.5-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,2.65,1.41,2.21,2.34,88.96%,79.83%,94.00%,93.00%,89.00%,91.77%,98.57%,96.00%,90.00%,82.50%,74.41%,77.52%,76.76%,87.50%,75.00%,5.50%,9.50%,1.00%,6.00%,5.50%,N/A,65.85%,77.30%,Google,Proprietary -16,55.86%,Gemini-1.5-Flash-001 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.29,0.68,0.85,1.02,86.17%,73.17%,90.50%,92.00%,89.00%,87.68%,84.21%,94.00%,90.00%,82.50%,69.21%,74.42%,75.12%,93.75%,75.00%,12.62%,18.50%,4.00%,16.00%,12.00%,N/A,82.93%,67.84%,Google,Proprietary -17,55.78%,GPT-4-turbo-2024-04-09 (Prompt),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,34.28,1.3,3.03,2.41,91.46%,82.33%,95.00%,95.00%,93.50%,90.00%,99.50%,98.00%,80.00%,82.50%,69.04%,85.66%,84.57%,87.50%,75.00%,9.50%,12.00%,3.00%,10.50%,12.50%,N/A,82.93%,58.95%,OpenAI,Proprietary -18,55.10%,Gemini-1.5-Pro-001 (Prompt),https://deepmind.google/technologies/gemini/pro/,2.92,1.55,2.84,2.9,83.88%,73.00%,91.50%,88.00%,83.00%,87.52%,91.57%,94.00%,82.00%,82.50%,73.12%,67.44%,69.24%,93.75%,66.67%,6.00%,10.00%,2.00%,7.50%,4.50%,N/A,56.10%,85.00%,Google,Proprietary -19,54.41%,xLAM-7b-r (FC),https://huggingface.co/Salesforce/xLAM-7b-r,N/A,N/A,N/A,N/A,81.40%,73.08%,93.50%,79.50%,79.50%,83.46%,76.86%,92.00%,90.00%,75.00%,67.88%,71.32%,73.48%,31.25%,58.33%,14.50%,18.00%,10.00%,17.00%,13.00%,N/A,97.56%,64.05%,Salesforce,cc-by-nc-4.0 -20,54.27%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,N/A,N/A,N/A,85.79%,75.67%,96.00%,88.50%,83.00%,88.13%,94.50%,92.00%,86.00%,80.00%,65.97%,72.48%,72.32%,62.50%,66.67%,11.25%,11.00%,10.50%,12.50%,11.00%,N/A,92.68%,64.95%,Qwen,apache-2.0 -21,53.67%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,88.90%,76.58%,95.50%,93.50%,90.00%,89.34%,91.36%,96.00%,90.00%,80.00%,61.13%,77.13%,71.46%,87.50%,62.50%,12.38%,11.50%,7.50%,17.50%,13.00%,N/A,92.68%,58.38%,Meta,Meta Llama 3 Community -22,53.66%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,88.52%,81.08%,92.50%,91.00%,89.50%,87.89%,83.57%,96.00%,92.00%,80.00%,69.48%,79.46%,77.24%,68.75%,62.50%,4.12%,6.50%,1.50%,6.50%,2.00%,N/A,87.80%,68.76%,Google,gemma-terms-of-use -23,53.00%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,0.84,1.08,1.29,1.96,84.12%,75.50%,93.00%,88.00%,80.00%,84.11%,95.43%,90.00%,86.00%,65.00%,61.22%,74.42%,77.82%,43.75%,50.00%,19.25%,32.50%,8.00%,20.50%,16.00%,N/A,97.56%,35.16%,OpenAI,Proprietary -24,52.50%,Gemma-2-9b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,84.38%,74.50%,92.00%,88.00%,83.00%,85.18%,84.21%,94.00%,90.00%,72.50%,69.21%,73.64%,73.58%,56.25%,58.33%,3.75%,6.00%,2.50%,4.00%,2.50%,N/A,87.80%,72.45%,Google,gemma-terms-of-use -25,52.11%,Claude-3-Opus-20240229 (FC tools-2024-04-04),https://www.anthropic.com/news/claude-3-family,77.36,11.56,8.71,19.82,58.67%,68.67%,89.00%,41.00%,36.00%,62.05%,88.71%,88.00%,44.00%,27.50%,74.10%,74.81%,75.60%,50.00%,41.67%,19.50%,30.00%,7.00%,18.00%,23.00%,N/A,63.41%,77.80%,Anthropic,Proprietary -26,51.81%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,7.11,1.84,5.15,3.83,86.92%,77.67%,92.50%,90.00%,87.50%,88.23%,91.43%,96.00%,88.00%,77.50%,68.46%,72.87%,61.33%,81.25%,66.67%,0.88%,1.50%,1.00%,0.50%,0.50%,N/A,65.85%,74.85%,Mistral AI,Proprietary -27,51.64%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,0.66,1.37,2.45,2.72,81.21%,63.33%,92.00%,86.50%,83.00%,77.04%,55.64%,90.00%,90.00%,72.50%,62.37%,71.71%,67.79%,62.50%,66.67%,14.25%,19.00%,8.50%,15.00%,14.50%,N/A,60.98%,62.40%,Mistral AI,Proprietary -28,51.59%,Hammer2.0-1.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-1.5b,N/A,N/A,N/A,N/A,84.06%,75.25%,90.50%,88.00%,82.50%,88.95%,93.29%,92.00%,88.00%,82.50%,63.22%,70.54%,68.56%,56.25%,66.67%,7.13%,7.50%,4.00%,7.50%,9.50%,N/A,92.68%,60.64%,MadeAgents,cc-by-nc-4.0 -29,51.50%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,87.17%,75.17%,95.50%,90.50%,87.50%,89.21%,95.86%,96.00%,80.00%,85.00%,66.15%,78.68%,79.65%,68.75%,66.67%,3.25%,2.50%,3.00%,4.50%,3.00%,N/A,92.68%,52.78%,Meta,Meta Llama 3 Community -30,51.21%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,N/A,N/A,N/A,82.33%,72.83%,91.50%,84.50%,80.50%,85.91%,85.64%,92.00%,86.00%,80.00%,57.49%,65.12%,55.35%,43.75%,54.17%,11.50%,13.00%,10.50%,13.50%,9.00%,N/A,95.12%,72.55%,IBM,Apache-2.0 -31,51.00%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,5.79,3.55,15.18,7.87,74.02%,70.58%,91.50%,65.50%,68.50%,81.73%,95.43%,92.00%,72.00%,67.50%,73.10%,68.60%,73.00%,81.25%,50.00%,0.62%,1.00%,0.00%,0.50%,1.00%,N/A,60.98%,95.21%,Mistral AI,Proprietary -32,50.81%,Command-R-Plus (Prompt) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,8.0,1.2,1.02,2.74,80.90%,71.08%,91.50%,82.00%,79.00%,85.07%,93.29%,90.00%,82.00%,75.00%,69.75%,66.67%,70.30%,68.75%,70.83%,0.50%,1.00%,0.00%,0.50%,0.50%,N/A,73.17%,72.83%,Cohere For AI,cc-by-nc-4.0 -33,50.75%,Gorilla-OpenFunctions-v2 (FC),https://gorilla.cs.berkeley.edu/blogs/7_open_functions_v2.html,N/A,6.97,9.8,19.66,86.29%,77.67%,95.00%,89.00%,83.50%,86.09%,95.86%,96.00%,80.00%,72.50%,67.44%,73.64%,58.73%,68.75%,41.67%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,73.17%,75.05%,Gorilla LLM,Apache 2.0 -34,50.15%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,83.62%,73.00%,94.50%,83.50%,83.50%,87.29%,85.64%,96.00%,90.00%,77.50%,57.93%,71.32%,72.23%,50.00%,45.83%,11.38%,14.00%,7.00%,10.50%,14.00%,N/A,78.05%,41.62%,Meta,Meta Llama 3 Community -35,49.02%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,N/A,N/A,N/A,68.85%,68.42%,88.00%,63.50%,55.50%,78.43%,87.71%,88.00%,68.00%,70.00%,69.12%,68.22%,76.76%,62.50%,54.17%,4.00%,9.00%,2.00%,4.00%,1.00%,N/A,87.80%,68.12%,Salesforce,cc-by-nc-4.0 -36,48.82%,Qwen2.5-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct,N/A,N/A,N/A,N/A,75.19%,70.25%,85.50%,73.50%,71.50%,82.82%,72.79%,94.00%,82.00%,82.50%,61.71%,64.73%,59.88%,50.00%,41.67%,6.62%,7.50%,6.00%,6.00%,7.00%,N/A,75.61%,67.17%,Qwen,apache-2.0 -37,47.30%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.05,0.85,1.98,1.33,65.04%,62.67%,83.00%,65.50%,49.00%,67.68%,46.21%,90.00%,72.00%,62.50%,67.48%,63.57%,64.61%,68.75%,54.17%,6.62%,8.00%,1.50%,9.00%,8.00%,N/A,80.49%,75.47%,OpenAI,Proprietary -38,47.02%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,N/A,N/A,N/A,76.54%,64.17%,89.50%,79.50%,73.00%,75.48%,69.93%,94.00%,78.00%,60.00%,61.79%,67.44%,64.42%,56.25%,45.83%,5.12%,5.50%,1.50%,8.00%,5.50%,N/A,56.10%,58.50%,NousResearch,apache-2.0 -39,47.00%,Command-R-Plus (FC) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,3.2,3.65,8.78,8.31,76.83%,66.33%,90.00%,82.00%,69.00%,78.61%,88.93%,88.00%,80.00%,57.50%,57.26%,66.67%,60.56%,56.25%,50.00%,8.25%,10.00%,0.00%,10.00%,13.00%,N/A,92.68%,53.32%,Cohere For AI,cc-by-nc-4.0 -40,46.96%,Qwen2-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-7B-Instruct,N/A,N/A,N/A,N/A,74.85%,67.42%,87.50%,71.00%,73.50%,81.70%,86.79%,88.00%,82.00%,70.00%,54.24%,59.30%,62.20%,50.00%,66.67%,11.12%,9.00%,11.00%,13.00%,11.50%,N/A,87.80%,47.41%,Qwen,apache-2.0 -41,46.52%,Hermes-2-Pro-Llama-3-70B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-70B,N/A,N/A,N/A,N/A,78.85%,59.92%,80.00%,88.00%,87.50%,80.45%,76.29%,82.00%,86.00%,77.50%,60.51%,63.18%,53.04%,56.25%,66.67%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,60.98%,70.27%,NousResearch,apache-2.0 -42,45.30%,Mistral-small-2402 (FC),https://docs.mistral.ai/guides/model-selection/,2.03,1.59,2.9,2.78,57.27%,67.08%,93.50%,20.00%,48.50%,53.77%,87.07%,92.00%,16.00%,20.00%,70.19%,63.57%,71.46%,12.50%,12.50%,7.00%,10.00%,0.00%,7.50%,10.50%,N/A,82.93%,78.43%,Mistral AI,Proprietary -43,44.62%,Hermes-2-Pro-Mistral-7B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B,N/A,N/A,N/A,N/A,72.83%,61.33%,87.50%,78.50%,64.00%,77.30%,61.71%,94.00%,86.00%,67.50%,56.46%,64.73%,59.40%,43.75%,37.50%,7.62%,8.00%,6.00%,10.50%,6.00%,N/A,75.61%,38.95%,NousResearch,apache-2.0 -44,44.35%,Gemini-1.5-Pro-001 (FC),https://deepmind.google/technologies/gemini/pro/,2.05,1.6,1.81,3.53,31.77%,35.58%,39.50%,26.50%,25.50%,70.39%,75.07%,80.00%,74.00%,52.50%,65.53%,57.75%,58.24%,75.00%,41.67%,12.62%,17.50%,3.00%,13.50%,16.50%,N/A,63.41%,81.22%,Google,Proprietary -45,44.34%,Open-Mixtral-8x22b (FC),https://mistral.ai/news/mixtral-8x22b/,4.39,2.39,10.76,5.89,61.08%,71.33%,94.50%,10.50%,68.00%,63.82%,84.29%,94.00%,22.00%,55.00%,66.86%,73.26%,72.32%,6.25%,41.67%,7.50%,10.50%,1.00%,8.50%,10.00%,N/A,82.93%,43.94%,Mistral AI,Proprietary -46,44.18%,Gemini-1.5-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,2.01,1.56,3.72,2.59,38.27%,54.08%,39.50%,29.50%,30.00%,69.54%,69.64%,80.00%,76.00%,52.50%,64.59%,58.91%,61.33%,81.25%,58.33%,11.25%,20.00%,0.50%,12.00%,12.50%,N/A,70.73%,74.44%,Google,Proprietary -47,44.03%,DBRX-Instruct (Prompt),https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm,N/A,N/A,N/A,N/A,67.04%,72.17%,91.50%,56.50%,48.00%,75.04%,90.14%,88.00%,62.00%,60.00%,62.33%,74.81%,71.65%,75.00%,58.33%,0.88%,2.50%,0.00%,0.00%,1.00%,N/A,87.80%,48.98%,Databricks,Databricks Open Model -48,43.98%,Open-Mixtral-8x7b (Prompt),https://mistral.ai/news/mixtral-of-experts/,1.33,1.97,7.09,3.04,57.94%,68.75%,86.00%,40.00%,37.00%,65.91%,71.14%,88.00%,52.00%,52.50%,64.95%,57.36%,65.00%,68.75%,50.00%,2.50%,4.50%,0.50%,2.50%,2.50%,N/A,68.29%,76.16%,Mistral AI,Proprietary -49,43.88%,GPT-4o-2024-08-06 (Prompt),https://openai.com/index/hello-gpt-4o/,5.06,0.83,1.46,1.51,49.35%,32.42%,48.00%,74.00%,43.00%,69.93%,49.71%,82.00%,78.00%,70.00%,62.19%,42.64%,42.82%,25.00%,41.67%,5.88%,9.50%,1.00%,6.50%,6.50%,N/A,36.59%,94.19%,OpenAI,Proprietary -50,43.71%,Open-Mistral-Nemo-2407 (Prompt),https://mistral.ai/news/mistral-nemo/,1.25,1.15,0.99,2.36,85.60%,77.42%,93.50%,87.00%,84.50%,91.23%,95.93%,96.00%,88.00%,85.00%,50.33%,78.29%,74.54%,75.00%,62.50%,1.12%,0.50%,0.50%,1.50%,2.00%,N/A,90.24%,10.16%,Mistral AI,Proprietary -51,43.69%,Gemini-1.5-Flash-001 (FC),https://deepmind.google/technologies/gemini/flash/,0.13,0.48,0.64,0.73,33.56%,47.25%,40.00%,22.50%,24.50%,62.41%,53.14%,76.00%,68.00%,52.50%,64.90%,58.14%,60.46%,43.75%,41.67%,14.75%,21.00%,5.00%,16.50%,16.50%,N/A,58.54%,76.12%,Google,Proprietary -52,42.98%,Llama-3.2-3B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,77.77%,64.08%,90.00%,80.50%,76.50%,69.41%,78.14%,92.00%,50.00%,57.50%,50.91%,47.67%,44.74%,0.00%,29.17%,4.00%,3.50%,1.50%,7.00%,4.00%,N/A,63.41%,68.81%,Meta,Meta Llama 3 Community -53,42.87%,Gemini-1.5-Flash-002 (FC),https://deepmind.google/technologies/gemini/flash/,0.12,0.62,1.3,0.97,35.42%,49.67%,39.00%,24.00%,29.00%,60.84%,60.86%,80.00%,50.00%,52.50%,67.35%,58.14%,57.96%,68.75%,50.00%,8.13%,14.00%,0.50%,8.50%,9.50%,N/A,60.98%,87.64%,Google,Proprietary -54,40.92%,MiniCPM3-4B (FC),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,N/A,N/A,N/A,63.19%,67.75%,74.00%,60.50%,50.50%,48.70%,44.79%,50.00%,40.00%,60.00%,59.88%,56.98%,49.47%,56.25%,33.33%,5.00%,7.50%,4.00%,5.00%,3.50%,N/A,58.54%,73.64%,openbmb,Apache-2.0 -55,40.69%,Claude-3-Haiku-20240307 (FC tools-2024-04-04),https://www.anthropic.com/news/claude-3-family,1.11,1.52,1.35,2.33,41.67%,70.67%,93.50%,2.00%,0.50%,47.52%,92.07%,92.00%,6.00%,0.00%,57.66%,74.03%,77.15%,0.00%,4.17%,21.62%,30.50%,8.50%,24.50%,23.00%,N/A,97.56%,29.37%,Anthropic,Proprietary -56,40.44%,xLAM-7b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-7b-fc-r,N/A,N/A,N/A,N/A,74.56%,74.25%,92.00%,78.00%,54.00%,65.75%,84.50%,90.00%,66.00%,22.50%,53.44%,75.58%,57.28%,43.75%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,70.73%,46.05%,Salesforce,cc-by-nc-4.0 -57,40.37%,Hammer2.0-0.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-0.5b,N/A,N/A,N/A,N/A,66.79%,62.17%,80.00%,67.50%,57.50%,70.43%,53.21%,86.00%,80.00%,62.50%,52.42%,48.84%,44.07%,62.50%,41.67%,0.25%,0.00%,0.00%,0.50%,0.50%,N/A,85.37%,64.51%,MadeAgents,cc-by-nc-4.0 -58,40.24%,Claude-3-Haiku-20240307 (Prompt),https://www.anthropic.com/news/claude-3-family,0.59,1.09,0.66,2.03,58.21%,76.83%,93.50%,38.00%,24.50%,57.93%,89.71%,96.00%,26.00%,20.00%,65.04%,77.13%,74.64%,68.75%,45.83%,1.75%,2.50%,0.50%,1.00%,3.00%,N/A,82.93%,35.27%,Anthropic,Proprietary -59,38.68%,Claude-3.5-Sonnet-20240620 (Prompt),https://www.anthropic.com/news/claude-3-5-sonnet,7.25,2.76,1.72,6.41,60.58%,50.33%,88.00%,43.50%,60.50%,54.20%,66.79%,52.00%,38.00%,60.00%,54.24%,65.12%,22.66%,37.50%,33.33%,0.50%,1.00%,0.00%,0.00%,1.00%,N/A,19.51%,91.74%,Anthropic,Proprietary -60,37.57%,mistral-large-2407 (Prompt),https://mistral.ai/news/mistral-large-2407/,15.33,3.26,6.9,8.42,62.27%,46.58%,77.00%,70.00%,55.50%,56.93%,40.21%,80.00%,70.00%,37.50%,53.35%,45.74%,73.10%,68.75%,54.17%,4.75%,5.50%,4.00%,5.50%,4.00%,N/A,90.24%,22.38%,Mistral AI,Proprietary -61,37.38%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,0.74,1.1,2.45,2.66,26.21%,48.83%,39.00%,7.50%,9.50%,58.11%,76.43%,76.00%,60.00%,20.00%,58.91%,58.91%,56.12%,37.50%,20.83%,7.38%,9.00%,2.00%,9.50%,9.00%,N/A,68.29%,69.31%,Google,Proprietary -62,37.07%,Meta-Llama-3-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,61.02%,63.08%,85.50%,51.50%,44.00%,66.70%,83.29%,82.00%,44.00%,57.50%,50.51%,60.85%,60.75%,37.50%,20.83%,1.75%,3.00%,0.00%,2.50%,1.50%,N/A,75.61%,27.39%,Meta,Meta Llama 3 Community -63,36.94%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,1.14,1.18,2.49,2.0,56.29%,42.17%,51.00%,68.50%,63.50%,62.39%,48.57%,76.00%,70.00%,55.00%,45.67%,41.47%,36.93%,68.75%,33.33%,6.25%,5.00%,4.50%,8.00%,7.50%,N/A,80.49%,55.48%,Google,Proprietary -64,36.61%,Nexusflow-Raven-v2 (FC),https://huggingface.co/Nexusflow/NexusRaven-V2-13B,N/A,2.39,1.63,5.86,46.15%,57.58%,53.00%,34.50%,39.50%,57.86%,47.43%,86.00%,38.00%,60.00%,53.49%,39.92%,38.48%,56.25%,41.67%,1.13%,1.50%,0.50%,1.50%,1.00%,N/A,65.85%,77.90%,Nexusflow,Apache 2.0 -65,32.08%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,N/A,N/A,N/A,59.73%,55.92%,80.00%,55.50%,47.50%,58.52%,51.07%,82.00%,56.00%,45.00%,39.00%,50.39%,40.50%,25.00%,20.83%,3.25%,4.50%,1.50%,4.00%,3.00%,N/A,75.61%,22.92%,Qwen,apache-2.0 -66,28.66%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,25.15%,49.58%,24.50%,11.50%,15.00%,31.23%,53.43%,34.00%,30.00%,7.50%,44.47%,48.45%,52.56%,31.25%,25.00%,10.25%,8.00%,11.00%,11.00%,11.00%,N/A,100.00%,43.86%,Meta,Meta Llama 3 Community -67,28.40%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,36.52%,56.08%,55.00%,0.00%,35.00%,49.93%,58.21%,58.00%,56.00%,27.50%,33.23%,48.06%,47.64%,31.25%,37.50%,13.37%,11.00%,8.50%,16.50%,17.50%,N/A,92.68%,5.29%,Meta,Meta Llama 3 Community -68,25.70%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,2.37,1.34,0.84,2.38,16.33%,10.83%,36.50%,11.50%,6.50%,9.38%,13.00%,18.00%,4.00%,2.50%,53.98%,18.22%,45.90%,12.50%,8.33%,2.12%,2.00%,1.50%,2.00%,3.00%,N/A,41.46%,81.47%,Mistral AI,Proprietary -69,25.14%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,N/A,N/A,N/A,39.94%,71.25%,85.50%,1.50%,1.50%,40.23%,74.93%,86.00%,0.00%,0.00%,38.34%,63.18%,54.19%,0.00%,0.00%,1.12%,1.00%,0.50%,1.50%,1.50%,N/A,97.56%,7.06%,Salesforce,cc-by-nc-4.0 -70,20.21%,Gemma-2-2b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,12.19%,7.25%,41.50%,0.00%,0.00%,12.88%,5.50%,46.00%,0.00%,0.00%,41.63%,11.24%,11.96%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,12.20%,79.93%,Google,gemma-terms-of-use -71,17.93%,Llama-3.2-1B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,22.77%,25.08%,32.00%,24.00%,10.00%,19.11%,27.93%,18.00%,28.00%,2.50%,29.85%,25.97%,4.82%,6.25%,4.17%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,48.78%,54.42%,Meta,Meta Llama 3 Community \ No newline at end of file +1,68.94%,GPT-4o-2024-08-06 (FC),https://openai.com/index/hello-gpt-4o/,5.98,1.33,3.67,3.12,85.90%,74.58%,92.50%,92.00%,84.50%,85.64%,87.07%,92.00%,86.00%,77.50%,75.43%,74.42%,75.12%,81.25%,70.83%,45.25%,54.50%,44.00%,34.50%,48.00%,N/A,63.41%,82.93%,OpenAI,Proprietary +2,66.68%,GPT-4-turbo-2024-04-09 (FC),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,23.88,2.36,4.77,5.14,84.67%,69.17%,91.00%,90.50%,88.00%,84.32%,88.29%,88.00%,86.00%,75.00%,76.23%,77.52%,77.63%,81.25%,66.67%,39.25%,54.50%,32.50%,29.50%,40.50%,N/A,73.17%,79.76%,OpenAI,Proprietary +3,65.61%,o1-preview-2024-09-12 (Prompt),https://openai.com/index/introducing-openai-o1-preview/,164.48,18.78,10.9,37.28,86.42%,78.17%,93.00%,89.50%,85.00%,88.88%,99.50%,92.00%,84.00%,80.00%,73.08%,80.62%,76.76%,75.00%,79.17%,36.62%,43.00%,38.50%,32.50%,32.50%,N/A,73.17%,74.60%,OpenAI,Proprietary +4,64.24%,o1-mini-2024-09-12 (Prompt),https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/,23.11,6.81,6.64,14.67,81.31%,73.75%,90.00%,81.00%,80.50%,84.00%,88.50%,92.00%,78.00%,77.50%,75.39%,73.26%,71.07%,75.00%,62.50%,33.50%,40.50%,32.50%,26.50%,34.50%,N/A,48.78%,88.04%,OpenAI,Proprietary +5,61.53%,GPT-4-turbo-2024-04-09 (Prompt),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,36.7,1.31,2.66,2.46,91.46%,82.33%,95.00%,95.00%,93.50%,90.00%,99.50%,98.00%,80.00%,82.50%,69.04%,85.66%,84.57%,87.50%,75.00%,26.75%,36.50%,24.00%,17.00%,29.50%,N/A,82.93%,58.95%,OpenAI,Proprietary +6,61.29%,Claude-3.5-Sonnet-20240620 (FC),https://www.anthropic.com/news/claude-3-5-sonnet,7.68,4.02,4.69,6.96,70.04%,75.17%,93.50%,64.50%,47.00%,66.27%,97.57%,90.00%,40.00%,37.50%,74.68%,80.23%,76.76%,56.25%,58.33%,40.00%,46.00%,39.00%,35.00%,40.00%,N/A,68.29%,74.58%,Anthropic,Proprietary +7,60.72%,GPT-4o-mini-2024-07-18 (FC),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.37,1.41,11.92,2.63,84.25%,73.50%,90.50%,90.00%,83.00%,84.12%,83.50%,92.00%,86.00%,75.00%,70.19%,72.87%,74.45%,87.50%,70.83%,28.25%,40.50%,15.50%,24.00%,33.00%,N/A,80.49%,71.77%,OpenAI,Proprietary +8,59.94%,Functionary-Medium-v3.1 (FC),https://huggingface.co/meetkai/functionary-medium-v3.1,N/A,38.24,130.95,76.6,89.52%,76.08%,96.50%,95.00%,90.50%,89.77%,97.57%,94.00%,90.00%,77.50%,73.48%,79.46%,81.87%,68.75%,70.83%,17.25%,28.50%,12.50%,23.50%,4.50%,N/A,70.73%,73.32%,MeetKai,MIT +9,59.55%,Gemini-1.5-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,3.85,1.57,3.91,2.44,88.96%,79.83%,94.00%,93.00%,89.00%,91.77%,98.57%,96.00%,90.00%,82.50%,74.41%,77.52%,76.76%,87.50%,75.00%,14.62%,16.00%,18.00%,13.50%,11.00%,N/A,65.85%,77.30%,Google,Proprietary +10,59.27%,GPT-4o-mini-2024-07-18 (Prompt),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.53,1.01,6.36,1.77,86.23%,79.42%,93.00%,86.50%,86.00%,91.12%,100.00%,96.00%,86.00%,82.50%,74.63%,79.46%,74.35%,93.75%,70.83%,14.50%,20.00%,11.50%,10.00%,16.50%,N/A,75.61%,81.00%,OpenAI,Proprietary +11,58.95%,xLAM-8x22b-r (FC),https://huggingface.co/Salesforce/xLAM-8x22b-r,N/A,N/A,N/A,N/A,88.15%,81.08%,93.00%,91.50%,87.00%,90.11%,96.43%,96.00%,88.00%,80.00%,71.97%,78.29%,80.14%,75.00%,62.50%,17.38%,25.50%,20.50%,15.00%,8.50%,N/A,85.37%,67.29%,Salesforce,cc-by-nc-4.0 +12,58.67%,Gemini-1.5-Flash-002 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.24,0.73,0.81,1.07,86.58%,75.33%,91.50%,91.50%,88.00%,89.48%,95.93%,96.00%,86.00%,80.00%,76.28%,77.91%,78.30%,93.75%,66.67%,12.12%,14.00%,17.50%,10.50%,6.50%,N/A,85.37%,78.54%,Google,Proprietary +13,57.43%,Gemini-1.5-Pro-001 (Prompt),https://deepmind.google/technologies/gemini/pro/,3.83,1.52,1.8,2.8,83.88%,73.00%,91.50%,88.00%,83.00%,87.52%,91.57%,94.00%,82.00%,82.50%,73.12%,67.44%,69.24%,93.75%,66.67%,13.00%,15.00%,14.50%,13.50%,9.00%,N/A,56.10%,85.00%,Google,Proprietary +14,57.27%,ToolACE-8B (FC),https://huggingface.co/Team-ACE/ToolACE-8B,N/A,N/A,N/A,N/A,87.06%,76.25%,93.00%,90.00%,89.00%,89.52%,98.57%,94.00%,88.00%,77.50%,74.99%,66.67%,74.93%,81.25%,70.83%,7.88%,8.50%,10.50%,5.50%,7.00%,N/A,80.49%,85.71%,Huawei Noah & USTC,Apache-2.0 +15,57.19%,Gemini-1.5-Flash-001 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.29,0.65,0.41,0.98,86.17%,73.17%,90.50%,92.00%,89.00%,87.68%,84.21%,94.00%,90.00%,82.50%,69.21%,74.42%,75.12%,93.75%,75.00%,16.62%,23.50%,17.00%,14.00%,12.00%,N/A,82.93%,67.84%,Google,Proprietary +16,55.51%,mistral-large-2407 (FC),https://mistral.ai/news/mistral-large-2407/,8.93,2.84,7.29,5.19,86.62%,73.00%,92.00%,91.50%,90.00%,84.57%,73.79%,94.00%,88.00%,82.50%,68.37%,81.78%,79.27%,68.75%,75.00%,16.75%,23.00%,12.50%,15.50%,16.00%,N/A,75.61%,49.44%,Mistral AI,Proprietary +17,54.98%,Claude-3-Opus-20240229 (FC tools-2024-04-04),https://www.anthropic.com/news/claude-3-family,28.03,10.99,7.8,18.69,58.67%,68.67%,89.00%,41.00%,36.00%,62.05%,88.71%,88.00%,44.00%,27.50%,74.10%,74.81%,75.60%,50.00%,41.67%,28.12%,30.00%,29.50%,28.00%,25.00%,N/A,63.41%,77.80%,Anthropic,Proprietary +18,54.65%,Hammer2.0-7b (FC),https://huggingface.co/MadeAgents/Hammer2.0-7b,N/A,N/A,N/A,N/A,90.27%,80.58%,95.00%,93.50%,92.00%,89.25%,90.00%,94.00%,88.00%,85.00%,69.79%,74.42%,77.15%,81.25%,75.00%,5.62%,9.50%,2.00%,7.50%,3.50%,N/A,95.12%,68.46%,MadeAgents,cc-by-nc-4.0 +19,54.59%,Functionary-Small-v3.1 (FC),https://huggingface.co/meetkai/functionary-small-v3.1,N/A,1.68,1.97,4.29,86.42%,74.67%,94.50%,89.50%,87.00%,85.95%,88.79%,92.00%,88.00%,75.00%,70.41%,75.19%,75.89%,81.25%,62.50%,8.38%,15.50%,0.50%,12.50%,5.00%,N/A,85.37%,68.62%,MeetKai,MIT +20,54.29%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,88.90%,76.58%,95.50%,93.50%,90.00%,89.34%,91.36%,96.00%,90.00%,80.00%,61.13%,77.13%,71.46%,87.50%,62.50%,14.25%,18.50%,15.50%,10.00%,13.00%,N/A,92.68%,58.38%,Meta,Meta Llama 3 Community +21,53.42%,GoGoAgent,https://gogoagent.ai,N/A,56.08,38.06,117.98,86.00%,75.50%,92.50%,92.00%,84.00%,88.05%,94.71%,94.00%,86.00%,77.50%,72.46%,71.32%,72.42%,87.50%,62.50%,0.25%,0.50%,0.50%,0.00%,0.00%,N/A,87.80%,81.89%,BitAgent,Proprietary +22,53.00%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,88.52%,81.08%,92.50%,91.00%,89.50%,87.89%,83.57%,96.00%,92.00%,80.00%,69.48%,79.46%,77.24%,68.75%,62.50%,2.12%,3.50%,2.00%,1.50%,1.50%,N/A,87.80%,68.76%,Google,gemma-terms-of-use +23,52.64%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,N/A,N/A,N/A,85.79%,75.67%,96.00%,88.50%,83.00%,88.13%,94.50%,92.00%,86.00%,80.00%,65.97%,72.48%,72.32%,62.50%,66.67%,6.38%,8.00%,7.50%,6.00%,4.00%,N/A,92.68%,64.95%,Qwen,apache-2.0 +24,52.31%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,N/A,N/A,N/A,68.85%,68.42%,88.00%,63.50%,55.50%,78.43%,87.71%,88.00%,68.00%,70.00%,69.12%,68.22%,76.76%,62.50%,54.17%,13.88%,18.50%,14.00%,12.50%,10.50%,N/A,87.80%,68.12%,Salesforce,cc-by-nc-4.0 +25,52.25%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,87.17%,75.17%,95.50%,90.50%,87.50%,89.21%,95.86%,96.00%,80.00%,85.00%,66.15%,78.68%,79.65%,68.75%,66.67%,5.50%,9.50%,4.50%,5.50%,2.50%,N/A,92.68%,52.78%,Meta,Meta Llama 3 Community +26,52.20%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.03,1.02,1.33,1.76,84.12%,75.50%,93.00%,88.00%,80.00%,84.11%,95.43%,90.00%,86.00%,65.00%,61.22%,74.42%,77.82%,43.75%,50.00%,16.88%,28.00%,13.00%,17.00%,9.50%,N/A,97.56%,35.16%,OpenAI,Proprietary +27,51.87%,xLAM-7b-r (FC),https://huggingface.co/Salesforce/xLAM-7b-r,N/A,N/A,N/A,N/A,81.40%,73.08%,93.50%,79.50%,79.50%,83.46%,76.86%,92.00%,90.00%,75.00%,67.88%,71.32%,73.48%,31.25%,58.33%,6.88%,11.50%,7.00%,6.00%,3.00%,N/A,97.56%,64.05%,Salesforce,cc-by-nc-4.0 +28,51.68%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,8.7,2.93,13.22,4.14,86.92%,77.67%,92.50%,90.00%,87.50%,88.23%,91.43%,96.00%,88.00%,77.50%,68.46%,72.87%,61.33%,81.25%,66.67%,0.50%,0.50%,0.50%,0.00%,1.00%,N/A,65.85%,74.85%,Mistral AI,Proprietary +29,51.50%,Gemma-2-9b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,84.38%,74.50%,92.00%,88.00%,83.00%,85.18%,84.21%,94.00%,90.00%,72.50%,69.21%,73.64%,73.58%,56.25%,58.33%,0.75%,1.00%,2.00%,0.00%,0.00%,N/A,87.80%,72.45%,Google,gemma-terms-of-use +30,50.96%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,7.27,3.45,13.59,7.45,74.02%,70.58%,91.50%,65.50%,68.50%,81.73%,95.43%,92.00%,72.00%,67.50%,73.10%,68.60%,73.00%,81.25%,50.00%,0.50%,1.50%,0.00%,0.00%,0.50%,N/A,60.98%,95.21%,Mistral AI,Proprietary +31,50.77%,Command-R-Plus (Prompt) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,11.8,1.11,0.71,2.07,80.90%,71.08%,91.50%,82.00%,79.00%,85.07%,93.29%,90.00%,82.00%,75.00%,69.75%,66.67%,70.30%,68.75%,70.83%,0.38%,1.00%,0.00%,0.00%,0.50%,N/A,73.17%,72.83%,Cohere For AI,cc-by-nc-4.0 +32,50.75%,Gorilla-OpenFunctions-v2 (FC),https://gorilla.cs.berkeley.edu/blogs/7_open_functions_v2.html,N/A,6.96,9.79,19.61,86.29%,77.67%,95.00%,89.00%,83.50%,86.09%,95.86%,96.00%,80.00%,72.50%,67.44%,73.64%,58.73%,68.75%,41.67%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,73.17%,75.05%,Gorilla LLM,Apache 2.0 +33,49.78%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,83.62%,73.00%,94.50%,83.50%,83.50%,87.29%,85.64%,96.00%,90.00%,77.50%,57.93%,71.32%,72.23%,50.00%,45.83%,10.25%,14.00%,10.00%,7.50%,9.50%,N/A,78.05%,41.62%,Meta,Meta Llama 3 Community +34,49.68%,Hammer2.0-1.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-1.5b,N/A,N/A,N/A,N/A,84.06%,75.25%,90.50%,88.00%,82.50%,88.95%,93.29%,92.00%,88.00%,82.50%,63.22%,70.54%,68.56%,56.25%,66.67%,1.38%,2.50%,0.50%,1.00%,1.50%,N/A,92.68%,60.64%,MadeAgents,cc-by-nc-4.0 +35,49.56%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,0.81,1.28,6.09,2.44,81.21%,63.33%,92.00%,86.50%,83.00%,77.04%,55.64%,90.00%,90.00%,72.50%,62.37%,71.71%,67.79%,62.50%,66.67%,8.00%,12.00%,5.00%,10.50%,4.50%,N/A,60.98%,62.40%,Mistral AI,Proprietary +36,48.29%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,N/A,N/A,N/A,82.33%,72.83%,91.50%,84.50%,80.50%,85.91%,85.64%,92.00%,86.00%,80.00%,57.49%,65.12%,55.35%,43.75%,54.17%,2.75%,5.00%,1.50%,3.00%,1.50%,N/A,95.12%,72.55%,IBM,Apache-2.0 +37,47.80%,GPT-4o-2024-08-06 (Prompt),https://openai.com/index/hello-gpt-4o/,7.37,0.99,2.11,2.16,49.35%,32.42%,48.00%,74.00%,43.00%,69.93%,49.71%,82.00%,78.00%,70.00%,62.19%,42.64%,42.82%,25.00%,41.67%,17.62%,21.50%,14.00%,15.00%,20.00%,N/A,36.59%,94.19%,OpenAI,Proprietary +38,47.12%,Qwen2.5-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct,N/A,N/A,N/A,N/A,75.19%,70.25%,85.50%,73.50%,71.50%,82.82%,72.79%,94.00%,82.00%,82.50%,61.71%,64.73%,59.88%,50.00%,41.67%,1.50%,2.00%,2.00%,1.00%,1.00%,N/A,75.61%,67.17%,Qwen,apache-2.0 +39,47.00%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.42,0.85,1.86,1.39,65.04%,62.67%,83.00%,65.50%,49.00%,67.68%,46.21%,90.00%,72.00%,62.50%,67.48%,63.57%,64.61%,68.75%,54.17%,5.75%,7.50%,7.00%,4.00%,4.50%,N/A,80.49%,75.47%,OpenAI,Proprietary +40,46.52%,Hermes-2-Pro-Llama-3-70B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-70B,N/A,N/A,N/A,N/A,78.85%,59.92%,80.00%,88.00%,87.50%,80.45%,76.29%,82.00%,86.00%,77.50%,60.51%,63.18%,53.04%,56.25%,66.67%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,60.98%,70.27%,NousResearch,apache-2.0 +41,45.44%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,N/A,N/A,N/A,76.54%,64.17%,89.50%,79.50%,73.00%,75.48%,69.93%,94.00%,78.00%,60.00%,61.79%,67.44%,64.42%,56.25%,45.83%,0.38%,1.00%,0.00%,0.50%,0.00%,N/A,56.10%,58.50%,NousResearch,apache-2.0 +42,44.73%,Gemini-1.5-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,2.91,1.49,2.43,2.56,38.27%,54.08%,39.50%,29.50%,30.00%,69.54%,69.64%,80.00%,76.00%,52.50%,64.59%,58.91%,61.33%,81.25%,58.33%,12.88%,16.00%,9.00%,11.00%,15.50%,N/A,70.73%,74.44%,Google,Proprietary +43,44.70%,Command-R-Plus (FC) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,4.14,2.99,5.51,6.27,76.83%,66.33%,90.00%,82.00%,69.00%,78.61%,88.93%,88.00%,80.00%,57.50%,57.26%,66.67%,60.56%,56.25%,50.00%,1.38%,1.50%,0.00%,1.50%,2.50%,N/A,92.68%,53.32%,Cohere For AI,cc-by-nc-4.0 +44,44.12%,Qwen2-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-7B-Instruct,N/A,N/A,N/A,N/A,74.85%,67.42%,87.50%,71.00%,73.50%,81.70%,86.79%,88.00%,82.00%,70.00%,54.24%,59.30%,62.20%,50.00%,66.67%,2.63%,3.50%,3.50%,1.50%,2.00%,N/A,87.80%,47.41%,Qwen,apache-2.0 +45,43.77%,Gemini-1.5-Pro-001 (FC),https://deepmind.google/technologies/gemini/pro/,3.02,1.5,1.25,2.81,31.77%,35.58%,39.50%,26.50%,25.50%,70.39%,75.07%,80.00%,74.00%,52.50%,65.53%,57.75%,58.24%,75.00%,41.67%,10.88%,13.50%,7.50%,9.00%,13.50%,N/A,63.41%,81.22%,Google,Proprietary +46,43.74%,DBRX-Instruct (Prompt),https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm,5.29,3.62,7.87,16.88,67.04%,72.17%,91.50%,56.50%,48.00%,75.04%,90.14%,88.00%,62.00%,60.00%,62.33%,74.81%,71.65%,75.00%,58.33%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,87.80%,48.98%,Databricks,Databricks Open Model +47,43.67%,Mistral-small-2402 (FC),https://docs.mistral.ai/guides/model-selection/,2.34,1.56,4.23,2.75,57.27%,67.08%,93.50%,20.00%,48.50%,53.77%,87.07%,92.00%,16.00%,20.00%,70.19%,63.57%,71.46%,12.50%,12.50%,2.12%,3.50%,0.00%,2.50%,2.50%,N/A,82.93%,78.43%,Mistral AI,Proprietary +48,43.37%,Open-Mistral-Nemo-2407 (Prompt),https://mistral.ai/news/mistral-nemo/,1.2,1.02,0.66,1.96,85.60%,77.42%,93.50%,87.00%,84.50%,91.23%,95.93%,96.00%,88.00%,85.00%,50.33%,78.29%,74.54%,75.00%,62.50%,0.12%,0.00%,0.50%,0.00%,0.00%,N/A,90.24%,10.16%,Mistral AI,Proprietary +49,43.35%,Open-Mixtral-8x7b (Prompt),https://mistral.ai/news/mixtral-of-experts/,1.79,1.51,4.79,2.93,57.94%,68.75%,86.00%,40.00%,37.00%,65.91%,71.14%,88.00%,52.00%,52.50%,64.95%,57.36%,65.00%,68.75%,50.00%,0.62%,1.50%,0.00%,0.00%,1.00%,N/A,68.29%,76.16%,Mistral AI,Proprietary +50,42.87%,Gemini-1.5-Flash-002 (FC),https://deepmind.google/technologies/gemini/flash/,0.17,0.62,1.26,0.98,35.42%,49.67%,39.00%,24.00%,29.00%,60.84%,60.86%,80.00%,50.00%,52.50%,67.35%,58.14%,57.96%,68.75%,50.00%,8.12%,11.00%,5.50%,8.00%,8.00%,N/A,60.98%,87.64%,Google,Proprietary +51,42.35%,Llama-3.2-3B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,77.77%,64.08%,90.00%,80.50%,76.50%,69.41%,78.14%,92.00%,50.00%,57.50%,50.91%,47.67%,44.74%,0.00%,29.17%,2.12%,1.50%,2.00%,2.00%,3.00%,N/A,63.41%,68.81%,Meta,Meta Llama 3 Community +52,42.17%,Hermes-2-Pro-Mistral-7B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B,N/A,N/A,N/A,N/A,72.83%,61.33%,87.50%,78.50%,64.00%,77.30%,61.71%,94.00%,86.00%,67.50%,56.46%,64.73%,59.40%,43.75%,37.50%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,75.61%,38.95%,NousResearch,apache-2.0 +53,42.05%,Open-Mixtral-8x22b (FC),https://mistral.ai/news/mixtral-8x22b/,4.84,2.83,14.59,5.36,61.08%,71.33%,94.50%,10.50%,68.00%,63.82%,84.29%,94.00%,22.00%,55.00%,66.86%,73.26%,72.32%,6.25%,41.67%,0.62%,1.00%,0.00%,1.00%,0.50%,N/A,82.93%,43.94%,Mistral AI,Proprietary +54,41.69%,Gemini-1.5-Flash-001 (FC),https://deepmind.google/technologies/gemini/flash/,0.19,0.51,0.58,0.82,33.56%,47.25%,40.00%,22.50%,24.50%,62.41%,53.14%,76.00%,68.00%,52.50%,64.90%,58.14%,60.46%,43.75%,41.67%,8.75%,10.00%,6.50%,9.50%,9.00%,N/A,58.54%,76.12%,Google,Proprietary +55,40.44%,xLAM-7b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-7b-fc-r,N/A,N/A,N/A,N/A,74.56%,74.25%,92.00%,78.00%,54.00%,65.75%,84.50%,90.00%,66.00%,22.50%,53.44%,75.58%,57.28%,43.75%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,70.73%,46.05%,Salesforce,cc-by-nc-4.0 +56,40.41%,Hammer2.0-0.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-0.5b,N/A,N/A,N/A,N/A,66.79%,62.17%,80.00%,67.50%,57.50%,70.43%,53.21%,86.00%,80.00%,62.50%,52.42%,48.84%,44.07%,62.50%,41.67%,0.38%,0.50%,0.00%,0.50%,0.50%,N/A,85.37%,64.51%,MadeAgents,cc-by-nc-4.0 +57,40.36%,Claude-3-Haiku-20240307 (FC tools-2024-04-04),https://www.anthropic.com/news/claude-3-family,0.29,1.46,1.22,2.21,41.67%,70.67%,93.50%,2.00%,0.50%,47.52%,92.07%,92.00%,6.00%,0.00%,57.66%,74.03%,77.15%,0.00%,4.17%,20.62%,27.50%,15.00%,17.50%,22.50%,N/A,97.56%,29.37%,Anthropic,Proprietary +58,39.54%,MiniCPM3-4B (FC),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,N/A,N/A,N/A,63.19%,67.75%,74.00%,60.50%,50.50%,48.70%,44.79%,50.00%,40.00%,60.00%,59.88%,56.98%,49.47%,56.25%,33.33%,0.88%,1.50%,2.00%,0.00%,0.00%,N/A,58.54%,73.64%,openbmb,Apache-2.0 +59,39.19%,mistral-large-2407 (Prompt),https://mistral.ai/news/mistral-large-2407/,15.57,2.74,7.73,5.95,62.27%,46.58%,77.00%,70.00%,55.50%,56.93%,40.21%,80.00%,70.00%,37.50%,53.35%,45.74%,73.10%,68.75%,54.17%,9.62%,14.50%,11.00%,6.00%,7.00%,N/A,90.24%,22.38%,Mistral AI,Proprietary +60,36.52%,Nexusflow-Raven-v2 (FC),https://huggingface.co/Nexusflow/NexusRaven-V2-13B,N/A,2.92,2.17,7.4,46.15%,57.58%,53.00%,34.50%,39.50%,57.86%,47.43%,86.00%,38.00%,60.00%,53.49%,39.92%,38.48%,56.25%,41.67%,0.88%,1.50%,0.50%,0.50%,1.00%,N/A,65.85%,77.90%,Nexusflow,Apache 2.0 +61,36.48%,Meta-Llama-3-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,61.02%,63.08%,85.50%,51.50%,44.00%,66.70%,83.29%,82.00%,44.00%,57.50%,50.51%,60.85%,60.75%,37.50%,20.83%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,75.61%,27.39%,Meta,Meta Llama 3 Community +62,35.71%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,1.1,1.18,2.45,3.2,26.21%,48.83%,39.00%,7.50%,9.50%,58.11%,76.43%,76.00%,60.00%,20.00%,58.91%,58.91%,56.12%,37.50%,20.83%,2.38%,2.50%,1.00%,3.00%,3.00%,N/A,68.29%,69.31%,Google,Proprietary +63,35.11%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,1.43,1.21,1.48,2.68,56.29%,42.17%,51.00%,68.50%,63.50%,62.39%,48.57%,76.00%,70.00%,55.00%,45.67%,41.47%,36.93%,68.75%,33.33%,0.75%,1.50%,1.50%,0.00%,0.00%,N/A,80.49%,55.48%,Google,Proprietary +64,31.04%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,N/A,N/A,N/A,59.73%,55.92%,80.00%,55.50%,47.50%,58.52%,51.07%,82.00%,56.00%,45.00%,39.00%,50.39%,40.50%,25.00%,20.83%,0.12%,0.00%,0.50%,0.00%,0.00%,N/A,75.61%,22.92%,Qwen,apache-2.0 +65,25.24%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,25.15%,49.58%,24.50%,11.50%,15.00%,31.23%,53.43%,34.00%,30.00%,7.50%,44.47%,48.45%,52.56%,31.25%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,100.00%,43.86%,Meta,Meta Llama 3 Community +66,25.08%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,2.55,1.28,0.58,2.34,16.33%,10.83%,36.50%,11.50%,6.50%,9.38%,13.00%,18.00%,4.00%,2.50%,53.98%,18.22%,45.90%,12.50%,8.33%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,41.46%,81.47%,Mistral AI,Proprietary +67,24.81%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,N/A,N/A,N/A,39.94%,71.25%,85.50%,1.50%,1.50%,40.23%,74.93%,86.00%,0.00%,0.00%,38.34%,63.18%,54.19%,0.00%,0.00%,0.12%,0.00%,0.00%,0.00%,0.50%,N/A,97.56%,7.06%,Salesforce,cc-by-nc-4.0 +68,23.95%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,36.52%,56.08%,55.00%,0.00%,35.00%,49.93%,58.21%,58.00%,56.00%,27.50%,33.23%,48.06%,47.64%,31.25%,37.50%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,92.68%,5.29%,Meta,Meta Llama 3 Community +69,20.21%,Gemma-2-2b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,12.19%,7.25%,41.50%,0.00%,0.00%,12.88%,5.50%,46.00%,0.00%,0.00%,41.63%,11.24%,11.96%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,12.20%,79.93%,Google,gemma-terms-of-use +70,17.93%,Llama-3.2-1B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,22.77%,25.08%,32.00%,24.00%,10.00%,19.11%,27.93%,18.00%,28.00%,2.50%,29.85%,25.97%,4.82%,6.25%,4.17%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,48.78%,54.42%,Meta,Meta Llama 3 Community \ No newline at end of file From a99dc64821a2a49d37afe696d497b8a1a5482293 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Thu, 14 Nov 2024 19:25:28 -0800 Subject: [PATCH 3/6] update data.csv --- data_overall.csv | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data_overall.csv b/data_overall.csv index 762584dd5..3b7910111 100644 --- a/data_overall.csv +++ b/data_overall.csv @@ -63,7 +63,7 @@ Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s 62,35.71%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,1.1,1.18,2.45,3.2,26.21%,48.83%,39.00%,7.50%,9.50%,58.11%,76.43%,76.00%,60.00%,20.00%,58.91%,58.91%,56.12%,37.50%,20.83%,2.38%,2.50%,1.00%,3.00%,3.00%,N/A,68.29%,69.31%,Google,Proprietary 63,35.11%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,1.43,1.21,1.48,2.68,56.29%,42.17%,51.00%,68.50%,63.50%,62.39%,48.57%,76.00%,70.00%,55.00%,45.67%,41.47%,36.93%,68.75%,33.33%,0.75%,1.50%,1.50%,0.00%,0.00%,N/A,80.49%,55.48%,Google,Proprietary 64,31.04%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,N/A,N/A,N/A,59.73%,55.92%,80.00%,55.50%,47.50%,58.52%,51.07%,82.00%,56.00%,45.00%,39.00%,50.39%,40.50%,25.00%,20.83%,0.12%,0.00%,0.50%,0.00%,0.00%,N/A,75.61%,22.92%,Qwen,apache-2.0 -65,25.24%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,25.15%,49.58%,24.50%,11.50%,15.00%,31.23%,53.43%,34.00%,30.00%,7.50%,44.47%,48.45%,52.56%,31.25%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,100.00%,43.86%,Meta,Meta Llama 3 Community +65,26.16%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,10.55,35.79,24.18,25.15%,49.58%,24.50%,11.50%,15.00%,31.23%,53.43%,34.00%,30.00%,7.50%,44.47%,48.45%,52.56%,31.25%,25.00%,2.75%,4.50%,2.00%,2.00%,2.50%,N/A,100.00%,43.86%,Meta,Meta Llama 3 Community 66,25.08%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,2.55,1.28,0.58,2.34,16.33%,10.83%,36.50%,11.50%,6.50%,9.38%,13.00%,18.00%,4.00%,2.50%,53.98%,18.22%,45.90%,12.50%,8.33%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,41.46%,81.47%,Mistral AI,Proprietary 67,24.81%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,N/A,N/A,N/A,39.94%,71.25%,85.50%,1.50%,1.50%,40.23%,74.93%,86.00%,0.00%,0.00%,38.34%,63.18%,54.19%,0.00%,0.00%,0.12%,0.00%,0.00%,0.00%,0.50%,N/A,97.56%,7.06%,Salesforce,cc-by-nc-4.0 68,23.95%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,36.52%,56.08%,55.00%,0.00%,35.00%,49.93%,58.21%,58.00%,56.00%,27.50%,33.23%,48.06%,47.64%,31.25%,37.50%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,92.68%,5.29%,Meta,Meta Llama 3 Community From d9b1ca0c2d0c4caf0f4a98c91a54077daa09d958 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Sun, 17 Nov 2024 00:32:55 -0800 Subject: [PATCH 4/6] update data.csv --- data_live.csv | 92 ++++++++++++++-------------- data_multi_turn.csv | 71 ++++++++++++++++++++++ data_non_live.csv | 102 +++++++++++++++---------------- data_overall.csv | 142 ++++++++++++++++++++++---------------------- leaderboard.html | 2 +- 5 files changed, 240 insertions(+), 169 deletions(-) create mode 100644 data_multi_turn.csv diff --git a/data_live.csv b/data_live.csv index 447e79e95..fb95e7403 100644 --- a/data_live.csv +++ b/data_live.csv @@ -1,53 +1,53 @@ Rank,Model,Live Overall Acc,AST Summary,Python Simple AST,Python Multiple AST,Python Parallel AST,Python Parallel Multiple AST,Irrelevance Detection,Relevance Detection -1,Gemini-1.5-Flash-002 (Prompt),76.28%,78.20%,77.91%,78.30%,93.75%,66.67%,72.91%,85.37% -2,GPT-4-turbo-2024-04-09 (FC),76.23%,77.45%,77.52%,77.63%,81.25%,66.67%,74.51%,73.17% -3,GPT-4o-2024-08-06 (FC),75.43%,74.98%,74.42%,75.12%,81.25%,70.83%,76.69%,63.41% -4,o1-mini-2024-09-12 (Prompt),75.39%,71.39%,73.26%,71.07%,75.00%,62.50%,82.74%,48.78% +1,GPT-4-turbo-2024-04-09 (FC),76.23%,77.45%,77.52%,77.63%,81.25%,66.67%,74.51%,73.17% +2,GPT-4o-2024-08-06 (FC),75.43%,74.98%,74.42%,75.12%,81.25%,70.83%,76.69%,63.41% +3,o1-mini-2024-09-12 (Prompt),75.39%,71.39%,73.26%,71.07%,75.00%,62.50%,82.74%,48.78% +4,Gemini-1.5-Flash-002 (FC),75.12%,71.24%,71.32%,70.97%,81.25%,75.00%,81.71%,60.98% 5,ToolACE-8B (FC),74.99%,73.33%,66.67%,74.93%,81.25%,70.83%,77.26%,80.49% 6,Claude-3.5-Sonnet-20240620 (FC),74.68%,76.85%,80.23%,76.76%,56.25%,58.33%,71.66%,68.29% 7,GPT-4o-mini-2024-07-18 (Prompt),74.63%,75.51%,79.46%,74.35%,93.75%,70.83%,73.26%,75.61% -8,Gemini-1.5-Pro-002 (Prompt),74.41%,77.00%,77.52%,76.76%,87.50%,75.00%,70.86%,65.85% +8,Gemini-1.5-Pro-002 (Prompt),74.28%,78.28%,79.84%,77.72%,87.50%,79.17%,68.11%,75.61% 9,Claude-3-Opus-20240229 (FC tools-2024-04-04),74.10%,74.53%,74.81%,75.60%,50.00%,41.67%,73.94%,63.41% -10,Functionary-Medium-v3.1 (FC),73.48%,81.05%,79.46%,81.87%,68.75%,70.83%,62.06%,70.73% -11,Gemini-1.5-Pro-001 (Prompt),73.12%,69.14%,67.44%,69.24%,93.75%,66.67%,80.00%,56.10% -12,Mistral-Medium-2312 (Prompt),73.10%,71.84%,68.60%,73.00%,81.25%,50.00%,100.00%,60.98% -13,o1-preview-2024-09-12 (Prompt),73.08%,77.53%,80.62%,76.76%,75.00%,79.17%,66.29%,73.17% -14,GoGoAgent,72.46%,72.21%,71.32%,72.42%,87.50%,62.50%,72.11%,87.80% -15,xLAM-8x22b-r (FC),71.97%,79.40%,78.29%,80.14%,75.00%,62.50%,60.00%,85.37% -16,Functionary-Small-v3.1 (FC),70.41%,75.58%,75.19%,75.89%,81.25%,62.50%,61.83%,85.37% -17,Mistral-small-2402 (FC),70.19%,68.16%,63.57%,71.46%,12.50%,12.50%,72.69%,82.93% -18,GPT-4o-mini-2024-07-18 (FC),70.19%,74.23%,72.87%,74.45%,87.50%,70.83%,63.54%,80.49% -19,Hammer2.0-7b (FC),69.79%,76.63%,74.42%,77.15%,81.25%,75.00%,58.17%,95.12% -20,Command-R-Plus (Prompt) (Original),69.75%,69.59%,66.67%,70.30%,68.75%,70.83%,69.83%,73.17% -21,Gemma-2-27b-it (Prompt),69.48%,77.30%,79.46%,77.24%,68.75%,62.50%,56.69%,87.80% -22,Gemini-1.5-Flash-001 (Prompt),69.21%,75.21%,74.42%,75.12%,93.75%,75.00%,59.43%,82.93% -23,Gemma-2-9b-it (Prompt),69.21%,73.11%,73.64%,73.58%,56.25%,58.33%,62.40%,87.80% -24,xLAM-8x7b-r (FC),69.12%,74.53%,68.22%,76.76%,62.50%,54.17%,60.00%,87.80% -25,GPT-4-turbo-2024-04-09 (Prompt),69.04%,84.64%,85.66%,84.57%,87.50%,75.00%,44.57%,82.93% -26,Open-Mixtral-8x22b (Prompt),68.46%,63.90%,72.87%,61.33%,81.25%,66.67%,75.54%,65.85% -27,mistral-large-2407 (FC),68.37%,79.55%,81.78%,79.27%,68.75%,75.00%,50.97%,75.61% -28,xLAM-7b-r (FC),67.88%,72.28%,71.32%,73.48%,31.25%,58.33%,59.77%,97.56% -29,GPT-3.5-Turbo-0125 (Prompt),67.48%,64.27%,63.57%,64.61%,68.75%,54.17%,71.77%,80.49% -30,Gorilla-OpenFunctions-v2 (FC),67.44%,61.42%,73.64%,58.73%,68.75%,41.67%,76.34%,73.17% -31,Gemini-1.5-Flash-002 (FC),67.35%,57.98%,58.14%,57.96%,68.75%,50.00%,81.94%,60.98% -32,Open-Mixtral-8x22b (FC),66.86%,71.16%,73.26%,72.32%,6.25%,41.67%,59.54%,82.93% -33,Meta-Llama-3-70B-Instruct (Prompt),66.15%,79.10%,78.68%,79.65%,68.75%,66.67%,45.14%,92.68% -34,Qwen2.5-7B-Instruct (Prompt),65.97%,72.13%,72.48%,72.32%,62.50%,66.67%,55.31%,92.68% -35,Gemini-1.5-Pro-001 (FC),65.53%,58.05%,57.75%,58.24%,75.00%,41.67%,77.03%,63.41% -36,Open-Mixtral-8x7b (Prompt),64.95%,63.30%,57.36%,65.00%,68.75%,50.00%,67.31%,68.29% -37,Gemini-1.5-Flash-001 (FC),64.90%,59.48%,58.14%,60.46%,43.75%,41.67%,73.49%,58.54% -38,Gemini-1.5-Pro-002 (FC),64.59%,61.05%,58.91%,61.33%,81.25%,58.33%,69.71%,70.73% -39,Hammer2.0-1.5b (FC),63.22%,68.76%,70.54%,68.56%,56.25%,66.67%,53.37%,92.68% -40,Open-Mistral-Nemo-2407 (FC),62.37%,68.46%,71.71%,67.79%,62.50%,66.67%,53.14%,60.98% -41,DBRX-Instruct (Prompt),62.33%,72.06%,74.81%,71.65%,75.00%,58.33%,46.29%,87.80% -42,GPT-4o-2024-08-06 (Prompt),62.19%,42.55%,42.64%,42.82%,25.00%,41.67%,93.37%,36.59% -43,Hermes-2-Pro-Llama-3-8B (FC),61.79%,64.57%,67.44%,64.42%,56.25%,45.83%,57.83%,56.10% -44,Qwen2.5-1.5B-Instruct (Prompt),61.71%,60.37%,64.73%,59.88%,50.00%,41.67%,63.09%,75.61% -45,GPT-3.5-Turbo-0125 (FC),61.22%,76.25%,74.42%,77.82%,43.75%,50.00%,36.57%,97.56% -46,Llama-3.1-70B-Instruct (Prompt),61.13%,72.58%,77.13%,71.46%,87.50%,62.50%,42.17%,92.68% -47,Hermes-2-Pro-Llama-3-70B (FC),60.51%,55.28%,63.18%,53.04%,56.25%,66.67%,68.46%,60.98% -48,MiniCPM3-4B (FC),59.88%,50.71%,56.98%,49.47%,56.25%,33.33%,73.94%,58.54% -49,Gemini-1.0-Pro-002 (FC),58.91%,55.81%,58.91%,56.12%,37.50%,20.83%,63.20%,68.29% +10,Gemini-1.5-Pro-001 (Prompt),73.83%,72.96%,74.03%,72.32%,93.75%,75.00%,75.66%,63.41% +11,Functionary-Medium-v3.1 (FC),73.48%,81.05%,79.46%,81.87%,68.75%,70.83%,62.06%,70.73% +12,Gemini-1.5-Flash-002 (Prompt),73.21%,75.13%,77.52%,74.73%,87.50%,58.33%,70.06%,78.05% +13,Mistral-Medium-2312 (Prompt),73.10%,71.84%,68.60%,73.00%,81.25%,50.00%,100.00%,60.98% +14,o1-preview-2024-09-12 (Prompt),73.08%,77.53%,80.62%,76.76%,75.00%,79.17%,66.29%,73.17% +15,Gemini-1.5-Flash-001 (FC),72.81%,73.03%,72.48%,73.67%,62.50%,58.33%,72.91%,63.41% +16,Gemini-1.5-Pro-001 (FC),72.81%,71.16%,73.64%,70.59%,81.25%,62.50%,75.77%,63.41% +17,GoGoAgent,72.46%,72.21%,71.32%,72.42%,87.50%,62.50%,72.11%,87.80% +18,Gemini-1.5-Pro-002 (FC),72.41%,74.76%,74.81%,74.64%,87.50%,70.83%,68.80%,73.17% +19,xLAM-8x22b-r (FC),71.97%,79.40%,78.29%,80.14%,75.00%,62.50%,60.00%,85.37% +20,Functionary-Small-v3.1 (FC),70.41%,75.58%,75.19%,75.89%,81.25%,62.50%,61.83%,85.37% +21,Mistral-small-2402 (FC),70.19%,68.16%,63.57%,71.46%,12.50%,12.50%,72.69%,82.93% +22,GPT-4o-mini-2024-07-18 (FC),70.19%,74.23%,72.87%,74.45%,87.50%,70.83%,63.54%,80.49% +23,Hammer2.0-7b (FC),69.79%,76.63%,74.42%,77.15%,81.25%,75.00%,58.17%,95.12% +24,Command-R-Plus (Prompt) (Original),69.75%,69.59%,66.67%,70.30%,68.75%,70.83%,69.83%,73.17% +25,Gemma-2-27b-it (Prompt),69.48%,77.30%,79.46%,77.24%,68.75%,62.50%,56.69%,87.80% +26,Gemma-2-9b-it (Prompt),69.21%,73.11%,73.64%,73.58%,56.25%,58.33%,62.40%,87.80% +27,xLAM-8x7b-r (FC),69.12%,74.53%,68.22%,76.76%,62.50%,54.17%,60.00%,87.80% +28,GPT-4-turbo-2024-04-09 (Prompt),69.04%,84.64%,85.66%,84.57%,87.50%,75.00%,44.57%,82.93% +29,Open-Mixtral-8x22b (Prompt),68.46%,63.90%,72.87%,61.33%,81.25%,66.67%,75.54%,65.85% +30,mistral-large-2407 (FC),68.37%,79.55%,81.78%,79.27%,68.75%,75.00%,50.97%,75.61% +31,Gemini-1.5-Flash-001 (Prompt),68.24%,76.18%,74.81%,76.18%,93.75%,79.17%,55.20%,87.80% +32,xLAM-7b-r (FC),67.88%,72.28%,71.32%,73.48%,31.25%,58.33%,59.77%,97.56% +33,GPT-3.5-Turbo-0125 (Prompt),67.48%,64.27%,63.57%,64.61%,68.75%,54.17%,71.77%,80.49% +34,Gorilla-OpenFunctions-v2 (FC),67.44%,61.42%,73.64%,58.73%,68.75%,41.67%,76.34%,73.17% +35,Open-Mixtral-8x22b (FC),66.86%,71.16%,73.26%,72.32%,6.25%,41.67%,59.54%,82.93% +36,Meta-Llama-3-70B-Instruct (Prompt),66.15%,79.10%,78.68%,79.65%,68.75%,66.67%,45.14%,92.68% +37,Gemini-1.0-Pro-002 (FC),66.10%,67.04%,75.19%,65.96%,50.00%,37.50%,64.57%,68.29% +38,Qwen2.5-7B-Instruct (Prompt),65.97%,72.13%,72.48%,72.32%,62.50%,66.67%,55.31%,92.68% +39,Open-Mixtral-8x7b (Prompt),64.95%,63.30%,57.36%,65.00%,68.75%,50.00%,67.31%,68.29% +40,Hammer2.0-1.5b (FC),63.22%,68.76%,70.54%,68.56%,56.25%,66.67%,53.37%,92.68% +41,Open-Mistral-Nemo-2407 (FC),62.37%,68.46%,71.71%,67.79%,62.50%,66.67%,53.14%,60.98% +42,DBRX-Instruct (Prompt),62.33%,72.06%,74.81%,71.65%,75.00%,58.33%,46.29%,87.80% +43,GPT-4o-2024-08-06 (Prompt),62.19%,42.55%,42.64%,42.82%,25.00%,41.67%,93.37%,36.59% +44,Hermes-2-Pro-Llama-3-8B (FC),61.79%,64.57%,67.44%,64.42%,56.25%,45.83%,57.83%,56.10% +45,Qwen2.5-1.5B-Instruct (Prompt),61.71%,60.37%,64.73%,59.88%,50.00%,41.67%,63.09%,75.61% +46,GPT-3.5-Turbo-0125 (FC),61.22%,76.25%,74.42%,77.82%,43.75%,50.00%,36.57%,97.56% +47,Llama-3.1-70B-Instruct (Prompt),61.13%,72.58%,77.13%,71.46%,87.50%,62.50%,42.17%,92.68% +48,Hermes-2-Pro-Llama-3-70B (FC),60.51%,55.28%,63.18%,53.04%,56.25%,66.67%,68.46%,60.98% +49,MiniCPM3-4B (FC),59.88%,50.71%,56.98%,49.47%,56.25%,33.33%,73.94%,58.54% 50,Llama-3.1-8B-Instruct (Prompt),57.93%,71.31%,71.32%,72.23%,50.00%,45.83%,36.57%,78.05% 51,Claude-3-Haiku-20240307 (FC tools-2024-04-04),57.66%,74.31%,74.03%,77.15%,0.00%,4.17%,30.40%,97.56% 52,Granite-20b-FunctionCalling (FC),57.49%,57.08%,65.12%,55.35%,43.75%,54.17%,56.34%,95.12% @@ -62,7 +62,7 @@ Rank,Model,Live Overall Acc,AST Summary,Python Simple AST,Python Multiple AST,Py 61,Llama-3.2-3B-Instruct (Prompt),50.91%,44.49%,47.67%,44.74%,0.00%,29.17%,60.11%,63.41% 62,Meta-Llama-3-8B-Instruct (Prompt),50.51%,59.78%,60.85%,60.75%,37.50%,20.83%,35.20%,75.61% 63,Open-Mistral-Nemo-2407 (Prompt),50.33%,75.06%,78.29%,74.54%,75.00%,62.50%,10.74%,90.24% -64,Gemini-1.0-Pro-002 (Prompt),45.67%,38.13%,41.47%,36.93%,68.75%,33.33%,55.54%,80.49% +64,Gemini-1.0-Pro-002 (Prompt),48.38%,48.61%,50.00%,48.41%,56.25%,37.50%,46.29%,85.37% 65,Llama-3.1-70B-Instruct (FC),44.47%,51.01%,48.45%,52.56%,31.25%,25.00%,31.89%,100.00% 66,Gemma-2-2b-it (Prompt),41.63%,11.46%,11.24%,11.96%,0.00%,0.00%,89.03%,12.20% 67,Qwen2-1.5B-Instruct (Prompt),39.00%,41.87%,50.39%,40.50%,25.00%,20.83%,32.91%,75.61% diff --git a/data_multi_turn.csv b/data_multi_turn.csv new file mode 100644 index 000000000..0721afea6 --- /dev/null +++ b/data_multi_turn.csv @@ -0,0 +1,71 @@ +Rank,Model,Multi Turn Overall Acc,Base,Miss Func,Miss Param,Long Context +1,GPT-4o-2024-08-06 (FC),45.25%,54.50%,44.00%,34.50%,48.00% +2,Claude-3.5-Sonnet-20240620 (FC),40.00%,46.00%,39.00%,35.00%,40.00% +3,GPT-4-turbo-2024-04-09 (FC),39.25%,54.50%,32.50%,29.50%,40.50% +4,o1-preview-2024-09-12 (Prompt),36.62%,43.00%,38.50%,32.50%,32.50% +5,o1-mini-2024-09-12 (Prompt),33.50%,40.50%,32.50%,26.50%,34.50% +6,GPT-4o-mini-2024-07-18 (FC),28.25%,40.50%,15.50%,24.00%,33.00% +7,Claude-3-Opus-20240229 (FC tools-2024-04-04),28.12%,30.00%,29.50%,28.00%,25.00% +8,GPT-4-turbo-2024-04-09 (Prompt),26.75%,36.50%,24.00%,17.00%,29.50% +9,Claude-3-Haiku-20240307 (FC tools-2024-04-04),20.62%,27.50%,15.00%,17.50%,22.50% +10,Gemini-1.5-Pro-002 (FC),19.13%,26.00%,13.50%,19.50%,17.50% +11,Gemini-1.5-Flash-001 (Prompt),17.62%,25.50%,16.00%,12.00%,17.00% +12,GPT-4o-2024-08-06 (Prompt),17.62%,21.50%,14.00%,15.00%,20.00% +13,xLAM-8x22b-r (FC),17.38%,25.50%,20.50%,15.00%,8.50% +14,Functionary-Medium-v3.1 (FC),17.25%,28.50%,12.50%,23.50%,4.50% +15,GPT-3.5-Turbo-0125 (FC),16.88%,28.00%,13.00%,17.00%,9.50% +16,mistral-large-2407 (FC),16.75%,23.00%,12.50%,15.50%,16.00% +17,Gemini-1.5-Pro-002 (Prompt),16.25%,20.00%,15.00%,14.50%,15.50% +18,GPT-4o-mini-2024-07-18 (Prompt),14.50%,20.00%,11.50%,10.00%,16.50% +19,Llama-3.1-70B-Instruct (Prompt),14.25%,18.50%,15.50%,10.00%,13.00% +20,xLAM-8x7b-r (FC),13.88%,18.50%,14.00%,12.50%,10.50% +21,Gemini-1.5-Pro-001 (Prompt),13.12%,14.50%,13.50%,13.50%,11.00% +22,Gemini-1.5-Pro-001 (FC),12.75%,16.00%,11.00%,12.50%,11.50% +23,Gemini-1.5-Flash-002 (Prompt),12.50%,15.00%,14.50%,9.00%,11.50% +24,Gemini-1.5-Flash-001 (FC),10.88%,13.00%,10.00%,13.00%,7.50% +25,Llama-3.1-8B-Instruct (Prompt),10.50%,14.00%,10.50%,8.00%,9.50% +26,Gemini-1.5-Flash-002 (FC),9.75%,15.00%,5.00%,8.00%,11.00% +27,mistral-large-2407 (Prompt),9.62%,14.50%,11.00%,6.00%,7.00% +28,Functionary-Small-v3.1 (FC),8.38%,15.50%,0.50%,12.50%,5.00% +29,Open-Mistral-Nemo-2407 (FC),8.00%,12.00%,5.00%,10.50%,4.50% +30,ToolACE-8B (FC),7.88%,8.50%,10.50%,5.50%,7.00% +31,xLAM-7b-r (FC),6.88%,11.50%,7.00%,6.00%,3.00% +32,Qwen2.5-7B-Instruct (Prompt),6.38%,8.00%,7.50%,6.00%,4.00% +33,GPT-3.5-Turbo-0125 (Prompt),5.75%,7.50%,7.00%,4.00%,4.50% +34,Hammer2.0-7b (FC),5.62%,9.50%,2.00%,7.50%,3.50% +35,Meta-Llama-3-70B-Instruct (Prompt),5.50%,9.50%,4.50%,5.50%,2.50% +36,Llama-3.1-70B-Instruct (FC),2.75%,4.50%,2.00%,2.00%,2.50% +37,Granite-20b-FunctionCalling (FC),2.75%,5.00%,1.50%,3.00%,1.50% +38,Qwen2-7B-Instruct (Prompt),2.63%,3.50%,3.50%,1.50%,2.00% +39,Gemini-1.0-Pro-002 (FC),2.50%,4.00%,2.50%,2.50%,1.00% +40,Mistral-small-2402 (FC),2.12%,3.50%,0.00%,2.50%,2.50% +41,Gemma-2-27b-it (Prompt),2.12%,3.50%,2.00%,1.50%,1.50% +42,Llama-3.2-3B-Instruct (Prompt),2.12%,1.50%,2.00%,2.00%,3.00% +43,Qwen2.5-1.5B-Instruct (Prompt),1.50%,2.00%,2.00%,1.00%,1.00% +44,Hammer2.0-1.5b (FC),1.38%,2.50%,0.50%,1.00%,1.50% +45,Command-R-Plus (FC) (Original),1.38%,1.50%,0.00%,1.50%,2.50% +46,Gemini-1.0-Pro-002 (Prompt),1.25%,1.00%,3.50%,0.00%,0.50% +47,MiniCPM3-4B (FC),0.88%,1.50%,2.00%,0.00%,0.00% +48,Nexusflow-Raven-v2 (FC),0.88%,1.50%,0.50%,0.50%,1.00% +49,Gemma-2-9b-it (Prompt),0.75%,1.00%,2.00%,0.00%,0.00% +50,Open-Mixtral-8x22b (FC),0.62%,1.00%,0.00%,1.00%,0.50% +51,Open-Mixtral-8x7b (Prompt),0.62%,1.50%,0.00%,0.00%,1.00% +52,Mistral-Medium-2312 (Prompt),0.50%,1.50%,0.00%,0.00%,0.50% +53,Open-Mixtral-8x22b (Prompt),0.50%,0.50%,0.50%,0.00%,1.00% +54,Hermes-2-Pro-Llama-3-8B (FC),0.38%,1.00%,0.00%,0.50%,0.00% +55,Hammer2.0-0.5b (FC),0.38%,0.50%,0.00%,0.50%,0.50% +56,Command-R-Plus (Prompt) (Original),0.38%,1.00%,0.00%,0.00%,0.50% +57,Hermes-2-Pro-Mistral-7B (FC),0.25%,0.50%,0.00%,0.00%,0.50% +58,Mistral-Small-2402 (Prompt),0.25%,0.50%,0.00%,0.00%,0.50% +59,Hermes-2-Pro-Llama-3-70B (FC),0.25%,0.50%,0.00%,0.00%,0.50% +60,GoGoAgent,0.25%,0.50%,0.50%,0.00%,0.00% +61,Open-Mistral-Nemo-2407 (Prompt),0.12%,0.00%,0.50%,0.00%,0.00% +62,xLAM-1b-fc-r (FC),0.12%,0.00%,0.00%,0.00%,0.50% +63,Qwen2-1.5B-Instruct (Prompt),0.12%,0.00%,0.50%,0.00%,0.00% +64,DBRX-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% +65,Gemma-2-2b-it (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% +66,Llama-3.2-1B-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% +67,Llama-3.1-8B-Instruct (FC),0.00%,0.00%,0.00%,0.00%,0.00% +68,xLAM-7b-fc-r (FC),0.00%,0.00%,0.00%,0.00%,0.00% +69,Gorilla-OpenFunctions-v2 (FC),0.00%,0.00%,0.00%,0.00%,0.00% +70,Meta-Llama-3-8B-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% \ No newline at end of file diff --git a/data_non_live.csv b/data_non_live.csv index 2d37967c1..5e5cd0eab 100644 --- a/data_non_live.csv +++ b/data_non_live.csv @@ -1,21 +1,21 @@ Rank,Model,Non_Live Overall Acc,AST Summary,Exec Summary,Simple AST,Python Simple AST,Java Simple AST,JavaScript Simple AST,Multiple AST,Parallel AST,Parallel Multiple AST,Simple Exec,Python Simple Exec,REST Simple Exec,Multiple Exec,Parallel Exec,Parallel Multiple Exec,Irrelevance Detection -1,Gemini-1.5-Pro-002 (Prompt),89.63%,88.96%,91.77%,79.83%,94.50%,65.00%,80.00%,94.00%,93.00%,89.00%,98.57%,100.00%,97.14%,96.00%,90.00%,82.50%,83.75% -2,Functionary-Medium-v3.1 (FC),89.08%,89.52%,89.77%,76.08%,96.25%,64.00%,68.00%,96.50%,95.00%,90.50%,97.57%,98.00%,97.14%,94.00%,90.00%,77.50%,84.58% -3,ToolACE-8B (FC),88.94%,87.06%,89.52%,76.25%,89.75%,65.00%,74.00%,93.00%,90.00%,89.00%,98.57%,100.00%,97.14%,94.00%,88.00%,77.50%,94.17% +1,Functionary-Medium-v3.1 (FC),89.08%,89.52%,89.77%,76.08%,96.25%,64.00%,68.00%,96.50%,95.00%,90.50%,97.57%,98.00%,97.14%,94.00%,90.00%,77.50%,84.58% +2,ToolACE-8B (FC),88.94%,87.06%,89.52%,76.25%,89.75%,65.00%,74.00%,93.00%,90.00%,89.00%,98.57%,100.00%,97.14%,94.00%,88.00%,77.50%,94.17% +3,Gemini-1.5-Pro-002 (Prompt),88.86%,88.00%,91.41%,77.50%,95.50%,63.00%,74.00%,93.00%,92.50%,89.00%,97.14%,100.00%,94.29%,94.00%,92.00%,82.50%,82.08% 4,GPT-4-turbo-2024-04-09 (Prompt),88.80%,91.46%,90.00%,82.33%,97.00%,68.00%,82.00%,95.00%,95.00%,93.50%,99.50%,99.00%,100.00%,98.00%,80.00%,82.50%,73.33% 5,GPT-4o-mini-2024-07-18 (Prompt),88.69%,86.23%,91.12%,79.42%,93.25%,65.00%,80.00%,93.00%,86.50%,86.00%,100.00%,100.00%,100.00%,96.00%,86.00%,82.50%,88.75% 6,Hammer2.0-7b (FC),88.54%,90.27%,89.25%,80.58%,97.75%,66.00%,78.00%,95.00%,93.50%,92.00%,90.00%,100.00%,80.00%,94.00%,88.00%,85.00%,78.75% -7,Gemini-1.5-Flash-002 (Prompt),87.60%,86.58%,89.48%,75.33%,95.00%,63.00%,68.00%,91.50%,91.50%,88.00%,95.93%,99.00%,92.86%,96.00%,86.00%,80.00%,84.17% -8,GoGoAgent,87.54%,86.00%,88.05%,75.50%,92.50%,64.00%,70.00%,92.50%,92.00%,84.00%,94.71%,98.00%,91.43%,94.00%,86.00%,77.50%,91.67% -9,xLAM-8x22b-r (FC),87.51%,88.15%,90.11%,81.08%,95.25%,66.00%,82.00%,93.00%,91.50%,87.00%,96.43%,100.00%,92.86%,96.00%,88.00%,80.00%,74.58% -10,Llama-3.1-70B-Instruct (Prompt),87.50%,88.90%,89.34%,76.58%,95.75%,60.00%,74.00%,95.50%,93.50%,90.00%,91.36%,97.00%,85.71%,96.00%,90.00%,80.00%,74.58% -11,Gemma-2-27b-it (Prompt),87.39%,88.52%,87.89%,81.08%,95.25%,64.00%,84.00%,92.50%,91.00%,89.50%,83.57%,100.00%,67.14%,96.00%,92.00%,80.00%,80.83% -12,o1-preview-2024-09-12 (Prompt),87.12%,86.42%,88.88%,78.17%,93.50%,67.00%,74.00%,93.00%,89.50%,85.00%,99.50%,99.00%,100.00%,92.00%,84.00%,80.00%,82.92% -13,Gemini-1.5-Pro-001 (Prompt),86.17%,83.88%,87.52%,73.00%,91.00%,60.00%,68.00%,91.50%,88.00%,83.00%,91.57%,96.00%,87.14%,94.00%,82.00%,82.50%,90.00% -14,GPT-4o-2024-08-06 (FC),86.15%,85.90%,85.64%,74.58%,91.75%,64.00%,68.00%,92.50%,92.00%,84.50%,87.07%,97.00%,77.14%,92.00%,86.00%,77.50%,89.17% -15,Open-Mixtral-8x22b (Prompt),86.08%,86.92%,88.23%,77.67%,94.00%,59.00%,80.00%,92.50%,90.00%,87.50%,91.43%,100.00%,82.86%,96.00%,88.00%,77.50%,74.17% -16,Gemini-1.5-Flash-001 (Prompt),85.74%,86.17%,87.68%,73.17%,89.50%,64.00%,66.00%,90.50%,92.00%,89.00%,84.21%,97.00%,71.43%,94.00%,90.00%,82.50%,76.25% -17,Qwen2.5-7B-Instruct (Prompt),85.58%,85.79%,88.13%,75.67%,96.00%,59.00%,72.00%,96.00%,88.50%,83.00%,94.50%,99.00%,90.00%,92.00%,86.00%,80.00%,74.58% +7,GoGoAgent,87.54%,86.00%,88.05%,75.50%,92.50%,64.00%,70.00%,92.50%,92.00%,84.00%,94.71%,98.00%,91.43%,94.00%,86.00%,77.50%,91.67% +8,xLAM-8x22b-r (FC),87.51%,88.15%,90.11%,81.08%,95.25%,66.00%,82.00%,93.00%,91.50%,87.00%,96.43%,100.00%,92.86%,96.00%,88.00%,80.00%,74.58% +9,Llama-3.1-70B-Instruct (Prompt),87.50%,88.90%,89.34%,76.58%,95.75%,60.00%,74.00%,95.50%,93.50%,90.00%,91.36%,97.00%,85.71%,96.00%,90.00%,80.00%,74.58% +10,Gemma-2-27b-it (Prompt),87.39%,88.52%,87.89%,81.08%,95.25%,64.00%,84.00%,92.50%,91.00%,89.50%,83.57%,100.00%,67.14%,96.00%,92.00%,80.00%,80.83% +11,o1-preview-2024-09-12 (Prompt),87.12%,86.42%,88.88%,78.17%,93.50%,67.00%,74.00%,93.00%,89.50%,85.00%,99.50%,99.00%,100.00%,92.00%,84.00%,80.00%,82.92% +12,GPT-4o-2024-08-06 (FC),86.15%,85.90%,85.64%,74.58%,91.75%,64.00%,68.00%,92.50%,92.00%,84.50%,87.07%,97.00%,77.14%,92.00%,86.00%,77.50%,89.17% +13,Open-Mixtral-8x22b (Prompt),86.08%,86.92%,88.23%,77.67%,94.00%,59.00%,80.00%,92.50%,90.00%,87.50%,91.43%,100.00%,82.86%,96.00%,88.00%,77.50%,74.17% +14,Gemini-1.5-Pro-001 (FC),86.01%,83.98%,88.39%,69.42%,92.25%,54.00%,62.00%,93.00%,91.00%,82.50%,91.57%,96.00%,87.14%,92.00%,90.00%,80.00%,84.58% +15,Gemini-1.5-Pro-002 (FC),85.85%,87.96%,85.82%,74.83%,94.50%,58.00%,72.00%,95.00%,91.50%,90.50%,78.79%,99.00%,58.57%,94.00%,88.00%,82.50%,77.50% +16,Qwen2.5-7B-Instruct (Prompt),85.58%,85.79%,88.13%,75.67%,96.00%,59.00%,72.00%,96.00%,88.50%,83.00%,94.50%,99.00%,90.00%,92.00%,86.00%,80.00%,74.58% +17,Gemini-1.5-Pro-001 (Prompt),85.48%,83.94%,86.30%,72.75%,92.25%,60.00%,66.00%,90.00%,91.50%,81.50%,93.71%,96.00%,91.43%,88.00%,86.00%,77.50%,88.33% 18,Meta-Llama-3-70B-Instruct (Prompt),85.10%,87.17%,89.21%,75.17%,95.50%,60.00%,70.00%,95.50%,90.50%,87.50%,95.86%,96.00%,95.71%,96.00%,80.00%,85.00%,60.42% 19,Functionary-Small-v3.1 (FC),84.99%,86.42%,85.95%,74.67%,96.00%,62.00%,66.00%,94.50%,89.50%,87.00%,88.79%,99.00%,78.57%,92.00%,88.00%,75.00%,75.42% 20,Gorilla-OpenFunctions-v2 (FC),84.81%,86.29%,86.09%,77.67%,95.00%,62.00%,76.00%,95.00%,89.00%,83.50%,95.86%,96.00%,95.71%,96.00%,80.00%,72.50%,73.75% @@ -25,43 +25,43 @@ Rank,Model,Non_Live Overall Acc,AST Summary,Exec Summary,Simple AST,Python Simpl 24,Hammer2.0-1.5b (FC),84.44%,84.06%,88.95%,75.25%,94.75%,65.00%,66.00%,90.50%,88.00%,82.50%,93.29%,98.00%,88.57%,92.00%,88.00%,82.50%,67.92% 25,o1-mini-2024-09-12 (Prompt),83.84%,81.31%,84.00%,73.75%,88.25%,61.00%,72.00%,90.00%,81.00%,80.50%,88.50%,97.00%,80.00%,92.00%,78.00%,77.50%,93.33% 26,GPT-4o-mini-2024-07-18 (FC),83.72%,84.25%,84.12%,73.50%,90.50%,64.00%,66.00%,90.50%,90.00%,83.00%,83.50%,97.00%,70.00%,92.00%,86.00%,75.00%,80.00% -27,Command-R-Plus (Prompt) (Original),82.19%,80.90%,85.07%,71.08%,89.25%,60.00%,64.00%,91.50%,82.00%,79.00%,93.29%,98.00%,88.57%,90.00%,82.00%,75.00%,75.83% -28,mistral-large-2407 (FC),81.41%,86.62%,84.57%,73.00%,96.00%,57.00%,66.00%,92.00%,91.50%,90.00%,73.79%,99.00%,48.57%,94.00%,88.00%,82.50%,47.92% -29,Llama-3.1-8B-Instruct (Prompt),81.15%,83.62%,87.29%,73.00%,94.00%,59.00%,66.00%,94.50%,83.50%,83.50%,85.64%,97.00%,74.29%,96.00%,90.00%,77.50%,46.67% -30,xLAM-7b-r (FC),80.86%,81.40%,83.46%,73.08%,91.25%,56.00%,72.00%,93.50%,79.50%,79.50%,76.86%,98.00%,55.71%,92.00%,90.00%,75.00%,68.33% -31,Open-Mistral-Nemo-2407 (Prompt),79.66%,85.60%,91.23%,77.42%,92.25%,60.00%,80.00%,93.50%,87.00%,84.50%,95.93%,99.00%,92.86%,96.00%,88.00%,85.00%,9.58% -32,Mistral-Medium-2312 (Prompt),79.27%,74.02%,81.73%,70.58%,91.75%,56.00%,64.00%,91.50%,65.50%,68.50%,95.43%,98.00%,92.86%,92.00%,72.00%,67.50%,90.42% -33,Hermes-2-Pro-Llama-3-70B (FC),78.81%,78.85%,80.45%,59.92%,83.75%,54.00%,42.00%,80.00%,88.00%,87.50%,76.29%,94.00%,58.57%,82.00%,86.00%,77.50%,72.08% -34,GPT-3.5-Turbo-0125 (FC),78.52%,84.12%,84.11%,75.50%,94.50%,64.00%,68.00%,93.00%,88.00%,80.00%,95.43%,98.00%,92.86%,90.00%,86.00%,65.00%,33.75% -35,Open-Mistral-Nemo-2407 (FC),78.29%,81.21%,77.04%,63.33%,92.00%,36.00%,62.00%,92.00%,86.50%,83.00%,55.64%,97.00%,14.29%,90.00%,90.00%,72.50%,71.67% -36,Qwen2.5-1.5B-Instruct (Prompt),78.14%,75.19%,82.82%,70.25%,87.75%,55.00%,68.00%,85.50%,73.50%,71.50%,72.79%,97.00%,48.57%,94.00%,82.00%,82.50%,71.25% -37,Qwen2-7B-Instruct (Prompt),75.50%,74.85%,81.70%,67.42%,84.25%,60.00%,58.00%,87.50%,71.00%,73.50%,86.79%,95.00%,78.57%,88.00%,82.00%,70.00%,53.33% -38,Command-R-Plus (FC) (Original),75.47%,76.83%,78.61%,66.33%,87.00%,60.00%,52.00%,90.00%,82.00%,69.00%,88.93%,95.00%,82.86%,88.00%,80.00%,57.50%,57.50% -39,Hermes-2-Pro-Llama-3-8B (FC),74.14%,76.54%,75.48%,64.17%,90.50%,56.00%,46.00%,89.50%,79.50%,73.00%,69.93%,97.00%,42.86%,94.00%,78.00%,60.00%,59.17% -40,Llama-3.2-3B-Instruct (Prompt),74.03%,77.77%,69.41%,64.08%,81.25%,49.00%,62.00%,90.00%,80.50%,76.50%,78.14%,82.00%,74.29%,92.00%,50.00%,57.50%,77.50% -41,xLAM-8x7b-r (FC),73.93%,68.85%,78.43%,68.42%,79.25%,60.00%,66.00%,88.00%,63.50%,55.50%,87.71%,94.00%,81.43%,88.00%,68.00%,70.00%,76.25% -42,Hermes-2-Pro-Mistral-7B (FC),69.78%,72.83%,77.30%,61.33%,86.00%,56.00%,42.00%,87.50%,78.50%,64.00%,61.71%,92.00%,31.43%,94.00%,86.00%,67.50%,27.50% -43,Claude-3.5-Sonnet-20240620 (FC),69.19%,70.04%,66.27%,75.17%,93.50%,64.00%,68.00%,93.50%,64.50%,47.00%,97.57%,98.00%,97.14%,90.00%,40.00%,37.50%,77.50% -44,DBRX-Instruct (Prompt),68.89%,67.04%,75.04%,72.17%,92.50%,54.00%,70.00%,91.50%,56.50%,48.00%,90.14%,96.00%,84.29%,88.00%,62.00%,60.00%,51.67% -45,Hammer2.0-0.5b (FC),68.44%,66.79%,70.43%,62.17%,82.50%,52.00%,52.00%,80.00%,67.50%,57.50%,53.21%,95.00%,11.43%,86.00%,80.00%,62.50%,67.08% -46,xLAM-7b-fc-r (FC),67.87%,74.56%,65.75%,74.25%,93.75%,63.00%,66.00%,92.00%,78.00%,54.00%,84.50%,99.00%,70.00%,90.00%,66.00%,22.50%,49.58% -47,GPT-3.5-Turbo-0125 (Prompt),67.78%,65.04%,67.68%,62.67%,78.00%,48.00%,62.00%,83.00%,65.50%,49.00%,46.21%,91.00%,1.43%,90.00%,72.00%,62.50%,79.17% -48,Open-Mixtral-8x7b (Prompt),64.49%,57.94%,65.91%,68.75%,89.25%,53.00%,64.00%,86.00%,40.00%,37.00%,71.14%,88.00%,54.29%,88.00%,52.00%,52.50%,85.00% -49,GPT-4o-2024-08-06 (Prompt),63.57%,49.35%,69.93%,32.42%,66.25%,11.00%,20.00%,48.00%,74.00%,43.00%,49.71%,88.00%,11.43%,82.00%,78.00%,70.00%,95.00% -50,Claude-3-Opus-20240229 (FC tools-2024-04-04),62.73%,58.67%,62.05%,68.67%,89.00%,61.00%,56.00%,89.00%,41.00%,36.00%,88.71%,96.00%,81.43%,88.00%,44.00%,27.50%,81.67% -51,Meta-Llama-3-8B-Instruct (Prompt),58.94%,61.02%,66.70%,63.08%,88.25%,49.00%,52.00%,85.50%,51.50%,44.00%,83.29%,88.00%,78.57%,82.00%,44.00%,57.50%,19.58% -52,Gemini-1.0-Pro-002 (Prompt),58.91%,56.29%,62.39%,42.17%,43.50%,39.00%,44.00%,51.00%,68.50%,63.50%,48.57%,70.00%,27.14%,76.00%,70.00%,55.00%,55.42% -53,Mistral-small-2402 (FC),58.70%,57.27%,53.77%,67.08%,91.25%,58.00%,52.00%,93.50%,20.00%,48.50%,87.07%,97.00%,77.14%,92.00%,16.00%,20.00%,84.17% -54,Open-Mixtral-8x22b (FC),58.66%,61.08%,63.82%,71.33%,93.00%,65.00%,56.00%,94.50%,10.50%,68.00%,84.29%,100.00%,68.57%,94.00%,22.00%,55.00%,28.33% -55,MiniCPM3-4B (FC),57.87%,63.19%,48.70%,67.75%,83.25%,54.00%,66.00%,74.00%,60.50%,50.50%,44.79%,51.00%,38.57%,50.00%,40.00%,60.00%,73.33% -56,Gemini-1.5-Pro-002 (FC),56.71%,38.27%,69.54%,54.08%,41.25%,55.00%,66.00%,39.50%,29.50%,30.00%,69.64%,85.00%,54.29%,80.00%,76.00%,52.50%,79.17% -57,Nexusflow-Raven-v2 (FC),55.21%,46.15%,57.86%,57.58%,37.75%,63.00%,72.00%,53.00%,34.50%,39.50%,47.43%,82.00%,12.86%,86.00%,38.00%,60.00%,80.83% -58,Gemini-1.5-Pro-001 (FC),54.90%,31.77%,70.39%,35.58%,40.75%,24.00%,42.00%,39.50%,26.50%,25.50%,75.07%,83.00%,67.14%,80.00%,74.00%,52.50%,85.42% -59,mistral-large-2407 (Prompt),54.60%,62.27%,56.93%,46.58%,64.75%,39.00%,36.00%,77.00%,70.00%,55.50%,40.21%,59.00%,21.43%,80.00%,70.00%,37.50%,14.58% -60,Qwen2-1.5B-Instruct (Prompt),53.99%,59.73%,58.52%,55.92%,79.75%,42.00%,46.00%,80.00%,55.50%,47.50%,51.07%,85.00%,17.14%,82.00%,56.00%,45.00%,12.92% -61,Gemini-1.5-Flash-002 (FC),53.15%,35.42%,60.84%,49.67%,39.00%,56.00%,54.00%,39.00%,24.00%,29.00%,60.86%,66.00%,55.71%,80.00%,50.00%,52.50%,93.33% -62,Gemini-1.5-Flash-001 (FC),51.40%,33.56%,62.41%,47.25%,41.75%,54.00%,46.00%,40.00%,22.50%,24.50%,53.14%,82.00%,24.29%,76.00%,68.00%,52.50%,78.75% -63,Gemini-1.0-Pro-002 (FC),45.85%,26.21%,58.11%,48.83%,42.50%,56.00%,48.00%,39.00%,7.50%,9.50%,76.43%,80.00%,72.86%,76.00%,60.00%,20.00%,75.42% +27,Gemini-1.5-Flash-001 (Prompt),82.84%,85.31%,83.79%,70.75%,84.25%,64.00%,64.00%,90.00%,91.00%,89.50%,79.14%,84.00%,74.29%,92.00%,84.00%,80.00%,69.17% +28,Command-R-Plus (Prompt) (Original),82.19%,80.90%,85.07%,71.08%,89.25%,60.00%,64.00%,91.50%,82.00%,79.00%,93.29%,98.00%,88.57%,90.00%,82.00%,75.00%,75.83% +29,mistral-large-2407 (FC),81.41%,86.62%,84.57%,73.00%,96.00%,57.00%,66.00%,92.00%,91.50%,90.00%,73.79%,99.00%,48.57%,94.00%,88.00%,82.50%,47.92% +30,Llama-3.1-8B-Instruct (Prompt),81.15%,83.62%,87.29%,73.00%,94.00%,59.00%,66.00%,94.50%,83.50%,83.50%,85.64%,97.00%,74.29%,96.00%,90.00%,77.50%,46.67% +31,xLAM-7b-r (FC),80.86%,81.40%,83.46%,73.08%,91.25%,56.00%,72.00%,93.50%,79.50%,79.50%,76.86%,98.00%,55.71%,92.00%,90.00%,75.00%,68.33% +32,Gemini-1.5-Flash-002 (Prompt),80.29%,79.69%,80.64%,74.25%,94.75%,60.00%,68.00%,91.50%,87.00%,66.00%,93.57%,100.00%,87.14%,92.00%,82.00%,55.00%,81.25% +33,Open-Mistral-Nemo-2407 (Prompt),79.66%,85.60%,91.23%,77.42%,92.25%,60.00%,80.00%,93.50%,87.00%,84.50%,95.93%,99.00%,92.86%,96.00%,88.00%,85.00%,9.58% +34,Mistral-Medium-2312 (Prompt),79.27%,74.02%,81.73%,70.58%,91.75%,56.00%,64.00%,91.50%,65.50%,68.50%,95.43%,98.00%,92.86%,92.00%,72.00%,67.50%,90.42% +35,Hermes-2-Pro-Llama-3-70B (FC),78.81%,78.85%,80.45%,59.92%,83.75%,54.00%,42.00%,80.00%,88.00%,87.50%,76.29%,94.00%,58.57%,82.00%,86.00%,77.50%,72.08% +36,Gemini-1.5-Flash-002 (FC),78.78%,81.21%,73.04%,65.83%,86.50%,57.00%,54.00%,91.50%,80.50%,87.00%,68.14%,72.00%,64.29%,90.00%,54.00%,80.00%,92.08% +37,GPT-3.5-Turbo-0125 (FC),78.52%,84.12%,84.11%,75.50%,94.50%,64.00%,68.00%,93.00%,88.00%,80.00%,95.43%,98.00%,92.86%,90.00%,86.00%,65.00%,33.75% +38,Open-Mistral-Nemo-2407 (FC),78.29%,81.21%,77.04%,63.33%,92.00%,36.00%,62.00%,92.00%,86.50%,83.00%,55.64%,97.00%,14.29%,90.00%,90.00%,72.50%,71.67% +39,Qwen2.5-1.5B-Instruct (Prompt),78.14%,75.19%,82.82%,70.25%,87.75%,55.00%,68.00%,85.50%,73.50%,71.50%,72.79%,97.00%,48.57%,94.00%,82.00%,82.50%,71.25% +40,Gemini-1.5-Flash-001 (FC),76.45%,77.21%,75.12%,64.83%,93.50%,55.00%,46.00%,94.50%,73.00%,76.50%,61.50%,93.00%,30.00%,88.00%,76.00%,75.00%,78.75% +41,Qwen2-7B-Instruct (Prompt),75.50%,74.85%,81.70%,67.42%,84.25%,60.00%,58.00%,87.50%,71.00%,73.50%,86.79%,95.00%,78.57%,88.00%,82.00%,70.00%,53.33% +42,Command-R-Plus (FC) (Original),75.47%,76.83%,78.61%,66.33%,87.00%,60.00%,52.00%,90.00%,82.00%,69.00%,88.93%,95.00%,82.86%,88.00%,80.00%,57.50%,57.50% +43,Hermes-2-Pro-Llama-3-8B (FC),74.14%,76.54%,75.48%,64.17%,90.50%,56.00%,46.00%,89.50%,79.50%,73.00%,69.93%,97.00%,42.86%,94.00%,78.00%,60.00%,59.17% +44,Llama-3.2-3B-Instruct (Prompt),74.03%,77.77%,69.41%,64.08%,81.25%,49.00%,62.00%,90.00%,80.50%,76.50%,78.14%,82.00%,74.29%,92.00%,50.00%,57.50%,77.50% +45,xLAM-8x7b-r (FC),73.93%,68.85%,78.43%,68.42%,79.25%,60.00%,66.00%,88.00%,63.50%,55.50%,87.71%,94.00%,81.43%,88.00%,68.00%,70.00%,76.25% +46,Hermes-2-Pro-Mistral-7B (FC),69.78%,72.83%,77.30%,61.33%,86.00%,56.00%,42.00%,87.50%,78.50%,64.00%,61.71%,92.00%,31.43%,94.00%,86.00%,67.50%,27.50% +47,Claude-3.5-Sonnet-20240620 (FC),69.19%,70.04%,66.27%,75.17%,93.50%,64.00%,68.00%,93.50%,64.50%,47.00%,97.57%,98.00%,97.14%,90.00%,40.00%,37.50%,77.50% +48,DBRX-Instruct (Prompt),68.89%,67.04%,75.04%,72.17%,92.50%,54.00%,70.00%,91.50%,56.50%,48.00%,90.14%,96.00%,84.29%,88.00%,62.00%,60.00%,51.67% +49,Hammer2.0-0.5b (FC),68.44%,66.79%,70.43%,62.17%,82.50%,52.00%,52.00%,80.00%,67.50%,57.50%,53.21%,95.00%,11.43%,86.00%,80.00%,62.50%,67.08% +50,xLAM-7b-fc-r (FC),67.87%,74.56%,65.75%,74.25%,93.75%,63.00%,66.00%,92.00%,78.00%,54.00%,84.50%,99.00%,70.00%,90.00%,66.00%,22.50%,49.58% +51,GPT-3.5-Turbo-0125 (Prompt),67.78%,65.04%,67.68%,62.67%,78.00%,48.00%,62.00%,83.00%,65.50%,49.00%,46.21%,91.00%,1.43%,90.00%,72.00%,62.50%,79.17% +52,Open-Mixtral-8x7b (Prompt),64.49%,57.94%,65.91%,68.75%,89.25%,53.00%,64.00%,86.00%,40.00%,37.00%,71.14%,88.00%,54.29%,88.00%,52.00%,52.50%,85.00% +53,GPT-4o-2024-08-06 (Prompt),63.57%,49.35%,69.93%,32.42%,66.25%,11.00%,20.00%,48.00%,74.00%,43.00%,49.71%,88.00%,11.43%,82.00%,78.00%,70.00%,95.00% +54,Claude-3-Opus-20240229 (FC tools-2024-04-04),62.73%,58.67%,62.05%,68.67%,89.00%,61.00%,56.00%,89.00%,41.00%,36.00%,88.71%,96.00%,81.43%,88.00%,44.00%,27.50%,81.67% +55,Gemini-1.0-Pro-002 (FC),62.08%,56.52%,65.14%,67.08%,93.25%,54.00%,54.00%,94.00%,39.50%,25.50%,84.07%,91.00%,77.14%,88.00%,66.00%,22.50%,72.08% +56,Gemini-1.0-Pro-002 (Prompt),59.28%,60.17%,59.66%,50.17%,63.50%,33.00%,54.00%,66.50%,60.50%,63.50%,45.14%,66.00%,24.29%,78.00%,68.00%,47.50%,54.17% +57,Meta-Llama-3-8B-Instruct (Prompt),58.94%,61.02%,66.70%,63.08%,88.25%,49.00%,52.00%,85.50%,51.50%,44.00%,83.29%,88.00%,78.57%,82.00%,44.00%,57.50%,19.58% +58,Mistral-small-2402 (FC),58.70%,57.27%,53.77%,67.08%,91.25%,58.00%,52.00%,93.50%,20.00%,48.50%,87.07%,97.00%,77.14%,92.00%,16.00%,20.00%,84.17% +59,Open-Mixtral-8x22b (FC),58.66%,61.08%,63.82%,71.33%,93.00%,65.00%,56.00%,94.50%,10.50%,68.00%,84.29%,100.00%,68.57%,94.00%,22.00%,55.00%,28.33% +60,MiniCPM3-4B (FC),57.87%,63.19%,48.70%,67.75%,83.25%,54.00%,66.00%,74.00%,60.50%,50.50%,44.79%,51.00%,38.57%,50.00%,40.00%,60.00%,73.33% +61,Nexusflow-Raven-v2 (FC),55.21%,46.15%,57.86%,57.58%,37.75%,63.00%,72.00%,53.00%,34.50%,39.50%,47.43%,82.00%,12.86%,86.00%,38.00%,60.00%,80.83% +62,mistral-large-2407 (Prompt),54.60%,62.27%,56.93%,46.58%,64.75%,39.00%,36.00%,77.00%,70.00%,55.50%,40.21%,59.00%,21.43%,80.00%,70.00%,37.50%,14.58% +63,Qwen2-1.5B-Instruct (Prompt),53.99%,59.73%,58.52%,55.92%,79.75%,42.00%,46.00%,80.00%,55.50%,47.50%,51.07%,85.00%,17.14%,82.00%,56.00%,45.00%,12.92% 64,Claude-3-Haiku-20240307 (FC tools-2024-04-04),42.79%,41.67%,47.52%,70.67%,96.00%,60.00%,56.00%,93.50%,2.00%,0.50%,92.07%,97.00%,87.14%,92.00%,6.00%,0.00%,28.33% 65,Llama-3.1-8B-Instruct (FC),38.61%,36.52%,49.93%,56.08%,50.25%,56.00%,62.00%,55.00%,0.00%,35.00%,58.21%,65.00%,51.43%,58.00%,56.00%,27.50%,1.67% 66,xLAM-1b-fc-r (FC),35.96%,39.94%,40.23%,71.25%,82.75%,59.00%,72.00%,85.50%,1.50%,1.50%,74.93%,97.00%,52.86%,86.00%,0.00%,0.00%,2.92% diff --git a/data_overall.csv b/data_overall.csv index 3b7910111..98eb1e271 100644 --- a/data_overall.csv +++ b/data_overall.csv @@ -1,71 +1,71 @@ -Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s),Latency Standard Deviation (s),Latency 95th Percentile (s),Non-Live AST Acc,Non-Live Simple AST,Non-Live Multiple AST,Non-Live Parallel AST,Non-Live Parallel Multiple AST,Non-Live Exec Acc,Non-Live Simple Exec,Non-Live Multiple Exec,Non-Live Parallel Exec,Non-Live Parallel Multiple Exec,Live Acc,Live Simple AST,Live Multiple AST,Live Parallel AST,Live Parallel Multiple AST,Multi Turn Acc,Multi Turn Base,Multi Turn Miss Func,Multi Turn Miss Param,Multi Turn Long Context,Multi Turn Composite,Relevance Detection,Irrelevance Detection,Organization,License -1,68.94%,GPT-4o-2024-08-06 (FC),https://openai.com/index/hello-gpt-4o/,5.98,1.33,3.67,3.12,85.90%,74.58%,92.50%,92.00%,84.50%,85.64%,87.07%,92.00%,86.00%,77.50%,75.43%,74.42%,75.12%,81.25%,70.83%,45.25%,54.50%,44.00%,34.50%,48.00%,N/A,63.41%,82.93%,OpenAI,Proprietary -2,66.68%,GPT-4-turbo-2024-04-09 (FC),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,23.88,2.36,4.77,5.14,84.67%,69.17%,91.00%,90.50%,88.00%,84.32%,88.29%,88.00%,86.00%,75.00%,76.23%,77.52%,77.63%,81.25%,66.67%,39.25%,54.50%,32.50%,29.50%,40.50%,N/A,73.17%,79.76%,OpenAI,Proprietary -3,65.61%,o1-preview-2024-09-12 (Prompt),https://openai.com/index/introducing-openai-o1-preview/,164.48,18.78,10.9,37.28,86.42%,78.17%,93.00%,89.50%,85.00%,88.88%,99.50%,92.00%,84.00%,80.00%,73.08%,80.62%,76.76%,75.00%,79.17%,36.62%,43.00%,38.50%,32.50%,32.50%,N/A,73.17%,74.60%,OpenAI,Proprietary -4,64.24%,o1-mini-2024-09-12 (Prompt),https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/,23.11,6.81,6.64,14.67,81.31%,73.75%,90.00%,81.00%,80.50%,84.00%,88.50%,92.00%,78.00%,77.50%,75.39%,73.26%,71.07%,75.00%,62.50%,33.50%,40.50%,32.50%,26.50%,34.50%,N/A,48.78%,88.04%,OpenAI,Proprietary -5,61.53%,GPT-4-turbo-2024-04-09 (Prompt),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,36.7,1.31,2.66,2.46,91.46%,82.33%,95.00%,95.00%,93.50%,90.00%,99.50%,98.00%,80.00%,82.50%,69.04%,85.66%,84.57%,87.50%,75.00%,26.75%,36.50%,24.00%,17.00%,29.50%,N/A,82.93%,58.95%,OpenAI,Proprietary -6,61.29%,Claude-3.5-Sonnet-20240620 (FC),https://www.anthropic.com/news/claude-3-5-sonnet,7.68,4.02,4.69,6.96,70.04%,75.17%,93.50%,64.50%,47.00%,66.27%,97.57%,90.00%,40.00%,37.50%,74.68%,80.23%,76.76%,56.25%,58.33%,40.00%,46.00%,39.00%,35.00%,40.00%,N/A,68.29%,74.58%,Anthropic,Proprietary -7,60.72%,GPT-4o-mini-2024-07-18 (FC),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.37,1.41,11.92,2.63,84.25%,73.50%,90.50%,90.00%,83.00%,84.12%,83.50%,92.00%,86.00%,75.00%,70.19%,72.87%,74.45%,87.50%,70.83%,28.25%,40.50%,15.50%,24.00%,33.00%,N/A,80.49%,71.77%,OpenAI,Proprietary -8,59.94%,Functionary-Medium-v3.1 (FC),https://huggingface.co/meetkai/functionary-medium-v3.1,N/A,38.24,130.95,76.6,89.52%,76.08%,96.50%,95.00%,90.50%,89.77%,97.57%,94.00%,90.00%,77.50%,73.48%,79.46%,81.87%,68.75%,70.83%,17.25%,28.50%,12.50%,23.50%,4.50%,N/A,70.73%,73.32%,MeetKai,MIT -9,59.55%,Gemini-1.5-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,3.85,1.57,3.91,2.44,88.96%,79.83%,94.00%,93.00%,89.00%,91.77%,98.57%,96.00%,90.00%,82.50%,74.41%,77.52%,76.76%,87.50%,75.00%,14.62%,16.00%,18.00%,13.50%,11.00%,N/A,65.85%,77.30%,Google,Proprietary -10,59.27%,GPT-4o-mini-2024-07-18 (Prompt),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.53,1.01,6.36,1.77,86.23%,79.42%,93.00%,86.50%,86.00%,91.12%,100.00%,96.00%,86.00%,82.50%,74.63%,79.46%,74.35%,93.75%,70.83%,14.50%,20.00%,11.50%,10.00%,16.50%,N/A,75.61%,81.00%,OpenAI,Proprietary -11,58.95%,xLAM-8x22b-r (FC),https://huggingface.co/Salesforce/xLAM-8x22b-r,N/A,N/A,N/A,N/A,88.15%,81.08%,93.00%,91.50%,87.00%,90.11%,96.43%,96.00%,88.00%,80.00%,71.97%,78.29%,80.14%,75.00%,62.50%,17.38%,25.50%,20.50%,15.00%,8.50%,N/A,85.37%,67.29%,Salesforce,cc-by-nc-4.0 -12,58.67%,Gemini-1.5-Flash-002 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.24,0.73,0.81,1.07,86.58%,75.33%,91.50%,91.50%,88.00%,89.48%,95.93%,96.00%,86.00%,80.00%,76.28%,77.91%,78.30%,93.75%,66.67%,12.12%,14.00%,17.50%,10.50%,6.50%,N/A,85.37%,78.54%,Google,Proprietary -13,57.43%,Gemini-1.5-Pro-001 (Prompt),https://deepmind.google/technologies/gemini/pro/,3.83,1.52,1.8,2.8,83.88%,73.00%,91.50%,88.00%,83.00%,87.52%,91.57%,94.00%,82.00%,82.50%,73.12%,67.44%,69.24%,93.75%,66.67%,13.00%,15.00%,14.50%,13.50%,9.00%,N/A,56.10%,85.00%,Google,Proprietary -14,57.27%,ToolACE-8B (FC),https://huggingface.co/Team-ACE/ToolACE-8B,N/A,N/A,N/A,N/A,87.06%,76.25%,93.00%,90.00%,89.00%,89.52%,98.57%,94.00%,88.00%,77.50%,74.99%,66.67%,74.93%,81.25%,70.83%,7.88%,8.50%,10.50%,5.50%,7.00%,N/A,80.49%,85.71%,Huawei Noah & USTC,Apache-2.0 -15,57.19%,Gemini-1.5-Flash-001 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.29,0.65,0.41,0.98,86.17%,73.17%,90.50%,92.00%,89.00%,87.68%,84.21%,94.00%,90.00%,82.50%,69.21%,74.42%,75.12%,93.75%,75.00%,16.62%,23.50%,17.00%,14.00%,12.00%,N/A,82.93%,67.84%,Google,Proprietary -16,55.51%,mistral-large-2407 (FC),https://mistral.ai/news/mistral-large-2407/,8.93,2.84,7.29,5.19,86.62%,73.00%,92.00%,91.50%,90.00%,84.57%,73.79%,94.00%,88.00%,82.50%,68.37%,81.78%,79.27%,68.75%,75.00%,16.75%,23.00%,12.50%,15.50%,16.00%,N/A,75.61%,49.44%,Mistral AI,Proprietary -17,54.98%,Claude-3-Opus-20240229 (FC tools-2024-04-04),https://www.anthropic.com/news/claude-3-family,28.03,10.99,7.8,18.69,58.67%,68.67%,89.00%,41.00%,36.00%,62.05%,88.71%,88.00%,44.00%,27.50%,74.10%,74.81%,75.60%,50.00%,41.67%,28.12%,30.00%,29.50%,28.00%,25.00%,N/A,63.41%,77.80%,Anthropic,Proprietary -18,54.65%,Hammer2.0-7b (FC),https://huggingface.co/MadeAgents/Hammer2.0-7b,N/A,N/A,N/A,N/A,90.27%,80.58%,95.00%,93.50%,92.00%,89.25%,90.00%,94.00%,88.00%,85.00%,69.79%,74.42%,77.15%,81.25%,75.00%,5.62%,9.50%,2.00%,7.50%,3.50%,N/A,95.12%,68.46%,MadeAgents,cc-by-nc-4.0 -19,54.59%,Functionary-Small-v3.1 (FC),https://huggingface.co/meetkai/functionary-small-v3.1,N/A,1.68,1.97,4.29,86.42%,74.67%,94.50%,89.50%,87.00%,85.95%,88.79%,92.00%,88.00%,75.00%,70.41%,75.19%,75.89%,81.25%,62.50%,8.38%,15.50%,0.50%,12.50%,5.00%,N/A,85.37%,68.62%,MeetKai,MIT -20,54.29%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,88.90%,76.58%,95.50%,93.50%,90.00%,89.34%,91.36%,96.00%,90.00%,80.00%,61.13%,77.13%,71.46%,87.50%,62.50%,14.25%,18.50%,15.50%,10.00%,13.00%,N/A,92.68%,58.38%,Meta,Meta Llama 3 Community -21,53.42%,GoGoAgent,https://gogoagent.ai,N/A,56.08,38.06,117.98,86.00%,75.50%,92.50%,92.00%,84.00%,88.05%,94.71%,94.00%,86.00%,77.50%,72.46%,71.32%,72.42%,87.50%,62.50%,0.25%,0.50%,0.50%,0.00%,0.00%,N/A,87.80%,81.89%,BitAgent,Proprietary -22,53.00%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,88.52%,81.08%,92.50%,91.00%,89.50%,87.89%,83.57%,96.00%,92.00%,80.00%,69.48%,79.46%,77.24%,68.75%,62.50%,2.12%,3.50%,2.00%,1.50%,1.50%,N/A,87.80%,68.76%,Google,gemma-terms-of-use -23,52.64%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,N/A,N/A,N/A,85.79%,75.67%,96.00%,88.50%,83.00%,88.13%,94.50%,92.00%,86.00%,80.00%,65.97%,72.48%,72.32%,62.50%,66.67%,6.38%,8.00%,7.50%,6.00%,4.00%,N/A,92.68%,64.95%,Qwen,apache-2.0 -24,52.31%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,N/A,N/A,N/A,68.85%,68.42%,88.00%,63.50%,55.50%,78.43%,87.71%,88.00%,68.00%,70.00%,69.12%,68.22%,76.76%,62.50%,54.17%,13.88%,18.50%,14.00%,12.50%,10.50%,N/A,87.80%,68.12%,Salesforce,cc-by-nc-4.0 -25,52.25%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,87.17%,75.17%,95.50%,90.50%,87.50%,89.21%,95.86%,96.00%,80.00%,85.00%,66.15%,78.68%,79.65%,68.75%,66.67%,5.50%,9.50%,4.50%,5.50%,2.50%,N/A,92.68%,52.78%,Meta,Meta Llama 3 Community -26,52.20%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.03,1.02,1.33,1.76,84.12%,75.50%,93.00%,88.00%,80.00%,84.11%,95.43%,90.00%,86.00%,65.00%,61.22%,74.42%,77.82%,43.75%,50.00%,16.88%,28.00%,13.00%,17.00%,9.50%,N/A,97.56%,35.16%,OpenAI,Proprietary -27,51.87%,xLAM-7b-r (FC),https://huggingface.co/Salesforce/xLAM-7b-r,N/A,N/A,N/A,N/A,81.40%,73.08%,93.50%,79.50%,79.50%,83.46%,76.86%,92.00%,90.00%,75.00%,67.88%,71.32%,73.48%,31.25%,58.33%,6.88%,11.50%,7.00%,6.00%,3.00%,N/A,97.56%,64.05%,Salesforce,cc-by-nc-4.0 -28,51.68%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,8.7,2.93,13.22,4.14,86.92%,77.67%,92.50%,90.00%,87.50%,88.23%,91.43%,96.00%,88.00%,77.50%,68.46%,72.87%,61.33%,81.25%,66.67%,0.50%,0.50%,0.50%,0.00%,1.00%,N/A,65.85%,74.85%,Mistral AI,Proprietary -29,51.50%,Gemma-2-9b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,84.38%,74.50%,92.00%,88.00%,83.00%,85.18%,84.21%,94.00%,90.00%,72.50%,69.21%,73.64%,73.58%,56.25%,58.33%,0.75%,1.00%,2.00%,0.00%,0.00%,N/A,87.80%,72.45%,Google,gemma-terms-of-use -30,50.96%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,7.27,3.45,13.59,7.45,74.02%,70.58%,91.50%,65.50%,68.50%,81.73%,95.43%,92.00%,72.00%,67.50%,73.10%,68.60%,73.00%,81.25%,50.00%,0.50%,1.50%,0.00%,0.00%,0.50%,N/A,60.98%,95.21%,Mistral AI,Proprietary -31,50.77%,Command-R-Plus (Prompt) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,11.8,1.11,0.71,2.07,80.90%,71.08%,91.50%,82.00%,79.00%,85.07%,93.29%,90.00%,82.00%,75.00%,69.75%,66.67%,70.30%,68.75%,70.83%,0.38%,1.00%,0.00%,0.00%,0.50%,N/A,73.17%,72.83%,Cohere For AI,cc-by-nc-4.0 -32,50.75%,Gorilla-OpenFunctions-v2 (FC),https://gorilla.cs.berkeley.edu/blogs/7_open_functions_v2.html,N/A,6.96,9.79,19.61,86.29%,77.67%,95.00%,89.00%,83.50%,86.09%,95.86%,96.00%,80.00%,72.50%,67.44%,73.64%,58.73%,68.75%,41.67%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,73.17%,75.05%,Gorilla LLM,Apache 2.0 -33,49.78%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,83.62%,73.00%,94.50%,83.50%,83.50%,87.29%,85.64%,96.00%,90.00%,77.50%,57.93%,71.32%,72.23%,50.00%,45.83%,10.25%,14.00%,10.00%,7.50%,9.50%,N/A,78.05%,41.62%,Meta,Meta Llama 3 Community -34,49.68%,Hammer2.0-1.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-1.5b,N/A,N/A,N/A,N/A,84.06%,75.25%,90.50%,88.00%,82.50%,88.95%,93.29%,92.00%,88.00%,82.50%,63.22%,70.54%,68.56%,56.25%,66.67%,1.38%,2.50%,0.50%,1.00%,1.50%,N/A,92.68%,60.64%,MadeAgents,cc-by-nc-4.0 -35,49.56%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,0.81,1.28,6.09,2.44,81.21%,63.33%,92.00%,86.50%,83.00%,77.04%,55.64%,90.00%,90.00%,72.50%,62.37%,71.71%,67.79%,62.50%,66.67%,8.00%,12.00%,5.00%,10.50%,4.50%,N/A,60.98%,62.40%,Mistral AI,Proprietary -36,48.29%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,N/A,N/A,N/A,82.33%,72.83%,91.50%,84.50%,80.50%,85.91%,85.64%,92.00%,86.00%,80.00%,57.49%,65.12%,55.35%,43.75%,54.17%,2.75%,5.00%,1.50%,3.00%,1.50%,N/A,95.12%,72.55%,IBM,Apache-2.0 -37,47.80%,GPT-4o-2024-08-06 (Prompt),https://openai.com/index/hello-gpt-4o/,7.37,0.99,2.11,2.16,49.35%,32.42%,48.00%,74.00%,43.00%,69.93%,49.71%,82.00%,78.00%,70.00%,62.19%,42.64%,42.82%,25.00%,41.67%,17.62%,21.50%,14.00%,15.00%,20.00%,N/A,36.59%,94.19%,OpenAI,Proprietary -38,47.12%,Qwen2.5-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct,N/A,N/A,N/A,N/A,75.19%,70.25%,85.50%,73.50%,71.50%,82.82%,72.79%,94.00%,82.00%,82.50%,61.71%,64.73%,59.88%,50.00%,41.67%,1.50%,2.00%,2.00%,1.00%,1.00%,N/A,75.61%,67.17%,Qwen,apache-2.0 -39,47.00%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.42,0.85,1.86,1.39,65.04%,62.67%,83.00%,65.50%,49.00%,67.68%,46.21%,90.00%,72.00%,62.50%,67.48%,63.57%,64.61%,68.75%,54.17%,5.75%,7.50%,7.00%,4.00%,4.50%,N/A,80.49%,75.47%,OpenAI,Proprietary -40,46.52%,Hermes-2-Pro-Llama-3-70B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-70B,N/A,N/A,N/A,N/A,78.85%,59.92%,80.00%,88.00%,87.50%,80.45%,76.29%,82.00%,86.00%,77.50%,60.51%,63.18%,53.04%,56.25%,66.67%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,60.98%,70.27%,NousResearch,apache-2.0 -41,45.44%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,N/A,N/A,N/A,76.54%,64.17%,89.50%,79.50%,73.00%,75.48%,69.93%,94.00%,78.00%,60.00%,61.79%,67.44%,64.42%,56.25%,45.83%,0.38%,1.00%,0.00%,0.50%,0.00%,N/A,56.10%,58.50%,NousResearch,apache-2.0 -42,44.73%,Gemini-1.5-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,2.91,1.49,2.43,2.56,38.27%,54.08%,39.50%,29.50%,30.00%,69.54%,69.64%,80.00%,76.00%,52.50%,64.59%,58.91%,61.33%,81.25%,58.33%,12.88%,16.00%,9.00%,11.00%,15.50%,N/A,70.73%,74.44%,Google,Proprietary -43,44.70%,Command-R-Plus (FC) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,4.14,2.99,5.51,6.27,76.83%,66.33%,90.00%,82.00%,69.00%,78.61%,88.93%,88.00%,80.00%,57.50%,57.26%,66.67%,60.56%,56.25%,50.00%,1.38%,1.50%,0.00%,1.50%,2.50%,N/A,92.68%,53.32%,Cohere For AI,cc-by-nc-4.0 -44,44.12%,Qwen2-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-7B-Instruct,N/A,N/A,N/A,N/A,74.85%,67.42%,87.50%,71.00%,73.50%,81.70%,86.79%,88.00%,82.00%,70.00%,54.24%,59.30%,62.20%,50.00%,66.67%,2.63%,3.50%,3.50%,1.50%,2.00%,N/A,87.80%,47.41%,Qwen,apache-2.0 -45,43.77%,Gemini-1.5-Pro-001 (FC),https://deepmind.google/technologies/gemini/pro/,3.02,1.5,1.25,2.81,31.77%,35.58%,39.50%,26.50%,25.50%,70.39%,75.07%,80.00%,74.00%,52.50%,65.53%,57.75%,58.24%,75.00%,41.67%,10.88%,13.50%,7.50%,9.00%,13.50%,N/A,63.41%,81.22%,Google,Proprietary -46,43.74%,DBRX-Instruct (Prompt),https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm,5.29,3.62,7.87,16.88,67.04%,72.17%,91.50%,56.50%,48.00%,75.04%,90.14%,88.00%,62.00%,60.00%,62.33%,74.81%,71.65%,75.00%,58.33%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,87.80%,48.98%,Databricks,Databricks Open Model -47,43.67%,Mistral-small-2402 (FC),https://docs.mistral.ai/guides/model-selection/,2.34,1.56,4.23,2.75,57.27%,67.08%,93.50%,20.00%,48.50%,53.77%,87.07%,92.00%,16.00%,20.00%,70.19%,63.57%,71.46%,12.50%,12.50%,2.12%,3.50%,0.00%,2.50%,2.50%,N/A,82.93%,78.43%,Mistral AI,Proprietary -48,43.37%,Open-Mistral-Nemo-2407 (Prompt),https://mistral.ai/news/mistral-nemo/,1.2,1.02,0.66,1.96,85.60%,77.42%,93.50%,87.00%,84.50%,91.23%,95.93%,96.00%,88.00%,85.00%,50.33%,78.29%,74.54%,75.00%,62.50%,0.12%,0.00%,0.50%,0.00%,0.00%,N/A,90.24%,10.16%,Mistral AI,Proprietary -49,43.35%,Open-Mixtral-8x7b (Prompt),https://mistral.ai/news/mixtral-of-experts/,1.79,1.51,4.79,2.93,57.94%,68.75%,86.00%,40.00%,37.00%,65.91%,71.14%,88.00%,52.00%,52.50%,64.95%,57.36%,65.00%,68.75%,50.00%,0.62%,1.50%,0.00%,0.00%,1.00%,N/A,68.29%,76.16%,Mistral AI,Proprietary -50,42.87%,Gemini-1.5-Flash-002 (FC),https://deepmind.google/technologies/gemini/flash/,0.17,0.62,1.26,0.98,35.42%,49.67%,39.00%,24.00%,29.00%,60.84%,60.86%,80.00%,50.00%,52.50%,67.35%,58.14%,57.96%,68.75%,50.00%,8.12%,11.00%,5.50%,8.00%,8.00%,N/A,60.98%,87.64%,Google,Proprietary -51,42.35%,Llama-3.2-3B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,77.77%,64.08%,90.00%,80.50%,76.50%,69.41%,78.14%,92.00%,50.00%,57.50%,50.91%,47.67%,44.74%,0.00%,29.17%,2.12%,1.50%,2.00%,2.00%,3.00%,N/A,63.41%,68.81%,Meta,Meta Llama 3 Community -52,42.17%,Hermes-2-Pro-Mistral-7B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B,N/A,N/A,N/A,N/A,72.83%,61.33%,87.50%,78.50%,64.00%,77.30%,61.71%,94.00%,86.00%,67.50%,56.46%,64.73%,59.40%,43.75%,37.50%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,75.61%,38.95%,NousResearch,apache-2.0 -53,42.05%,Open-Mixtral-8x22b (FC),https://mistral.ai/news/mixtral-8x22b/,4.84,2.83,14.59,5.36,61.08%,71.33%,94.50%,10.50%,68.00%,63.82%,84.29%,94.00%,22.00%,55.00%,66.86%,73.26%,72.32%,6.25%,41.67%,0.62%,1.00%,0.00%,1.00%,0.50%,N/A,82.93%,43.94%,Mistral AI,Proprietary -54,41.69%,Gemini-1.5-Flash-001 (FC),https://deepmind.google/technologies/gemini/flash/,0.19,0.51,0.58,0.82,33.56%,47.25%,40.00%,22.50%,24.50%,62.41%,53.14%,76.00%,68.00%,52.50%,64.90%,58.14%,60.46%,43.75%,41.67%,8.75%,10.00%,6.50%,9.50%,9.00%,N/A,58.54%,76.12%,Google,Proprietary -55,40.44%,xLAM-7b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-7b-fc-r,N/A,N/A,N/A,N/A,74.56%,74.25%,92.00%,78.00%,54.00%,65.75%,84.50%,90.00%,66.00%,22.50%,53.44%,75.58%,57.28%,43.75%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,70.73%,46.05%,Salesforce,cc-by-nc-4.0 -56,40.41%,Hammer2.0-0.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-0.5b,N/A,N/A,N/A,N/A,66.79%,62.17%,80.00%,67.50%,57.50%,70.43%,53.21%,86.00%,80.00%,62.50%,52.42%,48.84%,44.07%,62.50%,41.67%,0.38%,0.50%,0.00%,0.50%,0.50%,N/A,85.37%,64.51%,MadeAgents,cc-by-nc-4.0 -57,40.36%,Claude-3-Haiku-20240307 (FC tools-2024-04-04),https://www.anthropic.com/news/claude-3-family,0.29,1.46,1.22,2.21,41.67%,70.67%,93.50%,2.00%,0.50%,47.52%,92.07%,92.00%,6.00%,0.00%,57.66%,74.03%,77.15%,0.00%,4.17%,20.62%,27.50%,15.00%,17.50%,22.50%,N/A,97.56%,29.37%,Anthropic,Proprietary -58,39.54%,MiniCPM3-4B (FC),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,N/A,N/A,N/A,63.19%,67.75%,74.00%,60.50%,50.50%,48.70%,44.79%,50.00%,40.00%,60.00%,59.88%,56.98%,49.47%,56.25%,33.33%,0.88%,1.50%,2.00%,0.00%,0.00%,N/A,58.54%,73.64%,openbmb,Apache-2.0 -59,39.19%,mistral-large-2407 (Prompt),https://mistral.ai/news/mistral-large-2407/,15.57,2.74,7.73,5.95,62.27%,46.58%,77.00%,70.00%,55.50%,56.93%,40.21%,80.00%,70.00%,37.50%,53.35%,45.74%,73.10%,68.75%,54.17%,9.62%,14.50%,11.00%,6.00%,7.00%,N/A,90.24%,22.38%,Mistral AI,Proprietary -60,36.52%,Nexusflow-Raven-v2 (FC),https://huggingface.co/Nexusflow/NexusRaven-V2-13B,N/A,2.92,2.17,7.4,46.15%,57.58%,53.00%,34.50%,39.50%,57.86%,47.43%,86.00%,38.00%,60.00%,53.49%,39.92%,38.48%,56.25%,41.67%,0.88%,1.50%,0.50%,0.50%,1.00%,N/A,65.85%,77.90%,Nexusflow,Apache 2.0 -61,36.48%,Meta-Llama-3-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,61.02%,63.08%,85.50%,51.50%,44.00%,66.70%,83.29%,82.00%,44.00%,57.50%,50.51%,60.85%,60.75%,37.50%,20.83%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,75.61%,27.39%,Meta,Meta Llama 3 Community -62,35.71%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,1.1,1.18,2.45,3.2,26.21%,48.83%,39.00%,7.50%,9.50%,58.11%,76.43%,76.00%,60.00%,20.00%,58.91%,58.91%,56.12%,37.50%,20.83%,2.38%,2.50%,1.00%,3.00%,3.00%,N/A,68.29%,69.31%,Google,Proprietary -63,35.11%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,1.43,1.21,1.48,2.68,56.29%,42.17%,51.00%,68.50%,63.50%,62.39%,48.57%,76.00%,70.00%,55.00%,45.67%,41.47%,36.93%,68.75%,33.33%,0.75%,1.50%,1.50%,0.00%,0.00%,N/A,80.49%,55.48%,Google,Proprietary -64,31.04%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,N/A,N/A,N/A,59.73%,55.92%,80.00%,55.50%,47.50%,58.52%,51.07%,82.00%,56.00%,45.00%,39.00%,50.39%,40.50%,25.00%,20.83%,0.12%,0.00%,0.50%,0.00%,0.00%,N/A,75.61%,22.92%,Qwen,apache-2.0 -65,26.16%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,10.55,35.79,24.18,25.15%,49.58%,24.50%,11.50%,15.00%,31.23%,53.43%,34.00%,30.00%,7.50%,44.47%,48.45%,52.56%,31.25%,25.00%,2.75%,4.50%,2.00%,2.00%,2.50%,N/A,100.00%,43.86%,Meta,Meta Llama 3 Community -66,25.08%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,2.55,1.28,0.58,2.34,16.33%,10.83%,36.50%,11.50%,6.50%,9.38%,13.00%,18.00%,4.00%,2.50%,53.98%,18.22%,45.90%,12.50%,8.33%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,41.46%,81.47%,Mistral AI,Proprietary -67,24.81%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,N/A,N/A,N/A,39.94%,71.25%,85.50%,1.50%,1.50%,40.23%,74.93%,86.00%,0.00%,0.00%,38.34%,63.18%,54.19%,0.00%,0.00%,0.12%,0.00%,0.00%,0.00%,0.50%,N/A,97.56%,7.06%,Salesforce,cc-by-nc-4.0 -68,23.95%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,36.52%,56.08%,55.00%,0.00%,35.00%,49.93%,58.21%,58.00%,56.00%,27.50%,33.23%,48.06%,47.64%,31.25%,37.50%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,92.68%,5.29%,Meta,Meta Llama 3 Community -69,20.21%,Gemma-2-2b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,12.19%,7.25%,41.50%,0.00%,0.00%,12.88%,5.50%,46.00%,0.00%,0.00%,41.63%,11.24%,11.96%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,12.20%,79.93%,Google,gemma-terms-of-use -70,17.93%,Llama-3.2-1B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,22.77%,25.08%,32.00%,24.00%,10.00%,19.11%,27.93%,18.00%,28.00%,2.50%,29.85%,25.97%,4.82%,6.25%,4.17%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,48.78%,54.42%,Meta,Meta Llama 3 Community \ No newline at end of file +Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s),Latency Standard Deviation (s),Latency 95th Percentile (s),Non-Live AST Acc,Non-Live Simple AST,Non-Live Multiple AST,Non-Live Parallel AST,Non-Live Parallel Multiple AST,Non-Live Exec Acc,Non-Live Simple Exec,Non-Live Multiple Exec,Non-Live Parallel Exec,Non-Live Parallel Multiple Exec,Live Acc,Live Simple AST,Live Multiple AST,Live Parallel AST,Live Parallel Multiple AST,Multi Turn Acc,Multi Turn Base,Multi Turn Miss Func,Multi Turn Miss Param,Multi Turn Long Context,Relevance Detection,Irrelevance Detection,Organization,License +1,68.94%,GPT-4o-2024-08-06 (FC),https://openai.com/index/hello-gpt-4o/,8.22,1.51,4.53,3.29,85.90%,74.58%,92.50%,92.00%,84.50%,85.64%,87.07%,92.00%,86.00%,77.50%,75.43%,74.42%,75.12%,81.25%,70.83%,45.25%,54.50%,44.00%,34.50%,48.00%,63.41%,82.93%,OpenAI,Proprietary +2,66.68%,GPT-4-turbo-2024-04-09 (FC),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,33.0,2.54,5.74,4.98,84.67%,69.17%,91.00%,90.50%,88.00%,84.32%,88.29%,88.00%,86.00%,75.00%,76.23%,77.52%,77.63%,81.25%,66.67%,39.25%,54.50%,32.50%,29.50%,40.50%,73.17%,79.76%,OpenAI,Proprietary +3,65.61%,o1-preview-2024-09-12 (Prompt),https://openai.com/index/introducing-openai-o1-preview/,199.71,20.13,12.38,40.02,86.42%,78.17%,93.00%,89.50%,85.00%,88.88%,99.50%,92.00%,84.00%,80.00%,73.08%,80.62%,76.76%,75.00%,79.17%,36.62%,43.00%,38.50%,32.50%,32.50%,73.17%,74.60%,OpenAI,Proprietary +4,64.24%,o1-mini-2024-09-12 (Prompt),https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/,29.59,7.81,7.99,17.34,81.31%,73.75%,90.00%,81.00%,80.50%,84.00%,88.50%,92.00%,78.00%,77.50%,75.39%,73.26%,71.07%,75.00%,62.50%,33.50%,40.50%,32.50%,26.50%,34.50%,48.78%,88.04%,OpenAI,Proprietary +5,61.53%,GPT-4-turbo-2024-04-09 (Prompt),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,51.78,1.43,3.31,2.74,91.46%,82.33%,95.00%,95.00%,93.50%,90.00%,99.50%,98.00%,80.00%,82.50%,69.04%,85.66%,84.57%,87.50%,75.00%,26.75%,36.50%,24.00%,17.00%,29.50%,82.93%,58.95%,OpenAI,Proprietary +6,61.29%,Claude-3.5-Sonnet-20240620 (FC),https://www.anthropic.com/news/claude-3-5-sonnet,8.61,3.88,5.16,6.6,70.04%,75.17%,93.50%,64.50%,47.00%,66.27%,97.57%,90.00%,40.00%,37.50%,74.68%,80.23%,76.76%,56.25%,58.33%,40.00%,46.00%,39.00%,35.00%,40.00%,68.29%,74.58%,Anthropic,Proprietary +7,60.72%,GPT-4o-mini-2024-07-18 (FC),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.51,1.64,14.77,2.73,84.25%,73.50%,90.50%,90.00%,83.00%,84.12%,83.50%,92.00%,86.00%,75.00%,70.19%,72.87%,74.45%,87.50%,70.83%,28.25%,40.50%,15.50%,24.00%,33.00%,80.49%,71.77%,OpenAI,Proprietary +8,59.94%,Functionary-Medium-v3.1 (FC),https://huggingface.co/meetkai/functionary-medium-v3.1,N/A,50.7,170.09,248.31,89.52%,76.08%,96.50%,95.00%,90.50%,89.77%,97.57%,94.00%,90.00%,77.50%,73.48%,79.46%,81.87%,68.75%,70.83%,17.25%,28.50%,12.50%,23.50%,4.50%,70.73%,73.32%,MeetKai,MIT +9,59.80%,Gemini-1.5-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,6.54,2.99,4.53,4.96,88.00%,77.50%,93.00%,92.50%,89.00%,91.41%,97.14%,94.00%,92.00%,82.50%,74.28%,79.84%,77.72%,87.50%,79.17%,16.25%,20.00%,15.00%,14.50%,15.50%,75.61%,75.10%,Google,Proprietary +10,59.27%,GPT-4o-mini-2024-07-18 (Prompt),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.79,1.21,8.2,2.13,86.23%,79.42%,93.00%,86.50%,86.00%,91.12%,100.00%,96.00%,86.00%,82.50%,74.63%,79.46%,74.35%,93.75%,70.83%,14.50%,20.00%,11.50%,10.00%,16.50%,75.61%,81.00%,OpenAI,Proprietary +11,59.13%,Gemini-1.5-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,5.43,2.09,2.84,4.04,87.96%,74.83%,95.00%,91.50%,90.50%,85.82%,78.79%,94.00%,88.00%,82.50%,72.41%,74.81%,74.64%,87.50%,70.83%,19.13%,26.00%,13.50%,19.50%,17.50%,73.17%,73.15%,Google,Proprietary +12,58.95%,xLAM-8x22b-r (FC),https://huggingface.co/Salesforce/xLAM-8x22b-r,N/A,N/A,N/A,N/A,88.15%,81.08%,93.00%,91.50%,87.00%,90.11%,96.43%,96.00%,88.00%,80.00%,71.97%,78.29%,80.14%,75.00%,62.50%,17.38%,25.50%,20.50%,15.00%,8.50%,85.37%,67.29%,Salesforce,cc-by-nc-4.0 +13,57.48%,Gemini-1.5-Pro-001 (Prompt),https://deepmind.google/technologies/gemini/pro/,5.97,1.86,3.5,2.91,83.94%,72.75%,90.00%,91.50%,81.50%,86.30%,93.71%,88.00%,86.00%,77.50%,73.83%,74.03%,72.32%,93.75%,75.00%,13.12%,14.50%,13.50%,13.50%,11.00%,63.41%,82.00%,Google,Proprietary +14,57.27%,ToolACE-8B (FC),https://huggingface.co/Team-ACE/ToolACE-8B,N/A,N/A,N/A,N/A,87.06%,76.25%,93.00%,90.00%,89.00%,89.52%,98.57%,94.00%,88.00%,77.50%,74.99%,66.67%,74.93%,81.25%,70.83%,7.88%,8.50%,10.50%,5.50%,7.00%,80.49%,85.71%,Huawei Noah & USTC,Apache-2.0 +15,57.19%,Gemini-1.5-Pro-001 (FC),https://deepmind.google/technologies/gemini/pro/,4.75,1.82,4.85,2.65,83.98%,69.42%,93.00%,91.00%,82.50%,88.39%,91.57%,92.00%,90.00%,80.00%,72.81%,73.64%,70.59%,81.25%,62.50%,12.75%,16.00%,11.00%,12.50%,11.50%,63.41%,80.18%,Google,Proprietary +16,56.23%,Gemini-1.5-Flash-001 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.49,1.17,1.24,1.71,85.31%,70.75%,90.00%,91.00%,89.50%,83.79%,79.14%,92.00%,84.00%,80.00%,68.24%,74.81%,76.18%,93.75%,79.17%,17.62%,25.50%,16.00%,12.00%,17.00%,87.80%,62.18%,Google,Proprietary +17,55.51%,mistral-large-2407 (FC),https://mistral.ai/news/mistral-large-2407/,12.52,2.96,8.83,5.16,86.62%,73.00%,92.00%,91.50%,90.00%,84.57%,73.79%,94.00%,88.00%,82.50%,68.37%,81.78%,79.27%,68.75%,75.00%,16.75%,23.00%,12.50%,15.50%,16.00%,75.61%,49.44%,Mistral AI,Proprietary +18,55.33%,Gemini-1.5-Flash-002 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.42,1.14,1.55,1.7,79.69%,74.25%,91.50%,87.00%,66.00%,80.64%,93.57%,92.00%,82.00%,55.00%,73.21%,77.52%,74.73%,87.50%,58.33%,12.50%,15.00%,14.50%,9.00%,11.50%,78.05%,75.65%,Google,Proprietary +19,54.98%,Claude-3-Opus-20240229 (FC tools-2024-04-04),https://www.anthropic.com/news/claude-3-family,20.25,10.03,9.03,18.28,58.67%,68.67%,89.00%,41.00%,36.00%,62.05%,88.71%,88.00%,44.00%,27.50%,74.10%,74.81%,75.60%,50.00%,41.67%,28.12%,30.00%,29.50%,28.00%,25.00%,63.41%,77.80%,Anthropic,Proprietary +20,54.65%,Hammer2.0-7b (FC),https://huggingface.co/MadeAgents/Hammer2.0-7b,N/A,N/A,N/A,N/A,90.27%,80.58%,95.00%,93.50%,92.00%,89.25%,90.00%,94.00%,88.00%,85.00%,69.79%,74.42%,77.15%,81.25%,75.00%,5.62%,9.50%,2.00%,7.50%,3.50%,95.12%,68.46%,MadeAgents,cc-by-nc-4.0 +21,54.59%,Functionary-Small-v3.1 (FC),https://huggingface.co/meetkai/functionary-small-v3.1,N/A,1.62,1.79,4.11,86.42%,74.67%,94.50%,89.50%,87.00%,85.95%,88.79%,92.00%,88.00%,75.00%,70.41%,75.19%,75.89%,81.25%,62.50%,8.38%,15.50%,0.50%,12.50%,5.00%,85.37%,68.62%,MeetKai,MIT +22,54.55%,Gemini-1.5-Flash-002 (FC),https://deepmind.google/technologies/gemini/flash/,0.29,0.81,1.11,1.37,81.21%,65.83%,91.50%,80.50%,87.00%,73.04%,68.14%,90.00%,54.00%,80.00%,75.12%,71.32%,70.97%,81.25%,75.00%,9.75%,15.00%,5.00%,8.00%,11.00%,60.98%,86.90%,Google,Proprietary +23,54.29%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,88.90%,76.58%,95.50%,93.50%,90.00%,89.34%,91.36%,96.00%,90.00%,80.00%,61.13%,77.13%,71.46%,87.50%,62.50%,14.25%,18.50%,15.50%,10.00%,13.00%,92.68%,58.38%,Meta,Meta Llama 3 Community +24,53.42%,GoGoAgent,https://gogoagent.ai,N/A,55.61,43.13,127.3,86.00%,75.50%,92.50%,92.00%,84.00%,88.05%,94.71%,94.00%,86.00%,77.50%,72.46%,71.32%,72.42%,87.50%,62.50%,0.25%,0.50%,0.50%,0.00%,0.00%,87.80%,81.89%,BitAgent,Proprietary +25,53.38%,Gemini-1.5-Flash-001 (FC),https://deepmind.google/technologies/gemini/flash/,0.29,0.64,0.79,0.91,77.21%,64.83%,94.50%,73.00%,76.50%,75.12%,61.50%,88.00%,76.00%,75.00%,72.81%,72.48%,73.67%,62.50%,58.33%,10.88%,13.00%,10.00%,13.00%,7.50%,63.41%,75.83%,Google,Proprietary +26,53.00%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,88.52%,81.08%,92.50%,91.00%,89.50%,87.89%,83.57%,96.00%,92.00%,80.00%,69.48%,79.46%,77.24%,68.75%,62.50%,2.12%,3.50%,2.00%,1.50%,1.50%,87.80%,68.76%,Google,gemma-terms-of-use +27,52.64%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,N/A,N/A,N/A,85.79%,75.67%,96.00%,88.50%,83.00%,88.13%,94.50%,92.00%,86.00%,80.00%,65.97%,72.48%,72.32%,62.50%,66.67%,6.38%,8.00%,7.50%,6.00%,4.00%,92.68%,64.95%,Qwen,apache-2.0 +28,52.31%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,N/A,N/A,N/A,68.85%,68.42%,88.00%,63.50%,55.50%,78.43%,87.71%,88.00%,68.00%,70.00%,69.12%,68.22%,76.76%,62.50%,54.17%,13.88%,18.50%,14.00%,12.50%,10.50%,87.80%,68.12%,Salesforce,cc-by-nc-4.0 +29,52.25%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,87.17%,75.17%,95.50%,90.50%,87.50%,89.21%,95.86%,96.00%,80.00%,85.00%,66.15%,78.68%,79.65%,68.75%,66.67%,5.50%,9.50%,4.50%,5.50%,2.50%,92.68%,52.78%,Meta,Meta Llama 3 Community +30,52.20%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.39,1.08,0.99,1.86,84.12%,75.50%,93.00%,88.00%,80.00%,84.11%,95.43%,90.00%,86.00%,65.00%,61.22%,74.42%,77.82%,43.75%,50.00%,16.88%,28.00%,13.00%,17.00%,9.50%,97.56%,35.16%,OpenAI,Proprietary +31,51.87%,xLAM-7b-r (FC),https://huggingface.co/Salesforce/xLAM-7b-r,N/A,N/A,N/A,N/A,81.40%,73.08%,93.50%,79.50%,79.50%,83.46%,76.86%,92.00%,90.00%,75.00%,67.88%,71.32%,73.48%,31.25%,58.33%,6.88%,11.50%,7.00%,6.00%,3.00%,97.56%,64.05%,Salesforce,cc-by-nc-4.0 +32,51.68%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,11.89,3.65,16.23,4.25,86.92%,77.67%,92.50%,90.00%,87.50%,88.23%,91.43%,96.00%,88.00%,77.50%,68.46%,72.87%,61.33%,81.25%,66.67%,0.50%,0.50%,0.50%,0.00%,1.00%,65.85%,74.85%,Mistral AI,Proprietary +33,51.50%,Gemma-2-9b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,84.38%,74.50%,92.00%,88.00%,83.00%,85.18%,84.21%,94.00%,90.00%,72.50%,69.21%,73.64%,73.58%,56.25%,58.33%,0.75%,1.00%,2.00%,0.00%,0.00%,87.80%,72.45%,Google,gemma-terms-of-use +34,50.96%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,10.92,3.88,10.24,8.75,74.02%,70.58%,91.50%,65.50%,68.50%,81.73%,95.43%,92.00%,72.00%,67.50%,73.10%,68.60%,73.00%,81.25%,50.00%,0.50%,1.50%,0.00%,0.00%,0.50%,60.98%,95.21%,Mistral AI,Proprietary +35,50.77%,Command-R-Plus (Prompt) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,15.17,1.23,0.84,2.18,80.90%,71.08%,91.50%,82.00%,79.00%,85.07%,93.29%,90.00%,82.00%,75.00%,69.75%,66.67%,70.30%,68.75%,70.83%,0.38%,1.00%,0.00%,0.00%,0.50%,73.17%,72.83%,Cohere For AI,cc-by-nc-4.0 +36,50.75%,Gorilla-OpenFunctions-v2 (FC),https://gorilla.cs.berkeley.edu/blogs/7_open_functions_v2.html,N/A,6.59,8.42,16.0,86.29%,77.67%,95.00%,89.00%,83.50%,86.09%,95.86%,96.00%,80.00%,72.50%,67.44%,73.64%,58.73%,68.75%,41.67%,0.00%,0.00%,0.00%,0.00%,0.00%,73.17%,75.05%,Gorilla LLM,Apache 2.0 +37,49.86%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,83.62%,73.00%,94.50%,83.50%,83.50%,87.29%,85.64%,96.00%,90.00%,77.50%,57.93%,71.32%,72.23%,50.00%,45.83%,10.50%,14.00%,10.50%,8.00%,9.50%,78.05%,41.62%,Meta,Meta Llama 3 Community +38,49.68%,Hammer2.0-1.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-1.5b,N/A,N/A,N/A,N/A,84.06%,75.25%,90.50%,88.00%,82.50%,88.95%,93.29%,92.00%,88.00%,82.50%,63.22%,70.54%,68.56%,56.25%,66.67%,1.38%,2.50%,0.50%,1.00%,1.50%,92.68%,60.64%,MadeAgents,cc-by-nc-4.0 +39,49.56%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,1.16,1.28,7.77,2.34,81.21%,63.33%,92.00%,86.50%,83.00%,77.04%,55.64%,90.00%,90.00%,72.50%,62.37%,71.71%,67.79%,62.50%,66.67%,8.00%,12.00%,5.00%,10.50%,4.50%,60.98%,62.40%,Mistral AI,Proprietary +40,48.29%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,N/A,N/A,N/A,82.33%,72.83%,91.50%,84.50%,80.50%,85.91%,85.64%,92.00%,86.00%,80.00%,57.49%,65.12%,55.35%,43.75%,54.17%,2.75%,5.00%,1.50%,3.00%,1.50%,95.12%,72.55%,IBM,Apache-2.0 +41,47.80%,GPT-4o-2024-08-06 (Prompt),https://openai.com/index/hello-gpt-4o/,11.06,1.2,2.78,2.66,49.35%,32.42%,48.00%,74.00%,43.00%,69.93%,49.71%,82.00%,78.00%,70.00%,62.19%,42.64%,42.82%,25.00%,41.67%,17.62%,21.50%,14.00%,15.00%,20.00%,36.59%,94.19%,OpenAI,Proprietary +42,47.12%,Qwen2.5-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct,N/A,N/A,N/A,N/A,75.19%,70.25%,85.50%,73.50%,71.50%,82.82%,72.79%,94.00%,82.00%,82.50%,61.71%,64.73%,59.88%,50.00%,41.67%,1.50%,2.00%,2.00%,1.00%,1.00%,75.61%,67.17%,Qwen,apache-2.0 +43,47.00%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,2.1,0.96,2.31,1.54,65.04%,62.67%,83.00%,65.50%,49.00%,67.68%,46.21%,90.00%,72.00%,62.50%,67.48%,63.57%,64.61%,68.75%,54.17%,5.75%,7.50%,7.00%,4.00%,4.50%,80.49%,75.47%,OpenAI,Proprietary +44,46.52%,Hermes-2-Pro-Llama-3-70B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-70B,N/A,N/A,N/A,N/A,78.85%,59.92%,80.00%,88.00%,87.50%,80.45%,76.29%,82.00%,86.00%,77.50%,60.51%,63.18%,53.04%,56.25%,66.67%,0.25%,0.50%,0.00%,0.00%,0.50%,60.98%,70.27%,NousResearch,apache-2.0 +45,45.44%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,N/A,N/A,N/A,76.54%,64.17%,89.50%,79.50%,73.00%,75.48%,69.93%,94.00%,78.00%,60.00%,61.79%,67.44%,64.42%,56.25%,45.83%,0.38%,1.00%,0.00%,0.50%,0.00%,56.10%,58.50%,NousResearch,apache-2.0 +46,44.70%,Command-R-Plus (FC) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,5.52,3.21,7.41,6.2,76.83%,66.33%,90.00%,82.00%,69.00%,78.61%,88.93%,88.00%,80.00%,57.50%,57.26%,66.67%,60.56%,56.25%,50.00%,1.38%,1.50%,0.00%,1.50%,2.50%,92.68%,53.32%,Cohere For AI,cc-by-nc-4.0 +47,44.12%,Qwen2-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-7B-Instruct,N/A,N/A,N/A,N/A,74.85%,67.42%,87.50%,71.00%,73.50%,81.70%,86.79%,88.00%,82.00%,70.00%,54.24%,59.30%,62.20%,50.00%,66.67%,2.63%,3.50%,3.50%,1.50%,2.00%,87.80%,47.41%,Qwen,apache-2.0 +48,43.74%,DBRX-Instruct (Prompt),https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm,8.09,6.21,10.4,22.38,67.04%,72.17%,91.50%,56.50%,48.00%,75.04%,90.14%,88.00%,62.00%,60.00%,62.33%,74.81%,71.65%,75.00%,58.33%,0.00%,0.00%,0.00%,0.00%,0.00%,87.80%,48.98%,Databricks,Databricks Open Model +49,43.67%,Mistral-small-2402 (FC),https://docs.mistral.ai/guides/model-selection/,3.34,1.64,7.26,2.84,57.27%,67.08%,93.50%,20.00%,48.50%,53.77%,87.07%,92.00%,16.00%,20.00%,70.19%,63.57%,71.46%,12.50%,12.50%,2.12%,3.50%,0.00%,2.50%,2.50%,82.93%,78.43%,Mistral AI,Proprietary +50,43.56%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,1.74,2.41,8.48,4.44,56.52%,67.08%,94.00%,39.50%,25.50%,65.14%,84.07%,88.00%,66.00%,22.50%,66.10%,75.19%,65.96%,50.00%,37.50%,2.50%,4.00%,2.50%,2.50%,1.00%,68.29%,68.33%,Google,Proprietary +51,43.37%,Open-Mistral-Nemo-2407 (Prompt),https://mistral.ai/news/mistral-nemo/,1.73,1.01,0.68,1.96,85.60%,77.42%,93.50%,87.00%,84.50%,91.23%,95.93%,96.00%,88.00%,85.00%,50.33%,78.29%,74.54%,75.00%,62.50%,0.12%,0.00%,0.50%,0.00%,0.00%,90.24%,10.16%,Mistral AI,Proprietary +52,43.35%,Open-Mixtral-8x7b (Prompt),https://mistral.ai/news/mixtral-of-experts/,2.72,1.86,6.57,3.26,57.94%,68.75%,86.00%,40.00%,37.00%,65.91%,71.14%,88.00%,52.00%,52.50%,64.95%,57.36%,65.00%,68.75%,50.00%,0.62%,1.50%,0.00%,0.00%,1.00%,68.29%,76.16%,Mistral AI,Proprietary +53,42.35%,Llama-3.2-3B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,77.77%,64.08%,90.00%,80.50%,76.50%,69.41%,78.14%,92.00%,50.00%,57.50%,50.91%,47.67%,44.74%,0.00%,29.17%,2.12%,1.50%,2.00%,2.00%,3.00%,63.41%,68.81%,Meta,Meta Llama 3 Community +54,42.17%,Hermes-2-Pro-Mistral-7B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B,N/A,N/A,N/A,N/A,72.83%,61.33%,87.50%,78.50%,64.00%,77.30%,61.71%,94.00%,86.00%,67.50%,56.46%,64.73%,59.40%,43.75%,37.50%,0.25%,0.50%,0.00%,0.00%,0.50%,75.61%,38.95%,NousResearch,apache-2.0 +55,42.05%,Open-Mixtral-8x22b (FC),https://mistral.ai/news/mixtral-8x22b/,6.99,3.67,19.78,5.7,61.08%,71.33%,94.50%,10.50%,68.00%,63.82%,84.29%,94.00%,22.00%,55.00%,66.86%,73.26%,72.32%,6.25%,41.67%,0.62%,1.00%,0.00%,1.00%,0.50%,82.93%,43.94%,Mistral AI,Proprietary +56,40.44%,xLAM-7b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-7b-fc-r,N/A,N/A,N/A,N/A,74.56%,74.25%,92.00%,78.00%,54.00%,65.75%,84.50%,90.00%,66.00%,22.50%,53.44%,75.58%,57.28%,43.75%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,70.73%,46.05%,Salesforce,cc-by-nc-4.0 +57,40.41%,Hammer2.0-0.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-0.5b,N/A,N/A,N/A,N/A,66.79%,62.17%,80.00%,67.50%,57.50%,70.43%,53.21%,86.00%,80.00%,62.50%,52.42%,48.84%,44.07%,62.50%,41.67%,0.38%,0.50%,0.00%,0.50%,0.50%,85.37%,64.51%,MadeAgents,cc-by-nc-4.0 +58,40.36%,Claude-3-Haiku-20240307 (FC tools-2024-04-04),https://www.anthropic.com/news/claude-3-family,0.21,1.46,1.47,2.24,41.67%,70.67%,93.50%,2.00%,0.50%,47.52%,92.07%,92.00%,6.00%,0.00%,57.66%,74.03%,77.15%,0.00%,4.17%,20.62%,27.50%,15.00%,17.50%,22.50%,97.56%,29.37%,Anthropic,Proprietary +59,39.54%,MiniCPM3-4B (FC),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,N/A,N/A,N/A,63.19%,67.75%,74.00%,60.50%,50.50%,48.70%,44.79%,50.00%,40.00%,60.00%,59.88%,56.98%,49.47%,56.25%,33.33%,0.88%,1.50%,2.00%,0.00%,0.00%,58.54%,73.64%,openbmb,Apache-2.0 +60,39.19%,mistral-large-2407 (Prompt),https://mistral.ai/news/mistral-large-2407/,21.54,3.21,9.37,6.72,62.27%,46.58%,77.00%,70.00%,55.50%,56.93%,40.21%,80.00%,70.00%,37.50%,53.35%,45.74%,73.10%,68.75%,54.17%,9.62%,14.50%,11.00%,6.00%,7.00%,90.24%,22.38%,Mistral AI,Proprietary +61,36.52%,Nexusflow-Raven-v2 (FC),https://huggingface.co/Nexusflow/NexusRaven-V2-13B,N/A,3.53,2.33,8.1,46.15%,57.58%,53.00%,34.50%,39.50%,57.86%,47.43%,86.00%,38.00%,60.00%,53.49%,39.92%,38.48%,56.25%,41.67%,0.88%,1.50%,0.50%,0.50%,1.00%,65.85%,77.90%,Nexusflow,Apache 2.0 +62,36.48%,Meta-Llama-3-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,61.02%,63.08%,85.50%,51.50%,44.00%,66.70%,83.29%,82.00%,44.00%,57.50%,50.51%,60.85%,60.75%,37.50%,20.83%,0.00%,0.00%,0.00%,0.00%,0.00%,75.61%,27.39%,Meta,Meta Llama 3 Community +63,36.30%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,2.18,5.51,14.5,40.27,60.17%,50.17%,66.50%,60.50%,63.50%,59.66%,45.14%,78.00%,68.00%,47.50%,48.38%,50.00%,48.41%,56.25%,37.50%,1.25%,1.00%,3.50%,0.00%,0.50%,85.37%,50.23%,Google,Proprietary +64,31.04%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,N/A,N/A,N/A,59.73%,55.92%,80.00%,55.50%,47.50%,58.52%,51.07%,82.00%,56.00%,45.00%,39.00%,50.39%,40.50%,25.00%,20.83%,0.12%,0.00%,0.50%,0.00%,0.00%,75.61%,22.92%,Qwen,apache-2.0 +65,26.16%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,10.55,35.8,24.13,25.15%,49.58%,24.50%,11.50%,15.00%,31.23%,53.43%,34.00%,30.00%,7.50%,44.47%,48.45%,52.56%,31.25%,25.00%,2.75%,4.50%,2.00%,2.00%,2.50%,100.00%,43.86%,Meta,Meta Llama 3 Community +66,25.08%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,3.8,1.32,0.67,2.56,16.33%,10.83%,36.50%,11.50%,6.50%,9.38%,13.00%,18.00%,4.00%,2.50%,53.98%,18.22%,45.90%,12.50%,8.33%,0.25%,0.50%,0.00%,0.00%,0.50%,41.46%,81.47%,Mistral AI,Proprietary +67,24.81%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,N/A,N/A,N/A,39.94%,71.25%,85.50%,1.50%,1.50%,40.23%,74.93%,86.00%,0.00%,0.00%,38.34%,63.18%,54.19%,0.00%,0.00%,0.12%,0.00%,0.00%,0.00%,0.50%,97.56%,7.06%,Salesforce,cc-by-nc-4.0 +68,23.95%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,36.52%,56.08%,55.00%,0.00%,35.00%,49.93%,58.21%,58.00%,56.00%,27.50%,33.23%,48.06%,47.64%,31.25%,37.50%,0.00%,0.00%,0.00%,0.00%,0.00%,92.68%,5.29%,Meta,Meta Llama 3 Community +69,20.21%,Gemma-2-2b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,12.19%,7.25%,41.50%,0.00%,0.00%,12.88%,5.50%,46.00%,0.00%,0.00%,41.63%,11.24%,11.96%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,12.20%,79.93%,Google,gemma-terms-of-use +70,17.93%,Llama-3.2-1B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,22.77%,25.08%,32.00%,24.00%,10.00%,19.11%,27.93%,18.00%,28.00%,2.50%,29.85%,25.97%,4.82%,6.25%,4.17%,0.00%,0.00%,0.00%,0.00%,0.00%,48.78%,54.42%,Meta,Meta Llama 3 Community \ No newline at end of file diff --git a/leaderboard.html b/leaderboard.html index 4834b0bca..70e9e007b 100644 --- a/leaderboard.html +++ b/leaderboard.html @@ -109,7 +109,7 @@

BFCL Leaderboard

Last Updated: - 2024-11-09 [Change Log]
From 1cf88c0c397f814e9b2593fe80ac99f1617ccc15 Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Sun, 17 Nov 2024 00:42:10 -0800 Subject: [PATCH 5/6] fix format --- data_overall.csv | 142 +++++++++++++++++++++++------------------------ 1 file changed, 71 insertions(+), 71 deletions(-) diff --git a/data_overall.csv b/data_overall.csv index 98eb1e271..8456a8c37 100644 --- a/data_overall.csv +++ b/data_overall.csv @@ -1,71 +1,71 @@ -Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s),Latency Standard Deviation (s),Latency 95th Percentile (s),Non-Live AST Acc,Non-Live Simple AST,Non-Live Multiple AST,Non-Live Parallel AST,Non-Live Parallel Multiple AST,Non-Live Exec Acc,Non-Live Simple Exec,Non-Live Multiple Exec,Non-Live Parallel Exec,Non-Live Parallel Multiple Exec,Live Acc,Live Simple AST,Live Multiple AST,Live Parallel AST,Live Parallel Multiple AST,Multi Turn Acc,Multi Turn Base,Multi Turn Miss Func,Multi Turn Miss Param,Multi Turn Long Context,Relevance Detection,Irrelevance Detection,Organization,License -1,68.94%,GPT-4o-2024-08-06 (FC),https://openai.com/index/hello-gpt-4o/,8.22,1.51,4.53,3.29,85.90%,74.58%,92.50%,92.00%,84.50%,85.64%,87.07%,92.00%,86.00%,77.50%,75.43%,74.42%,75.12%,81.25%,70.83%,45.25%,54.50%,44.00%,34.50%,48.00%,63.41%,82.93%,OpenAI,Proprietary -2,66.68%,GPT-4-turbo-2024-04-09 (FC),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,33.0,2.54,5.74,4.98,84.67%,69.17%,91.00%,90.50%,88.00%,84.32%,88.29%,88.00%,86.00%,75.00%,76.23%,77.52%,77.63%,81.25%,66.67%,39.25%,54.50%,32.50%,29.50%,40.50%,73.17%,79.76%,OpenAI,Proprietary -3,65.61%,o1-preview-2024-09-12 (Prompt),https://openai.com/index/introducing-openai-o1-preview/,199.71,20.13,12.38,40.02,86.42%,78.17%,93.00%,89.50%,85.00%,88.88%,99.50%,92.00%,84.00%,80.00%,73.08%,80.62%,76.76%,75.00%,79.17%,36.62%,43.00%,38.50%,32.50%,32.50%,73.17%,74.60%,OpenAI,Proprietary -4,64.24%,o1-mini-2024-09-12 (Prompt),https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/,29.59,7.81,7.99,17.34,81.31%,73.75%,90.00%,81.00%,80.50%,84.00%,88.50%,92.00%,78.00%,77.50%,75.39%,73.26%,71.07%,75.00%,62.50%,33.50%,40.50%,32.50%,26.50%,34.50%,48.78%,88.04%,OpenAI,Proprietary -5,61.53%,GPT-4-turbo-2024-04-09 (Prompt),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,51.78,1.43,3.31,2.74,91.46%,82.33%,95.00%,95.00%,93.50%,90.00%,99.50%,98.00%,80.00%,82.50%,69.04%,85.66%,84.57%,87.50%,75.00%,26.75%,36.50%,24.00%,17.00%,29.50%,82.93%,58.95%,OpenAI,Proprietary -6,61.29%,Claude-3.5-Sonnet-20240620 (FC),https://www.anthropic.com/news/claude-3-5-sonnet,8.61,3.88,5.16,6.6,70.04%,75.17%,93.50%,64.50%,47.00%,66.27%,97.57%,90.00%,40.00%,37.50%,74.68%,80.23%,76.76%,56.25%,58.33%,40.00%,46.00%,39.00%,35.00%,40.00%,68.29%,74.58%,Anthropic,Proprietary -7,60.72%,GPT-4o-mini-2024-07-18 (FC),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.51,1.64,14.77,2.73,84.25%,73.50%,90.50%,90.00%,83.00%,84.12%,83.50%,92.00%,86.00%,75.00%,70.19%,72.87%,74.45%,87.50%,70.83%,28.25%,40.50%,15.50%,24.00%,33.00%,80.49%,71.77%,OpenAI,Proprietary -8,59.94%,Functionary-Medium-v3.1 (FC),https://huggingface.co/meetkai/functionary-medium-v3.1,N/A,50.7,170.09,248.31,89.52%,76.08%,96.50%,95.00%,90.50%,89.77%,97.57%,94.00%,90.00%,77.50%,73.48%,79.46%,81.87%,68.75%,70.83%,17.25%,28.50%,12.50%,23.50%,4.50%,70.73%,73.32%,MeetKai,MIT -9,59.80%,Gemini-1.5-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,6.54,2.99,4.53,4.96,88.00%,77.50%,93.00%,92.50%,89.00%,91.41%,97.14%,94.00%,92.00%,82.50%,74.28%,79.84%,77.72%,87.50%,79.17%,16.25%,20.00%,15.00%,14.50%,15.50%,75.61%,75.10%,Google,Proprietary -10,59.27%,GPT-4o-mini-2024-07-18 (Prompt),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.79,1.21,8.2,2.13,86.23%,79.42%,93.00%,86.50%,86.00%,91.12%,100.00%,96.00%,86.00%,82.50%,74.63%,79.46%,74.35%,93.75%,70.83%,14.50%,20.00%,11.50%,10.00%,16.50%,75.61%,81.00%,OpenAI,Proprietary -11,59.13%,Gemini-1.5-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,5.43,2.09,2.84,4.04,87.96%,74.83%,95.00%,91.50%,90.50%,85.82%,78.79%,94.00%,88.00%,82.50%,72.41%,74.81%,74.64%,87.50%,70.83%,19.13%,26.00%,13.50%,19.50%,17.50%,73.17%,73.15%,Google,Proprietary -12,58.95%,xLAM-8x22b-r (FC),https://huggingface.co/Salesforce/xLAM-8x22b-r,N/A,N/A,N/A,N/A,88.15%,81.08%,93.00%,91.50%,87.00%,90.11%,96.43%,96.00%,88.00%,80.00%,71.97%,78.29%,80.14%,75.00%,62.50%,17.38%,25.50%,20.50%,15.00%,8.50%,85.37%,67.29%,Salesforce,cc-by-nc-4.0 -13,57.48%,Gemini-1.5-Pro-001 (Prompt),https://deepmind.google/technologies/gemini/pro/,5.97,1.86,3.5,2.91,83.94%,72.75%,90.00%,91.50%,81.50%,86.30%,93.71%,88.00%,86.00%,77.50%,73.83%,74.03%,72.32%,93.75%,75.00%,13.12%,14.50%,13.50%,13.50%,11.00%,63.41%,82.00%,Google,Proprietary -14,57.27%,ToolACE-8B (FC),https://huggingface.co/Team-ACE/ToolACE-8B,N/A,N/A,N/A,N/A,87.06%,76.25%,93.00%,90.00%,89.00%,89.52%,98.57%,94.00%,88.00%,77.50%,74.99%,66.67%,74.93%,81.25%,70.83%,7.88%,8.50%,10.50%,5.50%,7.00%,80.49%,85.71%,Huawei Noah & USTC,Apache-2.0 -15,57.19%,Gemini-1.5-Pro-001 (FC),https://deepmind.google/technologies/gemini/pro/,4.75,1.82,4.85,2.65,83.98%,69.42%,93.00%,91.00%,82.50%,88.39%,91.57%,92.00%,90.00%,80.00%,72.81%,73.64%,70.59%,81.25%,62.50%,12.75%,16.00%,11.00%,12.50%,11.50%,63.41%,80.18%,Google,Proprietary -16,56.23%,Gemini-1.5-Flash-001 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.49,1.17,1.24,1.71,85.31%,70.75%,90.00%,91.00%,89.50%,83.79%,79.14%,92.00%,84.00%,80.00%,68.24%,74.81%,76.18%,93.75%,79.17%,17.62%,25.50%,16.00%,12.00%,17.00%,87.80%,62.18%,Google,Proprietary -17,55.51%,mistral-large-2407 (FC),https://mistral.ai/news/mistral-large-2407/,12.52,2.96,8.83,5.16,86.62%,73.00%,92.00%,91.50%,90.00%,84.57%,73.79%,94.00%,88.00%,82.50%,68.37%,81.78%,79.27%,68.75%,75.00%,16.75%,23.00%,12.50%,15.50%,16.00%,75.61%,49.44%,Mistral AI,Proprietary -18,55.33%,Gemini-1.5-Flash-002 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.42,1.14,1.55,1.7,79.69%,74.25%,91.50%,87.00%,66.00%,80.64%,93.57%,92.00%,82.00%,55.00%,73.21%,77.52%,74.73%,87.50%,58.33%,12.50%,15.00%,14.50%,9.00%,11.50%,78.05%,75.65%,Google,Proprietary -19,54.98%,Claude-3-Opus-20240229 (FC tools-2024-04-04),https://www.anthropic.com/news/claude-3-family,20.25,10.03,9.03,18.28,58.67%,68.67%,89.00%,41.00%,36.00%,62.05%,88.71%,88.00%,44.00%,27.50%,74.10%,74.81%,75.60%,50.00%,41.67%,28.12%,30.00%,29.50%,28.00%,25.00%,63.41%,77.80%,Anthropic,Proprietary -20,54.65%,Hammer2.0-7b (FC),https://huggingface.co/MadeAgents/Hammer2.0-7b,N/A,N/A,N/A,N/A,90.27%,80.58%,95.00%,93.50%,92.00%,89.25%,90.00%,94.00%,88.00%,85.00%,69.79%,74.42%,77.15%,81.25%,75.00%,5.62%,9.50%,2.00%,7.50%,3.50%,95.12%,68.46%,MadeAgents,cc-by-nc-4.0 -21,54.59%,Functionary-Small-v3.1 (FC),https://huggingface.co/meetkai/functionary-small-v3.1,N/A,1.62,1.79,4.11,86.42%,74.67%,94.50%,89.50%,87.00%,85.95%,88.79%,92.00%,88.00%,75.00%,70.41%,75.19%,75.89%,81.25%,62.50%,8.38%,15.50%,0.50%,12.50%,5.00%,85.37%,68.62%,MeetKai,MIT -22,54.55%,Gemini-1.5-Flash-002 (FC),https://deepmind.google/technologies/gemini/flash/,0.29,0.81,1.11,1.37,81.21%,65.83%,91.50%,80.50%,87.00%,73.04%,68.14%,90.00%,54.00%,80.00%,75.12%,71.32%,70.97%,81.25%,75.00%,9.75%,15.00%,5.00%,8.00%,11.00%,60.98%,86.90%,Google,Proprietary -23,54.29%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,88.90%,76.58%,95.50%,93.50%,90.00%,89.34%,91.36%,96.00%,90.00%,80.00%,61.13%,77.13%,71.46%,87.50%,62.50%,14.25%,18.50%,15.50%,10.00%,13.00%,92.68%,58.38%,Meta,Meta Llama 3 Community -24,53.42%,GoGoAgent,https://gogoagent.ai,N/A,55.61,43.13,127.3,86.00%,75.50%,92.50%,92.00%,84.00%,88.05%,94.71%,94.00%,86.00%,77.50%,72.46%,71.32%,72.42%,87.50%,62.50%,0.25%,0.50%,0.50%,0.00%,0.00%,87.80%,81.89%,BitAgent,Proprietary -25,53.38%,Gemini-1.5-Flash-001 (FC),https://deepmind.google/technologies/gemini/flash/,0.29,0.64,0.79,0.91,77.21%,64.83%,94.50%,73.00%,76.50%,75.12%,61.50%,88.00%,76.00%,75.00%,72.81%,72.48%,73.67%,62.50%,58.33%,10.88%,13.00%,10.00%,13.00%,7.50%,63.41%,75.83%,Google,Proprietary -26,53.00%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,88.52%,81.08%,92.50%,91.00%,89.50%,87.89%,83.57%,96.00%,92.00%,80.00%,69.48%,79.46%,77.24%,68.75%,62.50%,2.12%,3.50%,2.00%,1.50%,1.50%,87.80%,68.76%,Google,gemma-terms-of-use -27,52.64%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,N/A,N/A,N/A,85.79%,75.67%,96.00%,88.50%,83.00%,88.13%,94.50%,92.00%,86.00%,80.00%,65.97%,72.48%,72.32%,62.50%,66.67%,6.38%,8.00%,7.50%,6.00%,4.00%,92.68%,64.95%,Qwen,apache-2.0 -28,52.31%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,N/A,N/A,N/A,68.85%,68.42%,88.00%,63.50%,55.50%,78.43%,87.71%,88.00%,68.00%,70.00%,69.12%,68.22%,76.76%,62.50%,54.17%,13.88%,18.50%,14.00%,12.50%,10.50%,87.80%,68.12%,Salesforce,cc-by-nc-4.0 -29,52.25%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,87.17%,75.17%,95.50%,90.50%,87.50%,89.21%,95.86%,96.00%,80.00%,85.00%,66.15%,78.68%,79.65%,68.75%,66.67%,5.50%,9.50%,4.50%,5.50%,2.50%,92.68%,52.78%,Meta,Meta Llama 3 Community -30,52.20%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.39,1.08,0.99,1.86,84.12%,75.50%,93.00%,88.00%,80.00%,84.11%,95.43%,90.00%,86.00%,65.00%,61.22%,74.42%,77.82%,43.75%,50.00%,16.88%,28.00%,13.00%,17.00%,9.50%,97.56%,35.16%,OpenAI,Proprietary -31,51.87%,xLAM-7b-r (FC),https://huggingface.co/Salesforce/xLAM-7b-r,N/A,N/A,N/A,N/A,81.40%,73.08%,93.50%,79.50%,79.50%,83.46%,76.86%,92.00%,90.00%,75.00%,67.88%,71.32%,73.48%,31.25%,58.33%,6.88%,11.50%,7.00%,6.00%,3.00%,97.56%,64.05%,Salesforce,cc-by-nc-4.0 -32,51.68%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,11.89,3.65,16.23,4.25,86.92%,77.67%,92.50%,90.00%,87.50%,88.23%,91.43%,96.00%,88.00%,77.50%,68.46%,72.87%,61.33%,81.25%,66.67%,0.50%,0.50%,0.50%,0.00%,1.00%,65.85%,74.85%,Mistral AI,Proprietary -33,51.50%,Gemma-2-9b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,84.38%,74.50%,92.00%,88.00%,83.00%,85.18%,84.21%,94.00%,90.00%,72.50%,69.21%,73.64%,73.58%,56.25%,58.33%,0.75%,1.00%,2.00%,0.00%,0.00%,87.80%,72.45%,Google,gemma-terms-of-use -34,50.96%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,10.92,3.88,10.24,8.75,74.02%,70.58%,91.50%,65.50%,68.50%,81.73%,95.43%,92.00%,72.00%,67.50%,73.10%,68.60%,73.00%,81.25%,50.00%,0.50%,1.50%,0.00%,0.00%,0.50%,60.98%,95.21%,Mistral AI,Proprietary -35,50.77%,Command-R-Plus (Prompt) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,15.17,1.23,0.84,2.18,80.90%,71.08%,91.50%,82.00%,79.00%,85.07%,93.29%,90.00%,82.00%,75.00%,69.75%,66.67%,70.30%,68.75%,70.83%,0.38%,1.00%,0.00%,0.00%,0.50%,73.17%,72.83%,Cohere For AI,cc-by-nc-4.0 -36,50.75%,Gorilla-OpenFunctions-v2 (FC),https://gorilla.cs.berkeley.edu/blogs/7_open_functions_v2.html,N/A,6.59,8.42,16.0,86.29%,77.67%,95.00%,89.00%,83.50%,86.09%,95.86%,96.00%,80.00%,72.50%,67.44%,73.64%,58.73%,68.75%,41.67%,0.00%,0.00%,0.00%,0.00%,0.00%,73.17%,75.05%,Gorilla LLM,Apache 2.0 -37,49.86%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,83.62%,73.00%,94.50%,83.50%,83.50%,87.29%,85.64%,96.00%,90.00%,77.50%,57.93%,71.32%,72.23%,50.00%,45.83%,10.50%,14.00%,10.50%,8.00%,9.50%,78.05%,41.62%,Meta,Meta Llama 3 Community -38,49.68%,Hammer2.0-1.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-1.5b,N/A,N/A,N/A,N/A,84.06%,75.25%,90.50%,88.00%,82.50%,88.95%,93.29%,92.00%,88.00%,82.50%,63.22%,70.54%,68.56%,56.25%,66.67%,1.38%,2.50%,0.50%,1.00%,1.50%,92.68%,60.64%,MadeAgents,cc-by-nc-4.0 -39,49.56%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,1.16,1.28,7.77,2.34,81.21%,63.33%,92.00%,86.50%,83.00%,77.04%,55.64%,90.00%,90.00%,72.50%,62.37%,71.71%,67.79%,62.50%,66.67%,8.00%,12.00%,5.00%,10.50%,4.50%,60.98%,62.40%,Mistral AI,Proprietary -40,48.29%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,N/A,N/A,N/A,82.33%,72.83%,91.50%,84.50%,80.50%,85.91%,85.64%,92.00%,86.00%,80.00%,57.49%,65.12%,55.35%,43.75%,54.17%,2.75%,5.00%,1.50%,3.00%,1.50%,95.12%,72.55%,IBM,Apache-2.0 -41,47.80%,GPT-4o-2024-08-06 (Prompt),https://openai.com/index/hello-gpt-4o/,11.06,1.2,2.78,2.66,49.35%,32.42%,48.00%,74.00%,43.00%,69.93%,49.71%,82.00%,78.00%,70.00%,62.19%,42.64%,42.82%,25.00%,41.67%,17.62%,21.50%,14.00%,15.00%,20.00%,36.59%,94.19%,OpenAI,Proprietary -42,47.12%,Qwen2.5-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct,N/A,N/A,N/A,N/A,75.19%,70.25%,85.50%,73.50%,71.50%,82.82%,72.79%,94.00%,82.00%,82.50%,61.71%,64.73%,59.88%,50.00%,41.67%,1.50%,2.00%,2.00%,1.00%,1.00%,75.61%,67.17%,Qwen,apache-2.0 -43,47.00%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,2.1,0.96,2.31,1.54,65.04%,62.67%,83.00%,65.50%,49.00%,67.68%,46.21%,90.00%,72.00%,62.50%,67.48%,63.57%,64.61%,68.75%,54.17%,5.75%,7.50%,7.00%,4.00%,4.50%,80.49%,75.47%,OpenAI,Proprietary -44,46.52%,Hermes-2-Pro-Llama-3-70B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-70B,N/A,N/A,N/A,N/A,78.85%,59.92%,80.00%,88.00%,87.50%,80.45%,76.29%,82.00%,86.00%,77.50%,60.51%,63.18%,53.04%,56.25%,66.67%,0.25%,0.50%,0.00%,0.00%,0.50%,60.98%,70.27%,NousResearch,apache-2.0 -45,45.44%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,N/A,N/A,N/A,76.54%,64.17%,89.50%,79.50%,73.00%,75.48%,69.93%,94.00%,78.00%,60.00%,61.79%,67.44%,64.42%,56.25%,45.83%,0.38%,1.00%,0.00%,0.50%,0.00%,56.10%,58.50%,NousResearch,apache-2.0 -46,44.70%,Command-R-Plus (FC) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,5.52,3.21,7.41,6.2,76.83%,66.33%,90.00%,82.00%,69.00%,78.61%,88.93%,88.00%,80.00%,57.50%,57.26%,66.67%,60.56%,56.25%,50.00%,1.38%,1.50%,0.00%,1.50%,2.50%,92.68%,53.32%,Cohere For AI,cc-by-nc-4.0 -47,44.12%,Qwen2-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-7B-Instruct,N/A,N/A,N/A,N/A,74.85%,67.42%,87.50%,71.00%,73.50%,81.70%,86.79%,88.00%,82.00%,70.00%,54.24%,59.30%,62.20%,50.00%,66.67%,2.63%,3.50%,3.50%,1.50%,2.00%,87.80%,47.41%,Qwen,apache-2.0 -48,43.74%,DBRX-Instruct (Prompt),https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm,8.09,6.21,10.4,22.38,67.04%,72.17%,91.50%,56.50%,48.00%,75.04%,90.14%,88.00%,62.00%,60.00%,62.33%,74.81%,71.65%,75.00%,58.33%,0.00%,0.00%,0.00%,0.00%,0.00%,87.80%,48.98%,Databricks,Databricks Open Model -49,43.67%,Mistral-small-2402 (FC),https://docs.mistral.ai/guides/model-selection/,3.34,1.64,7.26,2.84,57.27%,67.08%,93.50%,20.00%,48.50%,53.77%,87.07%,92.00%,16.00%,20.00%,70.19%,63.57%,71.46%,12.50%,12.50%,2.12%,3.50%,0.00%,2.50%,2.50%,82.93%,78.43%,Mistral AI,Proprietary -50,43.56%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,1.74,2.41,8.48,4.44,56.52%,67.08%,94.00%,39.50%,25.50%,65.14%,84.07%,88.00%,66.00%,22.50%,66.10%,75.19%,65.96%,50.00%,37.50%,2.50%,4.00%,2.50%,2.50%,1.00%,68.29%,68.33%,Google,Proprietary -51,43.37%,Open-Mistral-Nemo-2407 (Prompt),https://mistral.ai/news/mistral-nemo/,1.73,1.01,0.68,1.96,85.60%,77.42%,93.50%,87.00%,84.50%,91.23%,95.93%,96.00%,88.00%,85.00%,50.33%,78.29%,74.54%,75.00%,62.50%,0.12%,0.00%,0.50%,0.00%,0.00%,90.24%,10.16%,Mistral AI,Proprietary -52,43.35%,Open-Mixtral-8x7b (Prompt),https://mistral.ai/news/mixtral-of-experts/,2.72,1.86,6.57,3.26,57.94%,68.75%,86.00%,40.00%,37.00%,65.91%,71.14%,88.00%,52.00%,52.50%,64.95%,57.36%,65.00%,68.75%,50.00%,0.62%,1.50%,0.00%,0.00%,1.00%,68.29%,76.16%,Mistral AI,Proprietary -53,42.35%,Llama-3.2-3B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,77.77%,64.08%,90.00%,80.50%,76.50%,69.41%,78.14%,92.00%,50.00%,57.50%,50.91%,47.67%,44.74%,0.00%,29.17%,2.12%,1.50%,2.00%,2.00%,3.00%,63.41%,68.81%,Meta,Meta Llama 3 Community -54,42.17%,Hermes-2-Pro-Mistral-7B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B,N/A,N/A,N/A,N/A,72.83%,61.33%,87.50%,78.50%,64.00%,77.30%,61.71%,94.00%,86.00%,67.50%,56.46%,64.73%,59.40%,43.75%,37.50%,0.25%,0.50%,0.00%,0.00%,0.50%,75.61%,38.95%,NousResearch,apache-2.0 -55,42.05%,Open-Mixtral-8x22b (FC),https://mistral.ai/news/mixtral-8x22b/,6.99,3.67,19.78,5.7,61.08%,71.33%,94.50%,10.50%,68.00%,63.82%,84.29%,94.00%,22.00%,55.00%,66.86%,73.26%,72.32%,6.25%,41.67%,0.62%,1.00%,0.00%,1.00%,0.50%,82.93%,43.94%,Mistral AI,Proprietary -56,40.44%,xLAM-7b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-7b-fc-r,N/A,N/A,N/A,N/A,74.56%,74.25%,92.00%,78.00%,54.00%,65.75%,84.50%,90.00%,66.00%,22.50%,53.44%,75.58%,57.28%,43.75%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,70.73%,46.05%,Salesforce,cc-by-nc-4.0 -57,40.41%,Hammer2.0-0.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-0.5b,N/A,N/A,N/A,N/A,66.79%,62.17%,80.00%,67.50%,57.50%,70.43%,53.21%,86.00%,80.00%,62.50%,52.42%,48.84%,44.07%,62.50%,41.67%,0.38%,0.50%,0.00%,0.50%,0.50%,85.37%,64.51%,MadeAgents,cc-by-nc-4.0 -58,40.36%,Claude-3-Haiku-20240307 (FC tools-2024-04-04),https://www.anthropic.com/news/claude-3-family,0.21,1.46,1.47,2.24,41.67%,70.67%,93.50%,2.00%,0.50%,47.52%,92.07%,92.00%,6.00%,0.00%,57.66%,74.03%,77.15%,0.00%,4.17%,20.62%,27.50%,15.00%,17.50%,22.50%,97.56%,29.37%,Anthropic,Proprietary -59,39.54%,MiniCPM3-4B (FC),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,N/A,N/A,N/A,63.19%,67.75%,74.00%,60.50%,50.50%,48.70%,44.79%,50.00%,40.00%,60.00%,59.88%,56.98%,49.47%,56.25%,33.33%,0.88%,1.50%,2.00%,0.00%,0.00%,58.54%,73.64%,openbmb,Apache-2.0 -60,39.19%,mistral-large-2407 (Prompt),https://mistral.ai/news/mistral-large-2407/,21.54,3.21,9.37,6.72,62.27%,46.58%,77.00%,70.00%,55.50%,56.93%,40.21%,80.00%,70.00%,37.50%,53.35%,45.74%,73.10%,68.75%,54.17%,9.62%,14.50%,11.00%,6.00%,7.00%,90.24%,22.38%,Mistral AI,Proprietary -61,36.52%,Nexusflow-Raven-v2 (FC),https://huggingface.co/Nexusflow/NexusRaven-V2-13B,N/A,3.53,2.33,8.1,46.15%,57.58%,53.00%,34.50%,39.50%,57.86%,47.43%,86.00%,38.00%,60.00%,53.49%,39.92%,38.48%,56.25%,41.67%,0.88%,1.50%,0.50%,0.50%,1.00%,65.85%,77.90%,Nexusflow,Apache 2.0 -62,36.48%,Meta-Llama-3-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,61.02%,63.08%,85.50%,51.50%,44.00%,66.70%,83.29%,82.00%,44.00%,57.50%,50.51%,60.85%,60.75%,37.50%,20.83%,0.00%,0.00%,0.00%,0.00%,0.00%,75.61%,27.39%,Meta,Meta Llama 3 Community -63,36.30%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,2.18,5.51,14.5,40.27,60.17%,50.17%,66.50%,60.50%,63.50%,59.66%,45.14%,78.00%,68.00%,47.50%,48.38%,50.00%,48.41%,56.25%,37.50%,1.25%,1.00%,3.50%,0.00%,0.50%,85.37%,50.23%,Google,Proprietary -64,31.04%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,N/A,N/A,N/A,59.73%,55.92%,80.00%,55.50%,47.50%,58.52%,51.07%,82.00%,56.00%,45.00%,39.00%,50.39%,40.50%,25.00%,20.83%,0.12%,0.00%,0.50%,0.00%,0.00%,75.61%,22.92%,Qwen,apache-2.0 -65,26.16%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,10.55,35.8,24.13,25.15%,49.58%,24.50%,11.50%,15.00%,31.23%,53.43%,34.00%,30.00%,7.50%,44.47%,48.45%,52.56%,31.25%,25.00%,2.75%,4.50%,2.00%,2.00%,2.50%,100.00%,43.86%,Meta,Meta Llama 3 Community -66,25.08%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,3.8,1.32,0.67,2.56,16.33%,10.83%,36.50%,11.50%,6.50%,9.38%,13.00%,18.00%,4.00%,2.50%,53.98%,18.22%,45.90%,12.50%,8.33%,0.25%,0.50%,0.00%,0.00%,0.50%,41.46%,81.47%,Mistral AI,Proprietary -67,24.81%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,N/A,N/A,N/A,39.94%,71.25%,85.50%,1.50%,1.50%,40.23%,74.93%,86.00%,0.00%,0.00%,38.34%,63.18%,54.19%,0.00%,0.00%,0.12%,0.00%,0.00%,0.00%,0.50%,97.56%,7.06%,Salesforce,cc-by-nc-4.0 -68,23.95%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,36.52%,56.08%,55.00%,0.00%,35.00%,49.93%,58.21%,58.00%,56.00%,27.50%,33.23%,48.06%,47.64%,31.25%,37.50%,0.00%,0.00%,0.00%,0.00%,0.00%,92.68%,5.29%,Meta,Meta Llama 3 Community -69,20.21%,Gemma-2-2b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,12.19%,7.25%,41.50%,0.00%,0.00%,12.88%,5.50%,46.00%,0.00%,0.00%,41.63%,11.24%,11.96%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,12.20%,79.93%,Google,gemma-terms-of-use -70,17.93%,Llama-3.2-1B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,22.77%,25.08%,32.00%,24.00%,10.00%,19.11%,27.93%,18.00%,28.00%,2.50%,29.85%,25.97%,4.82%,6.25%,4.17%,0.00%,0.00%,0.00%,0.00%,0.00%,48.78%,54.42%,Meta,Meta Llama 3 Community \ No newline at end of file +Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s),Latency Standard Deviation (s),Latency 95th Percentile (s),Non-Live AST Acc,Non-Live Simple AST,Non-Live Multiple AST,Non-Live Parallel AST,Non-Live Parallel Multiple AST,Non-Live Exec Acc,Non-Live Simple Exec,Non-Live Multiple Exec,Non-Live Parallel Exec,Non-Live Parallel Multiple Exec,Live Acc,Live Simple AST,Live Multiple AST,Live Parallel AST,Live Parallel Multiple AST,Multi Turn Acc,Multi Turn Base,Multi Turn Miss Func,Multi Turn Miss Param,Multi Turn Long Context,Multi Turn Composite,Relevance Detection,Irrelevance Detection,Organization,License +1,68.94%,GPT-4o-2024-08-06 (FC),https://openai.com/index/hello-gpt-4o/,8.22,1.51,4.53,3.29,85.90%,74.58%,92.50%,92.00%,84.50%,85.64%,87.07%,92.00%,86.00%,77.50%,75.43%,74.42%,75.12%,81.25%,70.83%,45.25%,54.50%,44.00%,34.50%,48.00%,N/A,63.41%,82.93%,OpenAI,Proprietary +2,66.68%,GPT-4-turbo-2024-04-09 (FC),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,33.0,2.54,5.74,4.98,84.67%,69.17%,91.00%,90.50%,88.00%,84.32%,88.29%,88.00%,86.00%,75.00%,76.23%,77.52%,77.63%,81.25%,66.67%,39.25%,54.50%,32.50%,29.50%,40.50%,N/A,73.17%,79.76%,OpenAI,Proprietary +3,65.61%,o1-preview-2024-09-12 (Prompt),https://openai.com/index/introducing-openai-o1-preview/,199.71,20.13,12.38,40.02,86.42%,78.17%,93.00%,89.50%,85.00%,88.88%,99.50%,92.00%,84.00%,80.00%,73.08%,80.62%,76.76%,75.00%,79.17%,36.62%,43.00%,38.50%,32.50%,32.50%,N/A,73.17%,74.60%,OpenAI,Proprietary +4,64.24%,o1-mini-2024-09-12 (Prompt),https://openai.com/index/openai-o1-mini-advancing-cost-efficient-reasoning/,29.59,7.81,7.99,17.34,81.31%,73.75%,90.00%,81.00%,80.50%,84.00%,88.50%,92.00%,78.00%,77.50%,75.39%,73.26%,71.07%,75.00%,62.50%,33.50%,40.50%,32.50%,26.50%,34.50%,N/A,48.78%,88.04%,OpenAI,Proprietary +5,61.53%,GPT-4-turbo-2024-04-09 (Prompt),https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo,51.78,1.43,3.31,2.74,91.46%,82.33%,95.00%,95.00%,93.50%,90.00%,99.50%,98.00%,80.00%,82.50%,69.04%,85.66%,84.57%,87.50%,75.00%,26.75%,36.50%,24.00%,17.00%,29.50%,N/A,82.93%,58.95%,OpenAI,Proprietary +6,61.29%,Claude-3.5-Sonnet-20240620 (FC),https://www.anthropic.com/news/claude-3-5-sonnet,8.61,3.88,5.16,6.6,70.04%,75.17%,93.50%,64.50%,47.00%,66.27%,97.57%,90.00%,40.00%,37.50%,74.68%,80.23%,76.76%,56.25%,58.33%,40.00%,46.00%,39.00%,35.00%,40.00%,N/A,68.29%,74.58%,Anthropic,Proprietary +7,60.72%,GPT-4o-mini-2024-07-18 (FC),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.51,1.64,14.77,2.73,84.25%,73.50%,90.50%,90.00%,83.00%,84.12%,83.50%,92.00%,86.00%,75.00%,70.19%,72.87%,74.45%,87.50%,70.83%,28.25%,40.50%,15.50%,24.00%,33.00%,N/A,80.49%,71.77%,OpenAI,Proprietary +8,59.94%,Functionary-Medium-v3.1 (FC),https://huggingface.co/meetkai/functionary-medium-v3.1,N/A,50.7,170.09,248.31,89.52%,76.08%,96.50%,95.00%,90.50%,89.77%,97.57%,94.00%,90.00%,77.50%,73.48%,79.46%,81.87%,68.75%,70.83%,17.25%,28.50%,12.50%,23.50%,4.50%,N/A,70.73%,73.32%,MeetKai,MIT +9,59.80%,Gemini-1.5-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,6.54,2.99,4.53,4.96,88.00%,77.50%,93.00%,92.50%,89.00%,91.41%,97.14%,94.00%,92.00%,82.50%,74.28%,79.84%,77.72%,87.50%,79.17%,16.25%,20.00%,15.00%,14.50%,15.50%,N/A,75.61%,75.10%,Google,Proprietary +10,59.27%,GPT-4o-mini-2024-07-18 (Prompt),https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/,0.79,1.21,8.2,2.13,86.23%,79.42%,93.00%,86.50%,86.00%,91.12%,100.00%,96.00%,86.00%,82.50%,74.63%,79.46%,74.35%,93.75%,70.83%,14.50%,20.00%,11.50%,10.00%,16.50%,N/A,75.61%,81.00%,OpenAI,Proprietary +11,59.13%,Gemini-1.5-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,5.43,2.09,2.84,4.04,87.96%,74.83%,95.00%,91.50%,90.50%,85.82%,78.79%,94.00%,88.00%,82.50%,72.41%,74.81%,74.64%,87.50%,70.83%,19.13%,26.00%,13.50%,19.50%,17.50%,N/A,73.17%,73.15%,Google,Proprietary +12,58.95%,xLAM-8x22b-r (FC),https://huggingface.co/Salesforce/xLAM-8x22b-r,N/A,N/A,N/A,N/A,88.15%,81.08%,93.00%,91.50%,87.00%,90.11%,96.43%,96.00%,88.00%,80.00%,71.97%,78.29%,80.14%,75.00%,62.50%,17.38%,25.50%,20.50%,15.00%,8.50%,N/A,85.37%,67.29%,Salesforce,cc-by-nc-4.0 +13,57.48%,Gemini-1.5-Pro-001 (Prompt),https://deepmind.google/technologies/gemini/pro/,5.97,1.86,3.5,2.91,83.94%,72.75%,90.00%,91.50%,81.50%,86.30%,93.71%,88.00%,86.00%,77.50%,73.83%,74.03%,72.32%,93.75%,75.00%,13.12%,14.50%,13.50%,13.50%,11.00%,N/A,63.41%,82.00%,Google,Proprietary +14,57.27%,ToolACE-8B (FC),https://huggingface.co/Team-ACE/ToolACE-8B,N/A,N/A,N/A,N/A,87.06%,76.25%,93.00%,90.00%,89.00%,89.52%,98.57%,94.00%,88.00%,77.50%,74.99%,66.67%,74.93%,81.25%,70.83%,7.88%,8.50%,10.50%,5.50%,7.00%,N/A,80.49%,85.71%,Huawei Noah & USTC,Apache-2.0 +15,57.19%,Gemini-1.5-Pro-001 (FC),https://deepmind.google/technologies/gemini/pro/,4.75,1.82,4.85,2.65,83.98%,69.42%,93.00%,91.00%,82.50%,88.39%,91.57%,92.00%,90.00%,80.00%,72.81%,73.64%,70.59%,81.25%,62.50%,12.75%,16.00%,11.00%,12.50%,11.50%,N/A,63.41%,80.18%,Google,Proprietary +16,56.23%,Gemini-1.5-Flash-001 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.49,1.17,1.24,1.71,85.31%,70.75%,90.00%,91.00%,89.50%,83.79%,79.14%,92.00%,84.00%,80.00%,68.24%,74.81%,76.18%,93.75%,79.17%,17.62%,25.50%,16.00%,12.00%,17.00%,N/A,87.80%,62.18%,Google,Proprietary +17,55.51%,mistral-large-2407 (FC),https://mistral.ai/news/mistral-large-2407/,12.52,2.96,8.83,5.16,86.62%,73.00%,92.00%,91.50%,90.00%,84.57%,73.79%,94.00%,88.00%,82.50%,68.37%,81.78%,79.27%,68.75%,75.00%,16.75%,23.00%,12.50%,15.50%,16.00%,N/A,75.61%,49.44%,Mistral AI,Proprietary +18,55.33%,Gemini-1.5-Flash-002 (Prompt),https://deepmind.google/technologies/gemini/flash/,0.42,1.14,1.55,1.7,79.69%,74.25%,91.50%,87.00%,66.00%,80.64%,93.57%,92.00%,82.00%,55.00%,73.21%,77.52%,74.73%,87.50%,58.33%,12.50%,15.00%,14.50%,9.00%,11.50%,N/A,78.05%,75.65%,Google,Proprietary +19,54.98%,Claude-3-Opus-20240229 (FC tools-2024-04-04),https://www.anthropic.com/news/claude-3-family,20.25,10.03,9.03,18.28,58.67%,68.67%,89.00%,41.00%,36.00%,62.05%,88.71%,88.00%,44.00%,27.50%,74.10%,74.81%,75.60%,50.00%,41.67%,28.12%,30.00%,29.50%,28.00%,25.00%,N/A,63.41%,77.80%,Anthropic,Proprietary +20,54.65%,Hammer2.0-7b (FC),https://huggingface.co/MadeAgents/Hammer2.0-7b,N/A,N/A,N/A,N/A,90.27%,80.58%,95.00%,93.50%,92.00%,89.25%,90.00%,94.00%,88.00%,85.00%,69.79%,74.42%,77.15%,81.25%,75.00%,5.62%,9.50%,2.00%,7.50%,3.50%,N/A,95.12%,68.46%,MadeAgents,cc-by-nc-4.0 +21,54.59%,Functionary-Small-v3.1 (FC),https://huggingface.co/meetkai/functionary-small-v3.1,N/A,1.62,1.79,4.11,86.42%,74.67%,94.50%,89.50%,87.00%,85.95%,88.79%,92.00%,88.00%,75.00%,70.41%,75.19%,75.89%,81.25%,62.50%,8.38%,15.50%,0.50%,12.50%,5.00%,N/A,85.37%,68.62%,MeetKai,MIT +22,54.55%,Gemini-1.5-Flash-002 (FC),https://deepmind.google/technologies/gemini/flash/,0.29,0.81,1.11,1.37,81.21%,65.83%,91.50%,80.50%,87.00%,73.04%,68.14%,90.00%,54.00%,80.00%,75.12%,71.32%,70.97%,81.25%,75.00%,9.75%,15.00%,5.00%,8.00%,11.00%,N/A,60.98%,86.90%,Google,Proprietary +23,54.29%,Llama-3.1-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,88.90%,76.58%,95.50%,93.50%,90.00%,89.34%,91.36%,96.00%,90.00%,80.00%,61.13%,77.13%,71.46%,87.50%,62.50%,14.25%,18.50%,15.50%,10.00%,13.00%,N/A,92.68%,58.38%,Meta,Meta Llama 3 Community +24,53.42%,GoGoAgent,https://gogoagent.ai,N/A,55.61,43.13,127.3,86.00%,75.50%,92.50%,92.00%,84.00%,88.05%,94.71%,94.00%,86.00%,77.50%,72.46%,71.32%,72.42%,87.50%,62.50%,0.25%,0.50%,0.50%,0.00%,0.00%,N/A,87.80%,81.89%,BitAgent,Proprietary +25,53.38%,Gemini-1.5-Flash-001 (FC),https://deepmind.google/technologies/gemini/flash/,0.29,0.64,0.79,0.91,77.21%,64.83%,94.50%,73.00%,76.50%,75.12%,61.50%,88.00%,76.00%,75.00%,72.81%,72.48%,73.67%,62.50%,58.33%,10.88%,13.00%,10.00%,13.00%,7.50%,N/A,63.41%,75.83%,Google,Proprietary +26,53.00%,Gemma-2-27b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,88.52%,81.08%,92.50%,91.00%,89.50%,87.89%,83.57%,96.00%,92.00%,80.00%,69.48%,79.46%,77.24%,68.75%,62.50%,2.12%,3.50%,2.00%,1.50%,1.50%,N/A,87.80%,68.76%,Google,gemma-terms-of-use +27,52.64%,Qwen2.5-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-7B-Instruct,N/A,N/A,N/A,N/A,85.79%,75.67%,96.00%,88.50%,83.00%,88.13%,94.50%,92.00%,86.00%,80.00%,65.97%,72.48%,72.32%,62.50%,66.67%,6.38%,8.00%,7.50%,6.00%,4.00%,N/A,92.68%,64.95%,Qwen,apache-2.0 +28,52.31%,xLAM-8x7b-r (FC),https://huggingface.co/Salesforce/xLAM-8x7b-r,N/A,N/A,N/A,N/A,68.85%,68.42%,88.00%,63.50%,55.50%,78.43%,87.71%,88.00%,68.00%,70.00%,69.12%,68.22%,76.76%,62.50%,54.17%,13.88%,18.50%,14.00%,12.50%,10.50%,N/A,87.80%,68.12%,Salesforce,cc-by-nc-4.0 +29,52.25%,Meta-Llama-3-70B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,87.17%,75.17%,95.50%,90.50%,87.50%,89.21%,95.86%,96.00%,80.00%,85.00%,66.15%,78.68%,79.65%,68.75%,66.67%,5.50%,9.50%,4.50%,5.50%,2.50%,N/A,92.68%,52.78%,Meta,Meta Llama 3 Community +30,52.20%,GPT-3.5-Turbo-0125 (FC),https://platform.openai.com/docs/models/gpt-3-5-turbo,1.39,1.08,0.99,1.86,84.12%,75.50%,93.00%,88.00%,80.00%,84.11%,95.43%,90.00%,86.00%,65.00%,61.22%,74.42%,77.82%,43.75%,50.00%,16.88%,28.00%,13.00%,17.00%,9.50%,N/A,97.56%,35.16%,OpenAI,Proprietary +31,51.87%,xLAM-7b-r (FC),https://huggingface.co/Salesforce/xLAM-7b-r,N/A,N/A,N/A,N/A,81.40%,73.08%,93.50%,79.50%,79.50%,83.46%,76.86%,92.00%,90.00%,75.00%,67.88%,71.32%,73.48%,31.25%,58.33%,6.88%,11.50%,7.00%,6.00%,3.00%,N/A,97.56%,64.05%,Salesforce,cc-by-nc-4.0 +32,51.68%,Open-Mixtral-8x22b (Prompt),https://mistral.ai/news/mixtral-8x22b/,11.89,3.65,16.23,4.25,86.92%,77.67%,92.50%,90.00%,87.50%,88.23%,91.43%,96.00%,88.00%,77.50%,68.46%,72.87%,61.33%,81.25%,66.67%,0.50%,0.50%,0.50%,0.00%,1.00%,N/A,65.85%,74.85%,Mistral AI,Proprietary +33,51.50%,Gemma-2-9b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,84.38%,74.50%,92.00%,88.00%,83.00%,85.18%,84.21%,94.00%,90.00%,72.50%,69.21%,73.64%,73.58%,56.25%,58.33%,0.75%,1.00%,2.00%,0.00%,0.00%,N/A,87.80%,72.45%,Google,gemma-terms-of-use +34,50.96%,Mistral-Medium-2312 (Prompt),https://docs.mistral.ai/guides/model-selection/,10.92,3.88,10.24,8.75,74.02%,70.58%,91.50%,65.50%,68.50%,81.73%,95.43%,92.00%,72.00%,67.50%,73.10%,68.60%,73.00%,81.25%,50.00%,0.50%,1.50%,0.00%,0.00%,0.50%,N/A,60.98%,95.21%,Mistral AI,Proprietary +35,50.77%,Command-R-Plus (Prompt) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,15.17,1.23,0.84,2.18,80.90%,71.08%,91.50%,82.00%,79.00%,85.07%,93.29%,90.00%,82.00%,75.00%,69.75%,66.67%,70.30%,68.75%,70.83%,0.38%,1.00%,0.00%,0.00%,0.50%,N/A,73.17%,72.83%,Cohere For AI,cc-by-nc-4.0 +36,50.75%,Gorilla-OpenFunctions-v2 (FC),https://gorilla.cs.berkeley.edu/blogs/7_open_functions_v2.html,N/A,6.59,8.42,16.0,86.29%,77.67%,95.00%,89.00%,83.50%,86.09%,95.86%,96.00%,80.00%,72.50%,67.44%,73.64%,58.73%,68.75%,41.67%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,73.17%,75.05%,Gorilla LLM,Apache 2.0 +37,49.86%,Llama-3.1-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,83.62%,73.00%,94.50%,83.50%,83.50%,87.29%,85.64%,96.00%,90.00%,77.50%,57.93%,71.32%,72.23%,50.00%,45.83%,10.50%,14.00%,10.50%,8.00%,9.50%,N/A,78.05%,41.62%,Meta,Meta Llama 3 Community +38,49.68%,Hammer2.0-1.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-1.5b,N/A,N/A,N/A,N/A,84.06%,75.25%,90.50%,88.00%,82.50%,88.95%,93.29%,92.00%,88.00%,82.50%,63.22%,70.54%,68.56%,56.25%,66.67%,1.38%,2.50%,0.50%,1.00%,1.50%,N/A,92.68%,60.64%,MadeAgents,cc-by-nc-4.0 +39,49.56%,Open-Mistral-Nemo-2407 (FC),https://mistral.ai/news/mistral-nemo/,1.16,1.28,7.77,2.34,81.21%,63.33%,92.00%,86.50%,83.00%,77.04%,55.64%,90.00%,90.00%,72.50%,62.37%,71.71%,67.79%,62.50%,66.67%,8.00%,12.00%,5.00%,10.50%,4.50%,N/A,60.98%,62.40%,Mistral AI,Proprietary +40,48.29%,Granite-20b-FunctionCalling (FC),https://huggingface.co/ibm-granite/granite-20b-functioncalling,N/A,N/A,N/A,N/A,82.33%,72.83%,91.50%,84.50%,80.50%,85.91%,85.64%,92.00%,86.00%,80.00%,57.49%,65.12%,55.35%,43.75%,54.17%,2.75%,5.00%,1.50%,3.00%,1.50%,N/A,95.12%,72.55%,IBM,Apache-2.0 +41,47.80%,GPT-4o-2024-08-06 (Prompt),https://openai.com/index/hello-gpt-4o/,11.06,1.2,2.78,2.66,49.35%,32.42%,48.00%,74.00%,43.00%,69.93%,49.71%,82.00%,78.00%,70.00%,62.19%,42.64%,42.82%,25.00%,41.67%,17.62%,21.50%,14.00%,15.00%,20.00%,N/A,36.59%,94.19%,OpenAI,Proprietary +42,47.12%,Qwen2.5-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct,N/A,N/A,N/A,N/A,75.19%,70.25%,85.50%,73.50%,71.50%,82.82%,72.79%,94.00%,82.00%,82.50%,61.71%,64.73%,59.88%,50.00%,41.67%,1.50%,2.00%,2.00%,1.00%,1.00%,N/A,75.61%,67.17%,Qwen,apache-2.0 +43,47.00%,GPT-3.5-Turbo-0125 (Prompt),https://platform.openai.com/docs/models/gpt-3-5-turbo,2.1,0.96,2.31,1.54,65.04%,62.67%,83.00%,65.50%,49.00%,67.68%,46.21%,90.00%,72.00%,62.50%,67.48%,63.57%,64.61%,68.75%,54.17%,5.75%,7.50%,7.00%,4.00%,4.50%,N/A,80.49%,75.47%,OpenAI,Proprietary +44,46.52%,Hermes-2-Pro-Llama-3-70B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-70B,N/A,N/A,N/A,N/A,78.85%,59.92%,80.00%,88.00%,87.50%,80.45%,76.29%,82.00%,86.00%,77.50%,60.51%,63.18%,53.04%,56.25%,66.67%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,60.98%,70.27%,NousResearch,apache-2.0 +45,45.44%,Hermes-2-Pro-Llama-3-8B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B,N/A,N/A,N/A,N/A,76.54%,64.17%,89.50%,79.50%,73.00%,75.48%,69.93%,94.00%,78.00%,60.00%,61.79%,67.44%,64.42%,56.25%,45.83%,0.38%,1.00%,0.00%,0.50%,0.00%,N/A,56.10%,58.50%,NousResearch,apache-2.0 +46,44.70%,Command-R-Plus (FC) (Original),https://txt.cohere.com/command-r-plus-microsoft-azure,5.52,3.21,7.41,6.2,76.83%,66.33%,90.00%,82.00%,69.00%,78.61%,88.93%,88.00%,80.00%,57.50%,57.26%,66.67%,60.56%,56.25%,50.00%,1.38%,1.50%,0.00%,1.50%,2.50%,N/A,92.68%,53.32%,Cohere For AI,cc-by-nc-4.0 +47,44.12%,Qwen2-7B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-7B-Instruct,N/A,N/A,N/A,N/A,74.85%,67.42%,87.50%,71.00%,73.50%,81.70%,86.79%,88.00%,82.00%,70.00%,54.24%,59.30%,62.20%,50.00%,66.67%,2.63%,3.50%,3.50%,1.50%,2.00%,N/A,87.80%,47.41%,Qwen,apache-2.0 +48,43.74%,DBRX-Instruct (Prompt),https://www.databricks.com/blog/introducing-dbrx-new-state-art-open-llm,8.09,6.21,10.4,22.38,67.04%,72.17%,91.50%,56.50%,48.00%,75.04%,90.14%,88.00%,62.00%,60.00%,62.33%,74.81%,71.65%,75.00%,58.33%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,87.80%,48.98%,Databricks,Databricks Open Model +49,43.67%,Mistral-small-2402 (FC),https://docs.mistral.ai/guides/model-selection/,3.34,1.64,7.26,2.84,57.27%,67.08%,93.50%,20.00%,48.50%,53.77%,87.07%,92.00%,16.00%,20.00%,70.19%,63.57%,71.46%,12.50%,12.50%,2.12%,3.50%,0.00%,2.50%,2.50%,N/A,82.93%,78.43%,Mistral AI,Proprietary +50,43.56%,Gemini-1.0-Pro-002 (FC),https://deepmind.google/technologies/gemini/pro/,1.74,2.41,8.48,4.44,56.52%,67.08%,94.00%,39.50%,25.50%,65.14%,84.07%,88.00%,66.00%,22.50%,66.10%,75.19%,65.96%,50.00%,37.50%,2.50%,4.00%,2.50%,2.50%,1.00%,N/A,68.29%,68.33%,Google,Proprietary +51,43.37%,Open-Mistral-Nemo-2407 (Prompt),https://mistral.ai/news/mistral-nemo/,1.73,1.01,0.68,1.96,85.60%,77.42%,93.50%,87.00%,84.50%,91.23%,95.93%,96.00%,88.00%,85.00%,50.33%,78.29%,74.54%,75.00%,62.50%,0.12%,0.00%,0.50%,0.00%,0.00%,N/A,90.24%,10.16%,Mistral AI,Proprietary +52,43.35%,Open-Mixtral-8x7b (Prompt),https://mistral.ai/news/mixtral-of-experts/,2.72,1.86,6.57,3.26,57.94%,68.75%,86.00%,40.00%,37.00%,65.91%,71.14%,88.00%,52.00%,52.50%,64.95%,57.36%,65.00%,68.75%,50.00%,0.62%,1.50%,0.00%,0.00%,1.00%,N/A,68.29%,76.16%,Mistral AI,Proprietary +53,42.35%,Llama-3.2-3B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,77.77%,64.08%,90.00%,80.50%,76.50%,69.41%,78.14%,92.00%,50.00%,57.50%,50.91%,47.67%,44.74%,0.00%,29.17%,2.12%,1.50%,2.00%,2.00%,3.00%,N/A,63.41%,68.81%,Meta,Meta Llama 3 Community +54,42.17%,Hermes-2-Pro-Mistral-7B (FC),https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B,N/A,N/A,N/A,N/A,72.83%,61.33%,87.50%,78.50%,64.00%,77.30%,61.71%,94.00%,86.00%,67.50%,56.46%,64.73%,59.40%,43.75%,37.50%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,75.61%,38.95%,NousResearch,apache-2.0 +55,42.05%,Open-Mixtral-8x22b (FC),https://mistral.ai/news/mixtral-8x22b/,6.99,3.67,19.78,5.7,61.08%,71.33%,94.50%,10.50%,68.00%,63.82%,84.29%,94.00%,22.00%,55.00%,66.86%,73.26%,72.32%,6.25%,41.67%,0.62%,1.00%,0.00%,1.00%,0.50%,N/A,82.93%,43.94%,Mistral AI,Proprietary +56,40.44%,xLAM-7b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-7b-fc-r,N/A,N/A,N/A,N/A,74.56%,74.25%,92.00%,78.00%,54.00%,65.75%,84.50%,90.00%,66.00%,22.50%,53.44%,75.58%,57.28%,43.75%,25.00%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,70.73%,46.05%,Salesforce,cc-by-nc-4.0 +57,40.41%,Hammer2.0-0.5b (FC),https://huggingface.co/MadeAgents/Hammer2.0-0.5b,N/A,N/A,N/A,N/A,66.79%,62.17%,80.00%,67.50%,57.50%,70.43%,53.21%,86.00%,80.00%,62.50%,52.42%,48.84%,44.07%,62.50%,41.67%,0.38%,0.50%,0.00%,0.50%,0.50%,N/A,85.37%,64.51%,MadeAgents,cc-by-nc-4.0 +58,40.36%,Claude-3-Haiku-20240307 (FC tools-2024-04-04),https://www.anthropic.com/news/claude-3-family,0.21,1.46,1.47,2.24,41.67%,70.67%,93.50%,2.00%,0.50%,47.52%,92.07%,92.00%,6.00%,0.00%,57.66%,74.03%,77.15%,0.00%,4.17%,20.62%,27.50%,15.00%,17.50%,22.50%,N/A,97.56%,29.37%,Anthropic,Proprietary +59,39.54%,MiniCPM3-4B (FC),https://huggingface.co/openbmb/MiniCPM3-4B,N/A,N/A,N/A,N/A,63.19%,67.75%,74.00%,60.50%,50.50%,48.70%,44.79%,50.00%,40.00%,60.00%,59.88%,56.98%,49.47%,56.25%,33.33%,0.88%,1.50%,2.00%,0.00%,0.00%,N/A,58.54%,73.64%,openbmb,Apache-2.0 +60,39.19%,mistral-large-2407 (Prompt),https://mistral.ai/news/mistral-large-2407/,21.54,3.21,9.37,6.72,62.27%,46.58%,77.00%,70.00%,55.50%,56.93%,40.21%,80.00%,70.00%,37.50%,53.35%,45.74%,73.10%,68.75%,54.17%,9.62%,14.50%,11.00%,6.00%,7.00%,N/A,90.24%,22.38%,Mistral AI,Proprietary +61,36.52%,Nexusflow-Raven-v2 (FC),https://huggingface.co/Nexusflow/NexusRaven-V2-13B,N/A,3.53,2.33,8.1,46.15%,57.58%,53.00%,34.50%,39.50%,57.86%,47.43%,86.00%,38.00%,60.00%,53.49%,39.92%,38.48%,56.25%,41.67%,0.88%,1.50%,0.50%,0.50%,1.00%,N/A,65.85%,77.90%,Nexusflow,Apache 2.0 +62,36.48%,Meta-Llama-3-8B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,61.02%,63.08%,85.50%,51.50%,44.00%,66.70%,83.29%,82.00%,44.00%,57.50%,50.51%,60.85%,60.75%,37.50%,20.83%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,75.61%,27.39%,Meta,Meta Llama 3 Community +63,36.30%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,2.18,5.51,14.5,40.27,60.17%,50.17%,66.50%,60.50%,63.50%,59.66%,45.14%,78.00%,68.00%,47.50%,48.38%,50.00%,48.41%,56.25%,37.50%,1.25%,1.00%,3.50%,0.00%,0.50%,N/A,85.37%,50.23%,Google,Proprietary +64,31.04%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,N/A,N/A,N/A,59.73%,55.92%,80.00%,55.50%,47.50%,58.52%,51.07%,82.00%,56.00%,45.00%,39.00%,50.39%,40.50%,25.00%,20.83%,0.12%,0.00%,0.50%,0.00%,0.00%,N/A,75.61%,22.92%,Qwen,apache-2.0 +65,26.16%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,10.55,35.8,24.13,25.15%,49.58%,24.50%,11.50%,15.00%,31.23%,53.43%,34.00%,30.00%,7.50%,44.47%,48.45%,52.56%,31.25%,25.00%,2.75%,4.50%,2.00%,2.00%,2.50%,N/A,100.00%,43.86%,Meta,Meta Llama 3 Community +66,25.08%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,3.8,1.32,0.67,2.56,16.33%,10.83%,36.50%,11.50%,6.50%,9.38%,13.00%,18.00%,4.00%,2.50%,53.98%,18.22%,45.90%,12.50%,8.33%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,41.46%,81.47%,Mistral AI,Proprietary +67,24.81%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,N/A,N/A,N/A,39.94%,71.25%,85.50%,1.50%,1.50%,40.23%,74.93%,86.00%,0.00%,0.00%,38.34%,63.18%,54.19%,0.00%,0.00%,0.12%,0.00%,0.00%,0.00%,0.50%,N/A,97.56%,7.06%,Salesforce,cc-by-nc-4.0 +68,23.95%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,36.52%,56.08%,55.00%,0.00%,35.00%,49.93%,58.21%,58.00%,56.00%,27.50%,33.23%,48.06%,47.64%,31.25%,37.50%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,92.68%,5.29%,Meta,Meta Llama 3 Community +69,20.21%,Gemma-2-2b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,12.19%,7.25%,41.50%,0.00%,0.00%,12.88%,5.50%,46.00%,0.00%,0.00%,41.63%,11.24%,11.96%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,12.20%,79.93%,Google,gemma-terms-of-use +70,17.93%,Llama-3.2-1B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,22.77%,25.08%,32.00%,24.00%,10.00%,19.11%,27.93%,18.00%,28.00%,2.50%,29.85%,25.97%,4.82%,6.25%,4.17%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,48.78%,54.42%,Meta,Meta Llama 3 Community \ No newline at end of file From bea8452c20b137213a15f7b3464b46e8bfc57c3d Mon Sep 17 00:00:00 2001 From: "Huanzhi (Hans) Mao" Date: Mon, 18 Nov 2024 16:22:07 -0800 Subject: [PATCH 6/6] update data.csv --- data_multi_turn.csv | 64 ++++++++++++++++++++++----------------------- data_overall.csv | 6 ++--- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/data_multi_turn.csv b/data_multi_turn.csv index 0721afea6..74f06af72 100644 --- a/data_multi_turn.csv +++ b/data_multi_turn.csv @@ -34,38 +34,38 @@ Rank,Model,Multi Turn Overall Acc,Base,Miss Func,Miss Param,Long Context 33,GPT-3.5-Turbo-0125 (Prompt),5.75%,7.50%,7.00%,4.00%,4.50% 34,Hammer2.0-7b (FC),5.62%,9.50%,2.00%,7.50%,3.50% 35,Meta-Llama-3-70B-Instruct (Prompt),5.50%,9.50%,4.50%,5.50%,2.50% -36,Llama-3.1-70B-Instruct (FC),2.75%,4.50%,2.00%,2.00%,2.50% -37,Granite-20b-FunctionCalling (FC),2.75%,5.00%,1.50%,3.00%,1.50% -38,Qwen2-7B-Instruct (Prompt),2.63%,3.50%,3.50%,1.50%,2.00% -39,Gemini-1.0-Pro-002 (FC),2.50%,4.00%,2.50%,2.50%,1.00% -40,Mistral-small-2402 (FC),2.12%,3.50%,0.00%,2.50%,2.50% -41,Gemma-2-27b-it (Prompt),2.12%,3.50%,2.00%,1.50%,1.50% -42,Llama-3.2-3B-Instruct (Prompt),2.12%,1.50%,2.00%,2.00%,3.00% -43,Qwen2.5-1.5B-Instruct (Prompt),1.50%,2.00%,2.00%,1.00%,1.00% -44,Hammer2.0-1.5b (FC),1.38%,2.50%,0.50%,1.00%,1.50% -45,Command-R-Plus (FC) (Original),1.38%,1.50%,0.00%,1.50%,2.50% -46,Gemini-1.0-Pro-002 (Prompt),1.25%,1.00%,3.50%,0.00%,0.50% -47,MiniCPM3-4B (FC),0.88%,1.50%,2.00%,0.00%,0.00% -48,Nexusflow-Raven-v2 (FC),0.88%,1.50%,0.50%,0.50%,1.00% -49,Gemma-2-9b-it (Prompt),0.75%,1.00%,2.00%,0.00%,0.00% -50,Open-Mixtral-8x22b (FC),0.62%,1.00%,0.00%,1.00%,0.50% -51,Open-Mixtral-8x7b (Prompt),0.62%,1.50%,0.00%,0.00%,1.00% -52,Mistral-Medium-2312 (Prompt),0.50%,1.50%,0.00%,0.00%,0.50% -53,Open-Mixtral-8x22b (Prompt),0.50%,0.50%,0.50%,0.00%,1.00% -54,Hermes-2-Pro-Llama-3-8B (FC),0.38%,1.00%,0.00%,0.50%,0.00% -55,Hammer2.0-0.5b (FC),0.38%,0.50%,0.00%,0.50%,0.50% -56,Command-R-Plus (Prompt) (Original),0.38%,1.00%,0.00%,0.00%,0.50% -57,Hermes-2-Pro-Mistral-7B (FC),0.25%,0.50%,0.00%,0.00%,0.50% -58,Mistral-Small-2402 (Prompt),0.25%,0.50%,0.00%,0.00%,0.50% -59,Hermes-2-Pro-Llama-3-70B (FC),0.25%,0.50%,0.00%,0.00%,0.50% -60,GoGoAgent,0.25%,0.50%,0.50%,0.00%,0.00% -61,Open-Mistral-Nemo-2407 (Prompt),0.12%,0.00%,0.50%,0.00%,0.00% -62,xLAM-1b-fc-r (FC),0.12%,0.00%,0.00%,0.00%,0.50% -63,Qwen2-1.5B-Instruct (Prompt),0.12%,0.00%,0.50%,0.00%,0.00% -64,DBRX-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% -65,Gemma-2-2b-it (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% -66,Llama-3.2-1B-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% -67,Llama-3.1-8B-Instruct (FC),0.00%,0.00%,0.00%,0.00%,0.00% +36,Llama-3.1-8B-Instruct (FC),4.00%,4.50%,3.50%,5.00%,3.00% +37,Llama-3.1-70B-Instruct (FC),2.75%,4.50%,2.00%,2.00%,2.50% +38,Granite-20b-FunctionCalling (FC),2.75%,5.00%,1.50%,3.00%,1.50% +39,Qwen2-7B-Instruct (Prompt),2.63%,3.50%,3.50%,1.50%,2.00% +40,Gemini-1.0-Pro-002 (FC),2.50%,4.00%,2.50%,2.50%,1.00% +41,Mistral-small-2402 (FC),2.12%,3.50%,0.00%,2.50%,2.50% +42,Gemma-2-27b-it (Prompt),2.12%,3.50%,2.00%,1.50%,1.50% +43,Llama-3.2-3B-Instruct (Prompt),2.12%,1.50%,2.00%,2.00%,3.00% +44,Qwen2.5-1.5B-Instruct (Prompt),1.50%,2.00%,2.00%,1.00%,1.00% +45,Hammer2.0-1.5b (FC),1.38%,2.50%,0.50%,1.00%,1.50% +46,Command-R-Plus (FC) (Original),1.38%,1.50%,0.00%,1.50%,2.50% +47,Gemini-1.0-Pro-002 (Prompt),1.25%,1.00%,3.50%,0.00%,0.50% +48,MiniCPM3-4B (FC),0.88%,1.50%,2.00%,0.00%,0.00% +49,Nexusflow-Raven-v2 (FC),0.88%,1.50%,0.50%,0.50%,1.00% +50,Gemma-2-9b-it (Prompt),0.75%,1.00%,2.00%,0.00%,0.00% +51,Open-Mixtral-8x22b (FC),0.62%,1.00%,0.00%,1.00%,0.50% +52,Open-Mixtral-8x7b (Prompt),0.62%,1.50%,0.00%,0.00%,1.00% +53,Mistral-Medium-2312 (Prompt),0.50%,1.50%,0.00%,0.00%,0.50% +54,Open-Mixtral-8x22b (Prompt),0.50%,0.50%,0.50%,0.00%,1.00% +55,Hermes-2-Pro-Llama-3-8B (FC),0.38%,1.00%,0.00%,0.50%,0.00% +56,Hammer2.0-0.5b (FC),0.38%,0.50%,0.00%,0.50%,0.50% +57,Command-R-Plus (Prompt) (Original),0.38%,1.00%,0.00%,0.00%,0.50% +58,Hermes-2-Pro-Mistral-7B (FC),0.25%,0.50%,0.00%,0.00%,0.50% +59,Mistral-Small-2402 (Prompt),0.25%,0.50%,0.00%,0.00%,0.50% +60,Hermes-2-Pro-Llama-3-70B (FC),0.25%,0.50%,0.00%,0.00%,0.50% +61,GoGoAgent,0.25%,0.50%,0.50%,0.00%,0.00% +62,Open-Mistral-Nemo-2407 (Prompt),0.12%,0.00%,0.50%,0.00%,0.00% +63,xLAM-1b-fc-r (FC),0.12%,0.00%,0.00%,0.00%,0.50% +64,Qwen2-1.5B-Instruct (Prompt),0.12%,0.00%,0.50%,0.00%,0.00% +65,DBRX-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% +66,Gemma-2-2b-it (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% +67,Llama-3.2-1B-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% 68,xLAM-7b-fc-r (FC),0.00%,0.00%,0.00%,0.00%,0.00% 69,Gorilla-OpenFunctions-v2 (FC),0.00%,0.00%,0.00%,0.00%,0.00% 70,Meta-Llama-3-8B-Instruct (Prompt),0.00%,0.00%,0.00%,0.00%,0.00% \ No newline at end of file diff --git a/data_overall.csv b/data_overall.csv index 8456a8c37..507745fb1 100644 --- a/data_overall.csv +++ b/data_overall.csv @@ -64,8 +64,8 @@ Rank,Overall Acc,Model,Model Link,Cost ($ Per 1k Function Calls),Latency Mean (s 63,36.30%,Gemini-1.0-Pro-002 (Prompt),https://deepmind.google/technologies/gemini/pro/,2.18,5.51,14.5,40.27,60.17%,50.17%,66.50%,60.50%,63.50%,59.66%,45.14%,78.00%,68.00%,47.50%,48.38%,50.00%,48.41%,56.25%,37.50%,1.25%,1.00%,3.50%,0.00%,0.50%,N/A,85.37%,50.23%,Google,Proprietary 64,31.04%,Qwen2-1.5B-Instruct (Prompt),https://huggingface.co/Qwen/Qwen2-1.5B-Instruct,N/A,N/A,N/A,N/A,59.73%,55.92%,80.00%,55.50%,47.50%,58.52%,51.07%,82.00%,56.00%,45.00%,39.00%,50.39%,40.50%,25.00%,20.83%,0.12%,0.00%,0.50%,0.00%,0.00%,N/A,75.61%,22.92%,Qwen,apache-2.0 65,26.16%,Llama-3.1-70B-Instruct (FC),https://llama.meta.com/llama3,N/A,10.55,35.8,24.13,25.15%,49.58%,24.50%,11.50%,15.00%,31.23%,53.43%,34.00%,30.00%,7.50%,44.47%,48.45%,52.56%,31.25%,25.00%,2.75%,4.50%,2.00%,2.00%,2.50%,N/A,100.00%,43.86%,Meta,Meta Llama 3 Community -66,25.08%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,3.8,1.32,0.67,2.56,16.33%,10.83%,36.50%,11.50%,6.50%,9.38%,13.00%,18.00%,4.00%,2.50%,53.98%,18.22%,45.90%,12.50%,8.33%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,41.46%,81.47%,Mistral AI,Proprietary -67,24.81%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,N/A,N/A,N/A,39.94%,71.25%,85.50%,1.50%,1.50%,40.23%,74.93%,86.00%,0.00%,0.00%,38.34%,63.18%,54.19%,0.00%,0.00%,0.12%,0.00%,0.00%,0.00%,0.50%,N/A,97.56%,7.06%,Salesforce,cc-by-nc-4.0 -68,23.95%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,36.52%,56.08%,55.00%,0.00%,35.00%,49.93%,58.21%,58.00%,56.00%,27.50%,33.23%,48.06%,47.64%,31.25%,37.50%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,92.68%,5.29%,Meta,Meta Llama 3 Community +66,25.28%,Llama-3.1-8B-Instruct (FC),https://llama.meta.com/llama3,N/A,7.16,13.45,17.89,36.52%,56.08%,55.00%,0.00%,35.00%,49.93%,58.21%,58.00%,56.00%,27.50%,33.23%,48.06%,47.64%,31.25%,37.50%,4.00%,4.50%,3.50%,5.00%,3.00%,N/A,92.68%,5.29%,Meta,Meta Llama 3 Community +67,25.08%,Mistral-Small-2402 (Prompt),https://docs.mistral.ai/guides/model-selection/,3.8,1.32,0.67,2.56,16.33%,10.83%,36.50%,11.50%,6.50%,9.38%,13.00%,18.00%,4.00%,2.50%,53.98%,18.22%,45.90%,12.50%,8.33%,0.25%,0.50%,0.00%,0.00%,0.50%,N/A,41.46%,81.47%,Mistral AI,Proprietary +68,24.81%,xLAM-1b-fc-r (FC),https://huggingface.co/Salesforce/xLAM-1b-fc-r,N/A,N/A,N/A,N/A,39.94%,71.25%,85.50%,1.50%,1.50%,40.23%,74.93%,86.00%,0.00%,0.00%,38.34%,63.18%,54.19%,0.00%,0.00%,0.12%,0.00%,0.00%,0.00%,0.50%,N/A,97.56%,7.06%,Salesforce,cc-by-nc-4.0 69,20.21%,Gemma-2-2b-it (Prompt),https://blog.google/technology/developers/gemma-open-models/,N/A,N/A,N/A,N/A,12.19%,7.25%,41.50%,0.00%,0.00%,12.88%,5.50%,46.00%,0.00%,0.00%,41.63%,11.24%,11.96%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,12.20%,79.93%,Google,gemma-terms-of-use 70,17.93%,Llama-3.2-1B-Instruct (Prompt),https://llama.meta.com/llama3,N/A,N/A,N/A,N/A,22.77%,25.08%,32.00%,24.00%,10.00%,19.11%,27.93%,18.00%,28.00%,2.50%,29.85%,25.97%,4.82%,6.25%,4.17%,0.00%,0.00%,0.00%,0.00%,0.00%,N/A,48.78%,54.42%,Meta,Meta Llama 3 Community \ No newline at end of file