This repository has been archived by the owner on Jan 7, 2025. It is now read-only.

Benchmarks moved out of test dir #472

Merged 2 commits on Jan 11, 2024
26 changes: 26 additions & 0 deletions benchmarks/README.md
@@ -0,0 +1,26 @@
# Benchmarks

This directory contains benchmarks that measure Mentat's performance on different tasks.

## Running Exercism Benchmarks

```
./benchmarks/exercism_practice.py
```

Flags that control how the benchmarks run are defined in [arg_parser.py](arg_parser.py) and default to conservative values, so benchmarks run without any flags are relatively quick and cheap. To run the Exercism benchmark with multiple workers on all of the Clojure exercises, allowing one retry, run the following:
```
./benchmarks/exercism_practice.py --max_benchmarks 134 --max_iterations 2 --max_workers 2 --language clojure
```

Warning: if you increase `max_workers` much further, you'll start to hit rate limits.

## Running Real World Benchmarks

```
./benchmarks/benchmark_runner.py
```
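The shared flags from [arg_parser.py](arg_parser.py) apply here as well; for example (the benchmark name and retry count below are illustrative, not defaults):
```
./benchmarks/benchmark_runner.py --benchmarks <benchmark_name> --retries 2
```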

## Making Real World Benchmarks

Real world benchmarks can be either [samples](benchmarks/mentat/sample_15223222005645d08b81f093e51d52fe.json) or [Python files](benchmarks/mentat/).
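
For the Python-file flavor, a minimal sketch of what a benchmark module might contain is shown below. This is an illustration only: the file name `my_benchmark.py` and the attribute names (`title`, `prompts`, `verify`) are assumptions, not the schema the runner enforces, so check what `benchmark_runner.py` reads from each module after `dynamic_import` before writing one.
```
# benchmarks/benchmarks/mentat/my_benchmark.py -- hypothetical example; the
# file name and attribute names here are assumptions, not the actual schema.
title = "My benchmark"                        # human-readable name for reports
prompts = ["Refactor foo() to use pathlib."]  # what to ask Mentat to do


def verify():
    # Return True if the edited repo passes whatever check this benchmark cares about.
    return True
```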
File renamed without changes.
65 changes: 65 additions & 0 deletions benchmarks/arg_parser.py
@@ -0,0 +1,65 @@
import argparse


def common_benchmark_parser():
    parser = argparse.ArgumentParser(description="Run exercism benchmarks")
    parser.add_argument(
        "--refresh_repo",
        action="store_true",
        default=False,
        help="When set local changes will be discarded.",
    )
    parser.add_argument(
        "--language",
        default="python",
        type=str,
        help="Which exercism language to do exercises for",
    )
    parser.add_argument(
        "--benchmarks",
        action="append",
        nargs="*",
        default=[[]],
        help=(
            "Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
            " depends on benchmark."
        ),
    )
    parser.add_argument(
        "--max_benchmarks",
        default=1,
        type=int,
        help="The maximum number of exercises to run",
    )
    parser.add_argument(
        "--max_iterations",
        default=1,
        type=int,
        help="Number of times to rerun mentat with error messages",
    )
    parser.add_argument(
        "--max_workers",
        default=1,
        type=int,
        help="Number of workers to use for multiprocessing",
    )
    parser.add_argument(
        "--retries",
        action="store",
        default=1,
        type=int,
        help="Number of times to retry a benchmark",
    )
    parser.add_argument(
        "--repo",
        action="store",
        default="mentat",
        help="For benchmarks that are evaluated against a repo",
    )
    parser.add_argument(
        "--evaluate_baseline",
        action="store_true",
        help="Evaluate the baseline for the benchmark",
    )

    return parser
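
For reference, a minimal sketch of how a benchmark script consumes this parser; the final print line is illustrative only, while the real runner scripts (see the new `__main__` blocks below) pass the parsed values into their async entry points:
```
from benchmarks.arg_parser import common_benchmark_parser

# Parse the shared benchmark flags, e.g.
#   ./benchmarks/exercism_practice.py --language clojure --max_benchmarks 10
args = common_benchmark_parser().parse_args()
print(args.language, args.max_benchmarks, args.max_workers, args.retries)
```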
tests/benchmarks/benchmark_result_summary.py → benchmarks/benchmark_result_summary.py
@@ -6,7 +6,7 @@
import attr
from jinja2 import Environment, FileSystemLoader, select_autoescape

from tests.benchmarks.benchmark_result import BenchmarkResult
from benchmarks.benchmark_result import BenchmarkResult


class BenchmarkResultSummary:
@@ -124,9 +124,7 @@ def summary_string(self) -> str:
    def render_results(self):
        env = Environment(
            loader=FileSystemLoader(
                os.path.join(
                    os.path.dirname(__file__), "../../mentat/resources/templates"
                )
                os.path.join(os.path.dirname(__file__), "../mentat/resources/templates")
            ),
            autoescape=select_autoescape(["html", "xml"]),
        )
29 changes: 17 additions & 12 deletions tests/benchmarks/benchmark_runner.py → benchmarks/benchmark_runner.py
100644 → 100755
@@ -1,26 +1,26 @@
#!/usr/bin/env python
import asyncio
import importlib.util
import json
import os
import re
from pathlib import Path

import pytest
from openai.types.chat import (
    ChatCompletionAssistantMessageParam,
    ChatCompletionUserMessageParam,
)
from openai.types.chat.completion_create_params import ResponseFormat

from benchmarks.arg_parser import common_benchmark_parser
from benchmarks.benchmark_result import BenchmarkResult
from benchmarks.benchmark_result_summary import BenchmarkResultSummary
from mentat.errors import SampleError
from mentat.llm_api_handler import model_context_size, prompt_tokens
from mentat.python_client.client import PythonClient
from mentat.sampler.sample import Sample
from mentat.sampler.utils import setup_repo
from mentat.session_context import SESSION_CONTEXT
from tests.benchmarks.benchmark_result import BenchmarkResult
from tests.benchmarks.benchmark_result_summary import BenchmarkResultSummary

pytestmark = pytest.mark.benchmark


def dynamic_import(path_to_module, module_name):
@@ -30,11 +30,6 @@ def dynamic_import(path_to_module, module_name):
    return module


@pytest.fixture
def retries(request):
    return int(request.config.getoption("--retries"))


async def grade(to_grade, prompt, model="gpt-4-1106-preview"):
    try:
        messages = [
@@ -257,8 +252,7 @@ def benchmark_listed(title, benchmarks):
    return False


@pytest.mark.asyncio
async def test_benchmark(retries, benchmarks):
async def run_benchmarks(retries, benchmarks):
print("Running benchmarks")
benchmarks_dir = f"{os.path.dirname(__file__)}/benchmarks"

@@ -296,3 +290,14 @@ async def test_benchmark(retries, benchmarks):
with open("results.json", "w") as f:
f.write(summary.to_json())
summary.render_results()


if __name__ == "__main__":
    parser = common_benchmark_parser()
    args = parser.parse_args()
    asyncio.run(
        run_benchmarks(
            args.retries,
            args.benchmarks[0],
        )
    )
40 changes: 20 additions & 20 deletions tests/benchmarks/context_benchmark.py → benchmarks/context_benchmark.py
100644 → 100755
@@ -1,24 +1,24 @@
#!/usr/bin/env python
import asyncio
import json
import os
from collections import defaultdict
from itertools import islice
from pathlib import Path
from typing import Any

import pytest
from git import Repo

from benchmarks.arg_parser import common_benchmark_parser
from mentat.code_context import CodeContext
from mentat.code_feature import CodeFeature, CodeMessageLevel
from mentat.code_feature import CodeFeature
from mentat.code_file_manager import CodeFileManager
from mentat.config import Config
from mentat.interval import Interval
from mentat.llm_api import CostTracker, count_tokens, model_context_size, setup_api_key
from mentat.cost_tracker import CostTracker
from mentat.llm_api_handler import count_tokens, model_context_size
from mentat.sampler.utils import clone_repo
from mentat.session_context import SESSION_CONTEXT, SessionContext

pytestmark = pytest.mark.benchmark


class MockStream:
    def send(self, message, **kwargs):
@@ -29,7 +29,7 @@ def send(self, message, **kwargs):
def _load_benchmarks() -> dict[str, dict[str, Any]]:
    """Load all benchmarks found in benchmark_repos"""
    benchmarks = {}
    benchmarks_dir = Path(__file__).parent / "../../benchmark_repos"
    benchmarks_dir = Path(__file__).parent / "../benchmark_repos"
    for repo_dir in benchmarks_dir.iterdir():
        benchmarks_path = repo_dir / "benchmarks.json"
        if benchmarks_path.exists():
@@ -46,18 +46,9 @@ def _convert_features_to_line_sets(
    for feature in features:
        # Non-explicit features (e.g. CodeMaps) are considered false positives.
        # Using negative numbers here as that affect.
        if feature.level not in (CodeMessageLevel.CODE, CodeMessageLevel.INTERVAL):
            n_lines = len(feature.get_code_message())
            lines[feature.path].update(range(-1, -n_lines - 1, -1))
            continue

        # Otherwise match specific lines
        path = feature.path.relative_to(git_root)
        if feature.level == CodeMessageLevel.INTERVAL:
            interval = feature.interval
        else:
            n_lines = len(feature.get_code_message())
            interval = Interval(1, n_lines + 1)
        interval = feature.interval
        lines[path].update(range(interval.start, interval.end + 1))
    return lines

@@ -129,15 +120,13 @@ async def select_features_for_benchmark(
return {"features": selected_features, "score": selector_performance}


@pytest.mark.asyncio
async def test_code_context_performance(benchmarks, max_benchmarks=10):
    """Run a set of benchmarks and evaluate performance

    Run standalone:
    `pytest -s tests/benchmarks/context_benchmark.py --benchmark`
    `./benchmarks/context_benchmark.py`
    """
    # Load applicable benchmarks
    setup_api_key()
    all_benchmarks = _load_benchmarks()
    if len(benchmarks) > 0:
        benchmarks_to_run = {k: v for k, v in all_benchmarks.items() if k in benchmarks}
@@ -203,3 +192,14 @@ async def test_code_context_performance(benchmarks, max_benchmarks=10):
print(f"Error: '{e}'; skipping")

return scores


if __name__ == "__main__":
    parser = common_benchmark_parser()
    args = parser.parse_args()
    asyncio.run(
        test_code_context_performance(
            args.benchmarks,
            args.max_benchmarks,
        )
    )
31 changes: 17 additions & 14 deletions tests/benchmarks/edit_rubric_benchmark.py → benchmarks/edit_rubric_benchmark.py
100644 → 100755
@@ -1,29 +1,19 @@
#!/usr/bin/env python
import asyncio
import json
import os
import subprocess
from itertools import islice
from pathlib import Path
from textwrap import dedent

import pytest
from git import Repo
from openai import OpenAI

from benchmarks.arg_parser import common_benchmark_parser
from mentat.python_client.client import PythonClient
from mentat.sampler.utils import clone_repo

pytestmark = pytest.mark.benchmark


@pytest.fixture
def evaluate_baseline(request):
    return bool(request.config.getoption("--evaluate_baseline"))


@pytest.fixture
def repo(request):
    return request.config.getoption("--repo")


def load_tests(benchmarks_dir):
    tests = {}
@@ -78,7 +68,6 @@ def evaluate_diff(diff: str) -> dict[str, int]:
    return json.loads(message)


@pytest.mark.asyncio
async def test_edit_quality(
    benchmarks, max_benchmarks, evaluate_baseline, repo, refresh_repo
):
@@ -152,3 +141,17 @@ async def test_edit_quality(
repo.git.clean("-fd")
repo.git.checkout(start_commit)
await client.shutdown()


if __name__ == "__main__":
    parser = common_benchmark_parser()
    args = parser.parse_args()
    asyncio.run(
        test_edit_quality(
            args.benchmarks,
            args.max_benchmarks,
            args.evaluate_baseline,
            args.repo,
            args.refresh_repo,
        )
    )
tests/benchmarks/exercise_runners/abstract_exercise_runner.py → benchmarks/exercise_runners/abstract_exercise_runner.py
@@ -2,7 +2,7 @@
import subprocess
from pathlib import Path

from tests.benchmarks.benchmark_result import BenchmarkResult
from benchmarks.benchmark_result import BenchmarkResult


class AbstractExerciseRunner:
tests/benchmarks/exercise_runners/clojure_exercise_runner.py → benchmarks/exercise_runners/clojure_exercise_runner.py
@@ -1,8 +1,6 @@
from pathlib import Path

from tests.benchmarks.exercise_runners.abstract_exercise_runner import (
    AbstractExerciseRunner,
)
from benchmarks.exercise_runners.abstract_exercise_runner import AbstractExerciseRunner


class ClojureExerciseRunner(AbstractExerciseRunner):
@@ -1,12 +1,8 @@
from tests.benchmarks.exercise_runners.clojure_exercise_runner import (
    ClojureExerciseRunner,
)
from tests.benchmarks.exercise_runners.javascript_exercise_runner import (
from benchmarks.exercise_runners.clojure_exercise_runner import ClojureExerciseRunner
from benchmarks.exercise_runners.javascript_exercise_runner import (
    JavascriptExerciseRunner,
)
from tests.benchmarks.exercise_runners.python_exercise_runner import (
    PythonExerciseRunner,
)
from benchmarks.exercise_runners.python_exercise_runner import PythonExerciseRunner


class ExerciseRunnerFactory:
tests/benchmarks/exercise_runners/javascript_exercise_runner.py → benchmarks/exercise_runners/javascript_exercise_runner.py
@@ -1,9 +1,7 @@
import os
import subprocess

from tests.benchmarks.exercise_runners.abstract_exercise_runner import (
    AbstractExerciseRunner,
)
from benchmarks.exercise_runners.abstract_exercise_runner import AbstractExerciseRunner


class JavascriptExerciseRunner(AbstractExerciseRunner):
tests/benchmarks/exercise_runners/python_exercise_runner.py → benchmarks/exercise_runners/python_exercise_runner.py
@@ -1,8 +1,6 @@
from pathlib import Path

from tests.benchmarks.exercise_runners.abstract_exercise_runner import (
    AbstractExerciseRunner,
)
from benchmarks.exercise_runners.abstract_exercise_runner import AbstractExerciseRunner


class PythonExerciseRunner(AbstractExerciseRunner):