This repository has been archived by the owner on Jan 7, 2025. It is now read-only.

Benchmarks moved out of test dir #472

Merged 2 commits on Jan 11, 2024
26 changes: 26 additions & 0 deletions benchmarks/README.md
@@ -0,0 +1,26 @@
# Benchmarks

This directory contains benchmarks that measure Mentat's performance on different tasks.

## Running Exercism Benchmarks

```
./benchmarks/exercism_practice.py
```

Flags that control how the benchmarks run are defined in [arg_parser.py](arg_parser.py) and default to conservative values, so benchmarks run without any flags are relatively quick and cheap. To run the Exercism benchmark with multiple workers on all of the Clojure exercises, allowing one retry, run the following:
```
./benchmarks/exercism_practice.py --max_benchmarks 134 --max_iterations 2 --max_workers 2 --language clojure
```

Warning: if you increase `max_workers` much further, you'll start to hit rate limits.

## Running Real World Benchmarks

```
./benchmarks/benchmark_runner.py
```
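The shared flags from [arg_parser.py](arg_parser.py) apply here as well; for example (the benchmark name and retry count below are illustrative, not defaults):
```
./benchmarks/benchmark_runner.py --benchmarks <benchmark_name> --retries 2
```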

## Making Real World Benchmarks

Real world benchmarks can be either [samples](benchmarks/mentat/sample_15223222005645d08b81f093e51d52fe.json) or [Python files](benchmarks/mentat/).
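
For the Python-file flavor, a minimal sketch of what a benchmark module might contain is shown below. This is an illustration only: the file name `my_benchmark.py` and the attribute names (`title`, `prompts`, `verify`) are assumptions, not the schema the runner enforces, so check what `benchmark_runner.py` reads from each module after `dynamic_import` before writing one.
```
# benchmarks/benchmarks/mentat/my_benchmark.py -- hypothetical example; the
# file name and attribute names here are assumptions, not the actual schema.
title = "My benchmark"                        # human-readable name for reports
prompts = ["Refactor foo() to use pathlib."]  # what to ask Mentat to do


def verify():
    # Return True if the edited repo passes whatever check this benchmark cares about.
    return True
```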
File renamed without changes.
65 changes: 65 additions & 0 deletions benchmarks/arg_parser.py
@@ -0,0 +1,65 @@
import argparse


def common_benchmark_parser():
    parser = argparse.ArgumentParser(description="Run exercism benchmarks")
    parser.add_argument(
        "--refresh_repo",
        action="store_true",
        default=False,
        help="When set local changes will be discarded.",
    )
    parser.add_argument(
        "--language",
        default="python",
        type=str,
        help="Which exercism language to do exercises for",
    )
    parser.add_argument(
        "--benchmarks",
        action="append",
        nargs="*",
        default=[[]],
        help=(
            "Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
            " depends on benchmark."
        ),
    )
    parser.add_argument(
        "--max_benchmarks",
        default=1,
        type=int,
        help="The maximum number of exercises to run",
    )
    parser.add_argument(
        "--max_iterations",
        default=1,
        type=int,
        help="Number of times to rerun mentat with error messages",
    )
    parser.add_argument(
        "--max_workers",
        default=1,
        type=int,
        help="Number of workers to use for multiprocessing",
    )
    parser.add_argument(
        "--retries",
        action="store",
        default=1,
        type=int,
        help="Number of times to retry a benchmark",
    )
    parser.add_argument(
        "--repo",
        action="store",
        default="mentat",
        help="For benchmarks that are evaluated against a repo",
    )
    parser.add_argument(
        "--evaluate_baseline",
        action="store_true",
        help="Evaluate the baseline for the benchmark",
    )

    return parser
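
For reference, a minimal sketch of how a benchmark script consumes this parser; the final print line is illustrative only, while the real runner scripts (see the new `__main__` blocks below) pass the parsed values into their async entry points:
```
from benchmarks.arg_parser import common_benchmark_parser

# Parse the shared benchmark flags, e.g.
#   ./benchmarks/exercism_practice.py --language clojure --max_benchmarks 10
args = common_benchmark_parser().parse_args()
print(args.language, args.max_benchmarks, args.max_workers, args.retries)
```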
tests/benchmarks/benchmark_result_summary.py → benchmarks/benchmark_result_summary.py
@@ -6,7 +6,7 @@
import attr
from jinja2 import Environment, FileSystemLoader, select_autoescape

from tests.benchmarks.benchmark_result import BenchmarkResult
from benchmarks.benchmark_result import BenchmarkResult


class BenchmarkResultSummary:
@@ -124,9 +124,7 @@ def summary_string(self) -> str:
    def render_results(self):
        env = Environment(
            loader=FileSystemLoader(
                os.path.join(
                    os.path.dirname(__file__), "../../mentat/resources/templates"
                )
                os.path.join(os.path.dirname(__file__), "../mentat/resources/templates")
            ),
            autoescape=select_autoescape(["html", "xml"]),
        )
29 changes: 17 additions & 12 deletions tests/benchmarks/benchmark_runner.py → benchmarks/benchmark_runner.py
100644 → 100755
@@ -1,26 +1,26 @@
#!/usr/bin/env python
import asyncio
import importlib.util
import json
import os
import re
from pathlib import Path

import pytest
from openai.types.chat import (
    ChatCompletionAssistantMessageParam,
    ChatCompletionUserMessageParam,
)
from openai.types.chat.completion_create_params import ResponseFormat

from benchmarks.arg_parser import common_benchmark_parser
from benchmarks.benchmark_result import BenchmarkResult
from benchmarks.benchmark_result_summary import BenchmarkResultSummary
from mentat.errors import SampleError
from mentat.llm_api_handler import model_context_size, prompt_tokens
from mentat.python_client.client import PythonClient
from mentat.sampler.sample import Sample
from mentat.sampler.utils import setup_repo
from mentat.session_context import SESSION_CONTEXT
from tests.benchmarks.benchmark_result import BenchmarkResult
from tests.benchmarks.benchmark_result_summary import BenchmarkResultSummary

pytestmark = pytest.mark.benchmark


def dynamic_import(path_to_module, module_name):
@@ -30,11 +30,6 @@ def dynamic_import(path_to_module, module_name):
    return module


@pytest.fixture
def retries(request):
    return int(request.config.getoption("--retries"))


async def grade(to_grade, prompt, model="gpt-4-1106-preview"):
    try:
        messages = [
@@ -257,8 +252,7 @@ def benchmark_listed(title, benchmarks):
    return False


@pytest.mark.asyncio
async def test_benchmark(retries, benchmarks):
async def run_benchmarks(retries, benchmarks):
print("Running benchmarks")
benchmarks_dir = f"{os.path.dirname(__file__)}/benchmarks"

@@ -296,3 +290,14 @@ async def test_benchmark(retries, benchmarks):
with open("results.json", "w") as f:
f.write(summary.to_json())
summary.render_results()


if __name__ == "__main__":
    parser = common_benchmark_parser()
    args = parser.parse_args()
    asyncio.run(
        run_benchmarks(
            args.retries,
            args.benchmarks[0],
        )
    )
40 changes: 20 additions & 20 deletions tests/benchmarks/context_benchmark.py → benchmarks/context_benchmark.py
100644 → 100755
@@ -1,24 +1,24 @@
#!/usr/bin/env python
import asyncio
import json
import os
from collections import defaultdict
from itertools import islice
from pathlib import Path
from typing import Any

import pytest
from git import Repo

from benchmarks.arg_parser import common_benchmark_parser
from mentat.code_context import CodeContext
from mentat.code_feature import CodeFeature, CodeMessageLevel
from mentat.code_feature import CodeFeature
from mentat.code_file_manager import CodeFileManager
from mentat.config import Config
from mentat.interval import Interval
from mentat.llm_api import CostTracker, count_tokens, model_context_size, setup_api_key
from mentat.cost_tracker import CostTracker
from mentat.llm_api_handler import count_tokens, model_context_size
from mentat.sampler.utils import clone_repo
from mentat.session_context import SESSION_CONTEXT, SessionContext

pytestmark = pytest.mark.benchmark


class MockStream:
    def send(self, message, **kwargs):
@@ -29,7 +29,7 @@ def send(self, message, **kwargs):
def _load_benchmarks() -> dict[str, dict[str, Any]]:
    """Load all benchmarks found in benchmark_repos"""
    benchmarks = {}
    benchmarks_dir = Path(__file__).parent / "../../benchmark_repos"
    benchmarks_dir = Path(__file__).parent / "../benchmark_repos"
    for repo_dir in benchmarks_dir.iterdir():
        benchmarks_path = repo_dir / "benchmarks.json"
        if benchmarks_path.exists():
@@ -46,18 +46,9 @@ def _convert_features_to_line_sets(
    for feature in features:
        # Non-explicit features (e.g. CodeMaps) are considered false positives.
        # Using negative numbers here as that affect.
        if feature.level not in (CodeMessageLevel.CODE, CodeMessageLevel.INTERVAL):
            n_lines = len(feature.get_code_message())
            lines[feature.path].update(range(-1, -n_lines - 1, -1))
            continue

        # Otherwise match specific lines
        path = feature.path.relative_to(git_root)
        if feature.level == CodeMessageLevel.INTERVAL:
            interval = feature.interval
        else:
            n_lines = len(feature.get_code_message())
            interval = Interval(1, n_lines + 1)
        interval = feature.interval
        lines[path].update(range(interval.start, interval.end + 1))
    return lines

@@ -129,15 +120,13 @@ async def select_features_for_benchmark(
return {"features": selected_features, "score": selector_performance}


@pytest.mark.asyncio
async def test_code_context_performance(benchmarks, max_benchmarks=10):
    """Run a set of benchmarks and evaluate performance

    Run standalone:
    `pytest -s tests/benchmarks/context_benchmark.py --benchmark`
    `./benchmarks/context_benchmark.py`
    """
    # Load applicable benchmarks
    setup_api_key()
    all_benchmarks = _load_benchmarks()
    if len(benchmarks) > 0:
        benchmarks_to_run = {k: v for k, v in all_benchmarks.items() if k in benchmarks}
@@ -203,3 +192,14 @@ async def test_code_context_performance(benchmarks, max_benchmarks=10):
print(f"Error: '{e}'; skipping")

return scores


if __name__ == "__main__":
    parser = common_benchmark_parser()
    args = parser.parse_args()
    asyncio.run(
        test_code_context_performance(
            args.benchmarks,
            args.max_benchmarks,
        )
    )
31 changes: 17 additions & 14 deletions tests/benchmarks/edit_rubric_benchmark.py → benchmarks/edit_rubric_benchmark.py
100644 → 100755
@@ -1,29 +1,19 @@
#!/usr/bin/env python
import asyncio
import json
import os
import subprocess
from itertools import islice
from pathlib import Path
from textwrap import dedent

import pytest
from git import Repo
from openai import OpenAI

from benchmarks.arg_parser import common_benchmark_parser
from mentat.python_client.client import PythonClient
from mentat.sampler.utils import clone_repo

pytestmark = pytest.mark.benchmark


@pytest.fixture
def evaluate_baseline(request):
    return bool(request.config.getoption("--evaluate_baseline"))


@pytest.fixture
def repo(request):
    return request.config.getoption("--repo")


def load_tests(benchmarks_dir):
    tests = {}
@@ -78,7 +68,6 @@ def evaluate_diff(diff: str) -> dict[str, int]:
    return json.loads(message)


@pytest.mark.asyncio
async def test_edit_quality(
    benchmarks, max_benchmarks, evaluate_baseline, repo, refresh_repo
):
@@ -152,3 +141,17 @@ async def test_edit_quality(
repo.git.clean("-fd")
repo.git.checkout(start_commit)
await client.shutdown()


if __name__ == "__main__":
    parser = common_benchmark_parser()
    args = parser.parse_args()
    asyncio.run(
        test_edit_quality(
            args.benchmarks,
            args.max_benchmarks,
            args.evaluate_baseline,
            args.repo,
            args.refresh_repo,
        )
    )
tests/benchmarks/exercise_runners/abstract_exercise_runner.py → benchmarks/exercise_runners/abstract_exercise_runner.py
@@ -2,7 +2,7 @@
import subprocess
from pathlib import Path

from tests.benchmarks.benchmark_result import BenchmarkResult
from benchmarks.benchmark_result import BenchmarkResult


class AbstractExerciseRunner:
tests/benchmarks/exercise_runners/clojure_exercise_runner.py → benchmarks/exercise_runners/clojure_exercise_runner.py
@@ -1,8 +1,6 @@
from pathlib import Path

from tests.benchmarks.exercise_runners.abstract_exercise_runner import (
    AbstractExerciseRunner,
)
from benchmarks.exercise_runners.abstract_exercise_runner import AbstractExerciseRunner


class ClojureExerciseRunner(AbstractExerciseRunner):
@@ -1,12 +1,8 @@
from tests.benchmarks.exercise_runners.clojure_exercise_runner import (
    ClojureExerciseRunner,
)
from tests.benchmarks.exercise_runners.javascript_exercise_runner import (
from benchmarks.exercise_runners.clojure_exercise_runner import ClojureExerciseRunner
from benchmarks.exercise_runners.javascript_exercise_runner import (
    JavascriptExerciseRunner,
)
from tests.benchmarks.exercise_runners.python_exercise_runner import (
    PythonExerciseRunner,
)
from benchmarks.exercise_runners.python_exercise_runner import PythonExerciseRunner


class ExerciseRunnerFactory:
tests/benchmarks/exercise_runners/javascript_exercise_runner.py → benchmarks/exercise_runners/javascript_exercise_runner.py
@@ -1,9 +1,7 @@
import os
import subprocess

from tests.benchmarks.exercise_runners.abstract_exercise_runner import (
    AbstractExerciseRunner,
)
from benchmarks.exercise_runners.abstract_exercise_runner import AbstractExerciseRunner


class JavascriptExerciseRunner(AbstractExerciseRunner):
tests/benchmarks/exercise_runners/python_exercise_runner.py → benchmarks/exercise_runners/python_exercise_runner.py
@@ -1,8 +1,6 @@
from pathlib import Path

from tests.benchmarks.exercise_runners.abstract_exercise_runner import (
    AbstractExerciseRunner,
)
from benchmarks.exercise_runners.abstract_exercise_runner import AbstractExerciseRunner


class PythonExerciseRunner(AbstractExerciseRunner):