diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000..66489470a
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,26 @@
+# Benchmarks
+
+This directory contains benchmarks that measure Mentat's performance on different tasks.
+
+## Running Exercism Benchmarks
+
+```
+./benchmarks/exercism_practice.py
+```
+
+Flags that control how the benchmarks run are defined in [arg_parser.py](arg_parser.py) and default to conservative values, so a run without any flags is relatively quick and cheap. To run the exercism benchmarks on all of the Clojure exercises, with two workers and one extra iteration, run the following:
+```
+./benchmarks/exercism_practice.py --max_benchmarks 134 --max_iterations 2 --max_workers 2 --language clojure
+```
+
+Warning: if you increase `max_workers` much beyond this you'll start to get rate limited.
+
+## Running Real World Benchmarks
+
+```
+./benchmarks/benchmark_runner.py
+```
+
+## Making Real World Benchmarks
+
+Real world benchmarks can be either [samples](benchmarks/mentat/sample_15223222005645d08b81f093e51d52fe.json) or [python files](benchmarks/mentat/).
diff --git a/tests/benchmarks/__init__.py b/benchmarks/__init__.py
similarity index 100%
rename from tests/benchmarks/__init__.py
rename to benchmarks/__init__.py
diff --git a/benchmarks/arg_parser.py b/benchmarks/arg_parser.py
new file mode 100644
index 000000000..2921bf487
--- /dev/null
+++ b/benchmarks/arg_parser.py
@@ -0,0 +1,65 @@
+import argparse
+
+
+def common_benchmark_parser():
+    parser = argparse.ArgumentParser(description="Run exercism benchmarks")
+    parser.add_argument(
+        "--refresh_repo",
+        action="store_true",
+        default=False,
+        help="When set, local changes will be discarded.",
+    )
+    parser.add_argument(
+        "--language",
+        default="python",
+        type=str,
+        help="Which exercism language to do exercises for",
+    )
+    parser.add_argument(
+        "--benchmarks",
+        action="append",
+        nargs="*",
+        default=[[]],
+        help=(
+            "Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
+            " depends on benchmark."
+        ),
+    )
+    parser.add_argument(
+        "--max_benchmarks",
+        default=1,
+        type=int,
+        help="The maximum number of exercises to run",
+    )
+    parser.add_argument(
+        "--max_iterations",
+        default=1,
+        type=int,
+        help="Number of times to rerun mentat with error messages",
+    )
+    parser.add_argument(
+        "--max_workers",
+        default=1,
+        type=int,
+        help="Number of workers to use for multiprocessing",
+    )
+    parser.add_argument(
+        "--retries",
+        action="store",
+        default=1,
+        type=int,
+        help="Number of times to retry a benchmark",
+    )
+    parser.add_argument(
+        "--repo",
+        action="store",
+        default="mentat",
+        help="For benchmarks that are evaluated against a repo",
+    )
+    parser.add_argument(
+        "--evaluate_baseline",
+        action="store_true",
+        help="Evaluate the baseline for the benchmark",
+    )
+
+    return parser
diff --git a/tests/benchmarks/benchmark_result.py b/benchmarks/benchmark_result.py
similarity index 100%
rename from tests/benchmarks/benchmark_result.py
rename to benchmarks/benchmark_result.py
diff --git a/tests/benchmarks/benchmark_result_summary.py b/benchmarks/benchmark_result_summary.py
similarity index 96%
rename from tests/benchmarks/benchmark_result_summary.py
rename to benchmarks/benchmark_result_summary.py
index 9dfb24fc1..5e02ed584 100644
--- a/tests/benchmarks/benchmark_result_summary.py
+++ b/benchmarks/benchmark_result_summary.py
@@ -6,7 +6,7 @@
 import attr
 from jinja2 import Environment, FileSystemLoader, select_autoescape
 
-from tests.benchmarks.benchmark_result import BenchmarkResult
+from benchmarks.benchmark_result import BenchmarkResult
 
 
 class BenchmarkResultSummary:
@@ -124,9 +124,7 @@ def summary_string(self) -> str:
     def render_results(self):
         env = Environment(
             loader=FileSystemLoader(
-                os.path.join(
-                    os.path.dirname(__file__), "../../mentat/resources/templates"
-                )
+                os.path.join(os.path.dirname(__file__), "../mentat/resources/templates")
             ),
             autoescape=select_autoescape(["html", "xml"]),
         )
diff --git a/tests/benchmarks/benchmark_runner.py b/benchmarks/benchmark_runner.py
old mode 100644
new mode 100755
similarity index 95%
rename from tests/benchmarks/benchmark_runner.py
rename to benchmarks/benchmark_runner.py
index 1d10be6ef..b53250249
--- a/tests/benchmarks/benchmark_runner.py
+++ b/benchmarks/benchmark_runner.py
@@ -1,26 +1,26 @@
+#!/usr/bin/env python
+import asyncio
 import importlib.util
 import json
 import os
 import re
 from pathlib import Path
 
-import pytest
 from openai.types.chat import (
     ChatCompletionAssistantMessageParam,
     ChatCompletionUserMessageParam,
 )
 from openai.types.chat.completion_create_params import ResponseFormat
 
+from benchmarks.arg_parser import common_benchmark_parser
+from benchmarks.benchmark_result import BenchmarkResult
+from benchmarks.benchmark_result_summary import BenchmarkResultSummary
 from mentat.errors import SampleError
 from mentat.llm_api_handler import model_context_size, prompt_tokens
 from mentat.python_client.client import PythonClient
 from mentat.sampler.sample import Sample
 from mentat.sampler.utils import setup_repo
 from mentat.session_context import SESSION_CONTEXT
-from tests.benchmarks.benchmark_result import BenchmarkResult
-from tests.benchmarks.benchmark_result_summary import BenchmarkResultSummary
-
-pytestmark = pytest.mark.benchmark
 
 
 def dynamic_import(path_to_module, module_name):
@@ -30,11 +30,6 @@ def dynamic_import(path_to_module, module_name):
     return module
 
 
-@pytest.fixture
-def retries(request):
-    return int(request.config.getoption("--retries"))
-
-
 async def grade(to_grade, prompt, model="gpt-4-1106-preview"):
     try:
         messages = [
@@ -257,8 +252,7 @@ def benchmark_listed(title, benchmarks):
     return False
 
 
-@pytest.mark.asyncio
-async def test_benchmark(retries, benchmarks):
+async def run_benchmarks(retries, benchmarks):
     print("Running benchmarks")
     benchmarks_dir = f"{os.path.dirname(__file__)}/benchmarks"
 
@@ -296,3 +290,14 @@ async def test_benchmark(retries, benchmarks):
     with open("results.json", "w") as f:
         f.write(summary.to_json())
     summary.render_results()
+
+
+if __name__ == "__main__":
+    parser = common_benchmark_parser()
+    args = parser.parse_args()
+    asyncio.run(
+        run_benchmarks(
+            args.retries,
+            args.benchmarks[0],
+        )
+    )
diff --git a/tests/benchmarks/benchmarks/mentat/clojure_exercism_runner.py b/benchmarks/benchmarks/mentat/clojure_exercism_runner.py
similarity index 100%
rename from tests/benchmarks/benchmarks/mentat/clojure_exercism_runner.py
rename to benchmarks/benchmarks/mentat/clojure_exercism_runner.py
diff --git a/tests/benchmarks/benchmarks/mentat/license_update.py b/benchmarks/benchmarks/mentat/license_update.py
similarity index 100%
rename from tests/benchmarks/benchmarks/mentat/license_update.py
rename to benchmarks/benchmarks/mentat/license_update.py
diff --git a/tests/benchmarks/benchmarks/mentat/pre_tags.py b/benchmarks/benchmarks/mentat/pre_tags.py
similarity index 100%
rename from tests/benchmarks/benchmarks/mentat/pre_tags.py
rename to benchmarks/benchmarks/mentat/pre_tags.py
diff --git a/tests/benchmarks/benchmarks/mentat/sample_15223222005645d08b81f093e51d52fe.json b/benchmarks/benchmarks/mentat/sample_15223222005645d08b81f093e51d52fe.json
similarity index 100%
rename from tests/benchmarks/benchmarks/mentat/sample_15223222005645d08b81f093e51d52fe.json
rename to benchmarks/benchmarks/mentat/sample_15223222005645d08b81f093e51d52fe.json
diff --git a/tests/benchmarks/context_benchmark.py b/benchmarks/context_benchmark.py
old mode 100644
new mode 100755
similarity index 87%
rename from tests/benchmarks/context_benchmark.py
rename to benchmarks/context_benchmark.py
index ef67d0792..191085f43
--- a/tests/benchmarks/context_benchmark.py
+++ b/benchmarks/context_benchmark.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+import asyncio
 import json
 import os
 from collections import defaultdict
@@ -5,20 +7,18 @@
 from pathlib import Path
 from typing import Any
 
-import pytest
 from git import Repo
 
+from benchmarks.arg_parser import common_benchmark_parser
 from mentat.code_context import CodeContext
-from mentat.code_feature import CodeFeature, CodeMessageLevel
+from mentat.code_feature import CodeFeature
 from mentat.code_file_manager import CodeFileManager
 from mentat.config import Config
-from mentat.interval import Interval
-from mentat.llm_api import CostTracker, count_tokens, model_context_size, setup_api_key
+from mentat.cost_tracker import CostTracker
+from mentat.llm_api_handler import count_tokens, model_context_size
 from mentat.sampler.utils import clone_repo
 from mentat.session_context import SESSION_CONTEXT, SessionContext
 
-pytestmark = pytest.mark.benchmark
-
 
 class MockStream:
     def send(self, message, **kwargs):
@@ -29,7 +29,7 @@ def send(self, message, **kwargs):
 def _load_benchmarks() -> dict[str, dict[str, Any]]:
     """Load all benchmarks found in benchmark_repos"""
     benchmarks = {}
-    benchmarks_dir = Path(__file__).parent / "../../benchmark_repos"
+    benchmarks_dir = Path(__file__).parent / "../benchmark_repos"
     for repo_dir in benchmarks_dir.iterdir():
         benchmarks_path = repo_dir / "benchmarks.json"
         if benchmarks_path.exists():
@@ -46,18 +46,9 @@ def _convert_features_to_line_sets(
     for feature in features:
         # Non-explicit features (e.g. CodeMaps) are considered false positives.
         # Using negative numbers here as that affect.
-        if feature.level not in (CodeMessageLevel.CODE, CodeMessageLevel.INTERVAL):
-            n_lines = len(feature.get_code_message())
-            lines[feature.path].update(range(-1, -n_lines - 1, -1))
-            continue
-        # Otherwise match specific lines
         path = feature.path.relative_to(git_root)
-        if feature.level == CodeMessageLevel.INTERVAL:
-            interval = feature.interval
-        else:
-            n_lines = len(feature.get_code_message())
-            interval = Interval(1, n_lines + 1)
+        interval = feature.interval
         lines[path].update(range(interval.start, interval.end + 1))
     return lines
 
 
@@ -129,15 +120,13 @@ async def select_features_for_benchmark(
     return {"features": selected_features, "score": selector_performance}
 
 
-@pytest.mark.asyncio
 async def test_code_context_performance(benchmarks, max_benchmarks=10):
     """Run a set of benchmarks and evaluate performance
 
     Run standalone:
-        `pytest -s tests/benchmarks/context_benchmark.py --benchmark`
+        `./benchmarks/context_benchmark.py`
     """
     # Load applicable benchmarks
-    setup_api_key()
    all_benchmarks = _load_benchmarks()
     if len(benchmarks) > 0:
         benchmarks_to_run = {k: v for k, v in all_benchmarks.items() if k in benchmarks}
@@ -203,3 +192,14 @@ async def test_code_context_performance(benchmarks, max_benchmarks=10):
             print(f"Error: '{e}'; skipping")
 
     return scores
+
+
+if __name__ == "__main__":
+    parser = common_benchmark_parser()
+    args = parser.parse_args()
+    asyncio.run(
+        test_code_context_performance(
+            args.benchmarks,
+            args.max_benchmarks,
+        )
+    )
diff --git a/tests/benchmarks/edit_rubric_benchmark.py b/benchmarks/edit_rubric_benchmark.py
old mode 100644
new mode 100755
similarity index 91%
rename from tests/benchmarks/edit_rubric_benchmark.py
rename to benchmarks/edit_rubric_benchmark.py
index 3f37aae31..c283f7ec7
--- a/tests/benchmarks/edit_rubric_benchmark.py
+++ b/benchmarks/edit_rubric_benchmark.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+import asyncio
 import json
 import os
 import subprocess
@@ -5,25 +7,13 @@
 from pathlib import Path
 from textwrap import dedent
 
-import pytest
 from git import Repo
 from openai import OpenAI
 
+from benchmarks.arg_parser import common_benchmark_parser
 from mentat.python_client.client import PythonClient
 from mentat.sampler.utils import clone_repo
 
-pytestmark = pytest.mark.benchmark
-
-
-@pytest.fixture
-def evaluate_baseline(request):
-    return bool(request.config.getoption("--evaluate_baseline"))
-
-
-@pytest.fixture
-def repo(request):
-    return request.config.getoption("--repo")
-
 
 def load_tests(benchmarks_dir):
     tests = {}
@@ -78,7 +68,6 @@ def evaluate_diff(diff: str) -> dict[str, int]:
     return json.loads(message)
 
 
-@pytest.mark.asyncio
 async def test_edit_quality(
     benchmarks, max_benchmarks, evaluate_baseline, repo, refresh_repo
 ):
@@ -152,3 +141,17 @@ async def test_edit_quality(
         repo.git.clean("-fd")
         repo.git.checkout(start_commit)
         await client.shutdown()
+
+
+if __name__ == "__main__":
+    parser = common_benchmark_parser()
+    args = parser.parse_args()
+    asyncio.run(
+        test_edit_quality(
+            args.benchmarks,
+            args.max_benchmarks,
+            args.evaluate_baseline,
+            args.repo,
+            args.refresh_repo,
+        )
+    )
diff --git a/tests/benchmarks/exercise_runners/__init__.py b/benchmarks/exercise_runners/__init__.py
similarity index 100%
rename from tests/benchmarks/exercise_runners/__init__.py
rename to benchmarks/exercise_runners/__init__.py
diff --git a/tests/benchmarks/exercise_runners/abstract_exercise_runner.py b/benchmarks/exercise_runners/abstract_exercise_runner.py
similarity index 97%
rename from tests/benchmarks/exercise_runners/abstract_exercise_runner.py
rename to benchmarks/exercise_runners/abstract_exercise_runner.py
index c6ea1f785..3c1ebc1ac 100644
--- a/tests/benchmarks/exercise_runners/abstract_exercise_runner.py
+++ b/benchmarks/exercise_runners/abstract_exercise_runner.py
@@ -2,7 +2,7 @@
 import subprocess
 from pathlib import Path
 
-from tests.benchmarks.benchmark_result import BenchmarkResult
+from benchmarks.benchmark_result import BenchmarkResult
 
 
 class AbstractExerciseRunner:
diff --git a/tests/benchmarks/exercise_runners/clojure_exercise_runner.py b/benchmarks/exercise_runners/clojure_exercise_runner.py
similarity index 86%
rename from tests/benchmarks/exercise_runners/clojure_exercise_runner.py
rename to benchmarks/exercise_runners/clojure_exercise_runner.py
index c645aef68..80a994dd0 100644
--- a/tests/benchmarks/exercise_runners/clojure_exercise_runner.py
+++ b/benchmarks/exercise_runners/clojure_exercise_runner.py
@@ -1,8 +1,6 @@
 from pathlib import Path
 
-from tests.benchmarks.exercise_runners.abstract_exercise_runner import (
-    AbstractExerciseRunner,
-)
+from benchmarks.exercise_runners.abstract_exercise_runner import AbstractExerciseRunner
 
 
 class ClojureExerciseRunner(AbstractExerciseRunner):
diff --git a/tests/benchmarks/exercise_runners/exercise_runner_factory.py b/benchmarks/exercise_runners/exercise_runner_factory.py
similarity index 53%
rename from tests/benchmarks/exercise_runners/exercise_runner_factory.py
rename to benchmarks/exercise_runners/exercise_runner_factory.py
index f72b9aeaf..59afa2a67 100644
--- a/tests/benchmarks/exercise_runners/exercise_runner_factory.py
+++ b/benchmarks/exercise_runners/exercise_runner_factory.py
@@ -1,12 +1,8 @@
-from tests.benchmarks.exercise_runners.clojure_exercise_runner import (
-    ClojureExerciseRunner,
-)
-from tests.benchmarks.exercise_runners.javascript_exercise_runner import (
+from benchmarks.exercise_runners.clojure_exercise_runner import ClojureExerciseRunner
+from benchmarks.exercise_runners.javascript_exercise_runner import (
     JavascriptExerciseRunner,
 )
-from tests.benchmarks.exercise_runners.python_exercise_runner import (
-    PythonExerciseRunner,
-)
+from benchmarks.exercise_runners.python_exercise_runner import PythonExerciseRunner
 
 
 class ExerciseRunnerFactory:
diff --git a/tests/benchmarks/exercise_runners/javascript_exercise_runner.py b/benchmarks/exercise_runners/javascript_exercise_runner.py
similarity index 89%
rename from tests/benchmarks/exercise_runners/javascript_exercise_runner.py
rename to benchmarks/exercise_runners/javascript_exercise_runner.py
index aa32fcc20..5dec86a01 100644
--- a/tests/benchmarks/exercise_runners/javascript_exercise_runner.py
+++ b/benchmarks/exercise_runners/javascript_exercise_runner.py
@@ -1,9 +1,7 @@
 import os
 import subprocess
 
-from tests.benchmarks.exercise_runners.abstract_exercise_runner import (
-    AbstractExerciseRunner,
-)
+from benchmarks.exercise_runners.abstract_exercise_runner import AbstractExerciseRunner
 
 
 class JavascriptExerciseRunner(AbstractExerciseRunner):
diff --git a/tests/benchmarks/exercise_runners/python_exercise_runner.py b/benchmarks/exercise_runners/python_exercise_runner.py
similarity index 85%
rename from tests/benchmarks/exercise_runners/python_exercise_runner.py
rename to benchmarks/exercise_runners/python_exercise_runner.py
index 8c2ce958c..742458823 100644
--- a/tests/benchmarks/exercise_runners/python_exercise_runner.py
+++ b/benchmarks/exercise_runners/python_exercise_runner.py
@@ -1,8 +1,6 @@
 from pathlib import Path
 
-from tests.benchmarks.exercise_runners.abstract_exercise_runner import (
-    AbstractExerciseRunner,
-)
+from benchmarks.exercise_runners.abstract_exercise_runner import AbstractExerciseRunner
 
 
 class PythonExerciseRunner(AbstractExerciseRunner):
diff --git a/tests/benchmarks/exercism_practice.py b/benchmarks/exercism_practice.py
old mode 100644
new mode 100755
similarity index 92%
rename from tests/benchmarks/exercism_practice.py
rename to benchmarks/exercism_practice.py
index e3df91e9e..5a0407b1f
--- a/tests/benchmarks/exercism_practice.py
+++ b/benchmarks/exercism_practice.py
@@ -1,48 +1,29 @@
+#!/usr/bin/env python
 import asyncio
 import os
 from functools import partial
 from multiprocessing import Pool
 from pathlib import Path
 
-import pytest
 import tqdm
 from openai import BadRequestError
 
+from benchmarks.arg_parser import common_benchmark_parser
+from benchmarks.benchmark_result import BenchmarkResult
+from benchmarks.benchmark_result_summary import BenchmarkResultSummary
+from benchmarks.exercise_runners.exercise_runner_factory import ExerciseRunnerFactory
 from mentat.config import Config
 from mentat.python_client.client import PythonClient
 from mentat.sampler.utils import clone_repo
 from mentat.session_context import SESSION_CONTEXT
-from tests.benchmarks.benchmark_result import BenchmarkResult
-from tests.benchmarks.benchmark_result_summary import BenchmarkResultSummary
-from tests.benchmarks.exercise_runners.exercise_runner_factory import (
-    ExerciseRunnerFactory,
-)
-
-pytestmark = pytest.mark.benchmark
 
 
-@pytest.fixture
 def clone_exercism_repo(refresh_repo, language):
     exercism_url = f"https://github.com/exercism/{language}.git"
     local_dir = clone_repo(exercism_url, f"exercism-{language}", refresh_repo)
     os.chdir(local_dir)
 
 
-@pytest.fixture
-def max_iterations(request):
-    return int(request.config.getoption("--max_iterations"))
-
-
-@pytest.fixture
-def max_workers(request):
-    return int(request.config.getoption("--max_workers"))
-
-
-@pytest.fixture
-def language(request):
-    return request.config.getoption("--language")
-
-
 prompt = (
     "You are a professional code reviewer who helps other coders improve their skills."
" You recently assigned a coder a small coding test to assess their level, with a" @@ -200,8 +181,7 @@ def tqdm_summary(results): return "Passed: " + str(passed_in_n)[1:-1] + "| Failed: " + str(failed) -def test_practice_directory_performance( - clone_exercism_repo, +def run_exercism_benchmark( benchmarks, max_benchmarks, max_iterations, @@ -237,3 +217,16 @@ def test_practice_directory_performance( with open("results.json", "w") as f: f.write(summary.to_json()) summary.render_results() + + +if __name__ == "__main__": + parser = common_benchmark_parser() + args = parser.parse_args() + clone_exercism_repo(args.refresh_repo, args.language) + run_exercism_benchmark( + args.benchmarks, + args.max_benchmarks, + args.max_iterations, + args.max_workers, + args.language, + ) diff --git a/scripts/evolve_llm_feature_selector.py b/scripts/evolve_llm_feature_selector.py index b8a139c13..24adf9a05 100644 --- a/scripts/evolve_llm_feature_selector.py +++ b/scripts/evolve_llm_feature_selector.py @@ -1,3 +1,4 @@ +# ruff: noqa: E501 import argparse import asyncio import json @@ -8,9 +9,9 @@ from openai import AsyncOpenAI +from benchmarks.context_benchmark import test_code_context_performance from mentat.errors import ModelError from mentat.prompts.prompts import read_prompt -from tests.benchmarks.context_benchmark import test_code_context_performance prompts_dir = Path(__file__).parent.parent / "mentat/resources/prompts" @@ -59,7 +60,7 @@ async def generate_variations( 3. To also identify relevant context to the query, such as the type-definitions of variables which will be edited, or functions which would be directly affected by the edits. \ 4. To NOT select irrelevant files or lines of code. \ 5. It's critical respond to this with a JSON-parsable list of strings (one for each prompt). \ - """).format(population=population) # ruff: noqa: E501 + """).format(population=population) scores = [(prompt, recall_weighted_mean(scores[prompt])) for prompt in scores] top_scores = sorted(scores, key=lambda x: x[1], reverse=True)[:population] messages = [ diff --git a/scripts/git_log_to_transcripts.py b/scripts/git_log_to_transcripts.py index cdc7ea2a2..f3cf5e1c9 100755 --- a/scripts/git_log_to_transcripts.py +++ b/scripts/git_log_to_transcripts.py @@ -11,6 +11,7 @@ from git import Repo from openai import OpenAI +from benchmarks.context_benchmark import MockStream, select_features_for_benchmark from mentat.code_context import CodeContext from mentat.code_file_manager import CodeFileManager from mentat.config import Config @@ -18,7 +19,6 @@ from mentat.parsers.git_parser import GitParser from mentat.sampler.utils import clone_repo from mentat.session_context import SESSION_CONTEXT, SessionContext -from tests.benchmarks.context_benchmark import MockStream, select_features_for_benchmark system_prompt = dedent("""\ You are part of an automated system for making synthetic data. 
diff --git a/scripts/run_and_upload_benchmarks.sh b/scripts/run_and_upload_benchmarks.sh
index 6a7d86aee..314ecb498 100755
--- a/scripts/run_and_upload_benchmarks.sh
+++ b/scripts/run_and_upload_benchmarks.sh
@@ -5,7 +5,7 @@ TIMESTAMP=$(date +%Y%m%d%H%M%S)
 #####################
 # JAVASCRIPT EXERCISM
 #####################
-pytest -s tests/benchmarks/exercism_practice.py \
+./benchmarks/exercism_practice.py \
     --max_iterations 2 \
     --max_workers 1 \
     --max_benchmarks 200 \
@@ -26,7 +26,7 @@ curl -X POST -H "Content-Type: application/json" -d "{\"benchmark_report\": \"${
 #################
 # PYTHON EXERCISM
 #################
-pytest -s tests/benchmarks/exercism_practice.py \
+./benchmarks/exercism_practice.py \
     --max_iterations 2 \
     --max_workers 1 \
     --max_benchmarks 200 \
@@ -47,7 +47,7 @@ curl -X POST -H "Content-Type: application/json" -d "{\"benchmark_report\": \"${
 #######################
 # REAL WORLD BENCHMARKS
 #######################
-pytest tests/benchmarks/benchmark_runner.py --benchmark -s --retries 2
+./benchmarks/benchmark_runner.py --retries 2
 SUMMARY=$(jq '.summary_string' results.json)
 
 # Upload results to S3
diff --git a/scripts/sampler/__main__.py b/scripts/sampler/__main__.py
index 7e7f7ab65..04fd9c082 100644
--- a/scripts/sampler/__main__.py
+++ b/scripts/sampler/__main__.py
@@ -13,13 +13,13 @@
 from run import run_sample
 from validate import validate_sample
 
-from mentat.sampler.sample import Sample
-from mentat.utils import mentat_dir_path
-from tests.benchmarks.benchmark_runner import (
+from benchmarks.benchmark_runner import (
     compare_diffs,
     grade_diff_syntax,
     grade_model_response,
 )
+from mentat.sampler.sample import Sample
+from mentat.utils import mentat_dir_path
 
 
 def warn(msg: Any):
diff --git a/setup.py b/setup.py
index b397fb811..a4ca409f1 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
     name="mentat",
     version=__version__,
     python_requires=">=3.10",
-    packages=find_packages(include=["mentat", "mentat.*", "tests"]),
+    packages=find_packages(include=["mentat", "mentat.*", "benchmarks"]),
     install_requires=[
         str(r)
         for r in pkg_resources.parse_requirements(
diff --git a/tests/benchmarks/README.md b/tests/benchmarks/README.md
deleted file mode 100644
index 1535c2cc8..000000000
--- a/tests/benchmarks/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# Benchmarks
-
-In this directory we write benchmarks for Mentat's performance on different tasks.
-
-## Running Benchmarks
-
-Benchmarks are run with pytest e.g.
-```
-pytest -s tests/benchmarks/exercism_practice.py --benchmark
-```
-Note you need the `-s` to see the printed results and `--benchmark` is necessary for tests that actually call gpt.
-
-They should not start with `test_` or end with `_test.py` so they will not be automatically collected and ran by pytest.
-
-Flags that control the performance of the benchmarks are defined in [conftest](/conftest.py) and set conservatively so benchmarks without flags will run relatively quickly and cheaply. To run the exercism benchmark with multiple workers on all the tests with one retry for the clojure language run the following:
-```
-pytest -s tests/benchmarks/exercism_practice.py --benchmark --max_benchmarks 134 --max_iterations 2 --max_workers 2 --language clojure --benchmark
-```
-
-Warning: If you increase max_workers much higher you'll start to get rate limited.
diff --git a/tests/conftest.py b/tests/conftest.py
index 6b41b878a..bcd768a8c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -47,69 +47,6 @@ def filter_mark(items, mark, exists):
 
 def pytest_addoption(parser):
     parser.addoption("--benchmark", action="store_true")
     parser.addoption("--uitest", action="store_true")
-    # The following flags are used by benchmark tests
-    parser.addoption(
-        "--max_benchmarks",
-        action="store",
-        default="1",
-        help="The maximum number of exercises to run",
-    )
-    parser.addoption(
-        "--retries",
-        action="store",
-        default="1",
-        help="Number of times to retry a benchmark",
-    )
-    parser.addoption(
-        "--max_iterations",
-        action="store",
-        default="1",
-        help="Number of times to rerun mentat with error messages",
-    )
-    parser.addoption(
-        "--language",
-        action="store",
-        default="python",
-        help="Which exercism language to do exercises for",
-    )
-    parser.addoption(
-        "--max_workers",
-        action="store",
-        default="1",
-        help="Number of workers to use for multiprocessing",
-    )
-    parser.addoption(
-        "--refresh_repo",
-        action="store_true",
-        default=False,
-        help="When set local changes will be discarded.",
-    )
-    parser.addoption(
-        "--benchmarks",
-        action="append",
-        nargs="*",
-        default=[],
-        help=(
-            "Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
-            " depends on benchmark."
-        ),
-    )
-    parser.addoption(
-        "--repo",
-        action="store",
-        default="mentat",
-        help="For benchmarks that are evaluated against a repo",
-    )
-    parser.addoption(
-        "--evaluate_baseline",
-        action="store_true",
-        help="Evaluate the baseline for the benchmark",
-    )
-
-
-@pytest.fixture
-def refresh_repo(request):
-    return request.config.getoption("--refresh_repo")
 
 
 @pytest.fixture
@@ -120,11 +57,6 @@ def benchmarks(request):
     return benchmarks
 
 
-@pytest.fixture
-def max_benchmarks(request):
-    return int(request.config.getoption("--max_benchmarks"))
-
-
 def pytest_configure(config):
     config.addinivalue_line("markers", "benchmark: run benchmarks that call openai")
     config.addinivalue_line(
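Usage note on the new entry points: every converted runner in this diff follows the same pattern of building `common_benchmark_parser()`, parsing flags, and dispatching to an async function under `if __name__ == "__main__"`. A new standalone benchmark script would hook in the same way. The sketch below mirrors the `__main__` block of `benchmark_runner.py` above; the module name `my_benchmark.py` and the `run_my_benchmark` coroutine are hypothetical and not part of this diff.

```python
#!/usr/bin/env python
# Hypothetical benchmarks/my_benchmark.py, mirroring benchmark_runner.py above.
import asyncio

from benchmarks.arg_parser import common_benchmark_parser


async def run_my_benchmark(benchmarks, max_benchmarks):
    # Placeholder body: a real benchmark would drive Mentat's PythonClient here.
    print(f"Would run {benchmarks or 'all benchmarks'} (max {max_benchmarks})")


if __name__ == "__main__":
    args = common_benchmark_parser().parse_args()
    # args.benchmarks is a list of lists because the flag uses action="append"
    # with nargs="*"; the existing runners read the first group.
    asyncio.run(run_my_benchmark(args.benchmarks[0], args.max_benchmarks))
```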