diff --git a/benchmarks/arg_parser.py b/benchmarks/arg_parser.py
index 2921bf487..edf030b31 100644
--- a/benchmarks/arg_parser.py
+++ b/benchmarks/arg_parser.py
@@ -17,9 +17,8 @@ def common_benchmark_parser():
     )
     parser.add_argument(
         "--benchmarks",
-        action="append",
         nargs="*",
-        default=[[]],
+        default=[],
         help=(
             "Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
             " depends on benchmark."
diff --git a/benchmarks/benchmark_runner.py b/benchmarks/benchmark_runner.py
index cba8cc624..ad68fdc1d 100755
--- a/benchmarks/benchmark_runner.py
+++ b/benchmarks/benchmark_runner.py
@@ -177,6 +177,7 @@ async def evaluate_sample(sample_file, retries=1):
     """Run a sample using Mentat and return the resulting diff"""
     sample = Sample.load(sample_file)
     results = []
+    start_dir = Path.cwd()
     for i in range(retries):
         formatted_title = re.sub(r"[ '\"/\\-^]", "", sample.title).replace(" ", "_")
         result = BenchmarkResult(
@@ -189,20 +190,23 @@
             diff_merge_base=sample.diff_merge_base,
             diff_active=sample.diff_active,
         )
-        cwd = Path(repo.working_dir)
-
-        # Run sample in PythonClient
-        paths = list[Path]()
-        for a in sample.context:
-            paths.append(Path(a))
-        client = PythonClient(cwd=cwd, paths=paths)
-        response = await run_client(
-            client, sample.message_prompt, result, sample.message_history
-        )
-        await grade_and_clean_diff(
-            repo, response, result, comparison_diff=sample.diff_edit
-        )
-        results.append(result)
+        try:
+            cwd = Path(repo.working_dir)
+
+            # Run sample in PythonClient
+            paths = list[Path]()
+            for a in sample.context:
+                paths.append(Path(a))
+            client = PythonClient(cwd=cwd, paths=paths)
+            response = await run_client(
+                client, sample.message_prompt, result, sample.message_history
+            )
+            await grade_and_clean_diff(
+                repo, response, result, comparison_diff=sample.diff_edit
+            )
+            results.append(result)
+        finally:
+            os.chdir(start_dir)
     return results
 
 
@@ -212,36 +216,43 @@ async def evalute_py(path, retries):
     title = benchmark.title
     print("Benchmark:", title)
 
-    repo = setup_repo(
-        url=benchmark.repo,
-        commit=benchmark.commit,
-    )
-    cwd = Path(repo.working_dir)
-
-    if hasattr(benchmark, "comparison_commit"):
-        comparison_commit = benchmark.comparison_commit
-        repo.git.checkout(comparison_commit)
-        comparison_diff = repo.git.diff(benchmark.commit)
-    else:
-        comparison_diff = None
-
-    for i, prompt in enumerate(benchmark.prompts):
-        print("  Prompt:", prompt)
-        for j in range(1, retries + 1):
-            formatted_title = re.sub(r"[ '\"/\\-^]", "", title).replace(" ", "_")
-            result = BenchmarkResult(
-                name=f"{formatted_title}-{i}-{j}",
-                family=formatted_title,
-            )
-            client = PythonClient(cwd=cwd, config=benchmark.config)
-            response = await run_client(client, prompt, result)
+    start_dir = Path.cwd()
+    try:
+        repo = setup_repo(
+            url=benchmark.repo,
+            commit=benchmark.commit,
+        )
+        cwd = Path(repo.working_dir)
 
-            await client.shutdown()
-            if hasattr(benchmark, "verify"):
-                result.verify = benchmark.verify()
+        if hasattr(benchmark, "comparison_commit"):
+            comparison_commit = benchmark.comparison_commit
+            repo.git.checkout(comparison_commit)
+            comparison_diff = repo.git.diff(benchmark.commit)
+        else:
+            comparison_diff = None
+
+        for i, prompt in enumerate(benchmark.prompts):
+            print("  Prompt:", prompt)
+            for j in range(1, retries + 1):
+                formatted_title = re.sub(r"[ '\"/\\-^]", "", title).replace(" ", "_")
+                result = BenchmarkResult(
+                    name=f"{formatted_title}-{i}-{j}",
+                    family=formatted_title,
+                )
+                client = PythonClient(
+                    cwd=cwd, paths=benchmark.paths, config=benchmark.config
+                )
+                response = await run_client(client, prompt, result)
 
-            await grade_and_clean_diff(repo, response, result, comparison_diff)
-            results.append(result)
+                await client.shutdown()
+                if hasattr(benchmark, "verify"):
+                    result.verify = benchmark.verify()
+
+                await grade_and_clean_diff(repo, response, result, comparison_diff)
+                os.chdir("../..")
+                results.append(result)
+    finally:
+        os.chdir(start_dir)
     return results
 
 
@@ -252,9 +263,9 @@ def benchmark_listed(title, benchmarks):
     return False
 
 
-async def run_benchmarks(retries, benchmarks):
+async def run_benchmarks(benchmarks, retries=1):
     print("Running benchmarks")
-    benchmarks_dir = f"{os.path.dirname(__file__)}/benchmarks"
+    benchmarks_dir = Path("benchmarks/benchmarks")
 
     benchmark_paths = []
     for root, dirs, files in os.walk(benchmarks_dir):
@@ -296,7 +307,7 @@
     args = parser.parse_args()
     asyncio.run(
         run_benchmarks(
+            args.benchmarks,
             args.retries,
-            args.benchmarks[0],
         )
     )
diff --git a/benchmarks/benchmarks/mentat/clojure_exercism_runner.py b/benchmarks/benchmarks/mentat/clojure_exercism_runner.py
index b05f70823..e69e9463d 100644
--- a/benchmarks/benchmarks/mentat/clojure_exercism_runner.py
+++ b/benchmarks/benchmarks/mentat/clojure_exercism_runner.py
@@ -14,6 +14,7 @@
 repo = "https://github.com/AbanteAI/mentat"
 commit = "d611e2ff742856c7328d54f6e71c2418f9c5508b"
 minimum_context = ["tests/benchmarks/exercise_runners"]
+paths = []
 
 config = Config(
     auto_context_tokens=8000,
diff --git a/benchmarks/benchmarks/mentat/license_update.py b/benchmarks/benchmarks/mentat/license_update.py
index fc5eb4289..2bc7b2a0f 100644
--- a/benchmarks/benchmarks/mentat/license_update.py
+++ b/benchmarks/benchmarks/mentat/license_update.py
@@ -22,6 +22,7 @@
 repo = "https://github.com/AbanteAI/mentat"
 commit = "b0848711c36e0c2fe9619ebb2b77dc6d27396ff2"
 minimum_context = ["tests/license_check.py:11-22"]
+paths = []
 
 config = Config(
     auto_context_tokens=8000,
diff --git a/benchmarks/benchmarks/mentat/pre_tags.py b/benchmarks/benchmarks/mentat/pre_tags.py
index 8ce733253..d45bbe958 100644
--- a/benchmarks/benchmarks/mentat/pre_tags.py
+++ b/benchmarks/benchmarks/mentat/pre_tags.py
@@ -22,6 +22,7 @@
 
 repo = "https://github.com/AbanteAI/mentat"
 commit = "b8d90b89e4a0d7ad266bf914c4ce99c473dd8dc0"
+paths = []
 
 config = Config(
     auto_context_tokens=8000,
diff --git a/benchmarks/exercism_practice.py b/benchmarks/exercism_practice.py
index d6d70eea3..0aad2f649 100755
--- a/benchmarks/exercism_practice.py
+++ b/benchmarks/exercism_practice.py
@@ -225,7 +225,7 @@ def run_exercism_benchmark(
     args = parser.parse_args()
     clone_exercism_repo(args.refresh_repo, args.language)
     run_exercism_benchmark(
-        args.benchmarks[0],
+        args.benchmarks,
         args.max_benchmarks,
         args.max_iterations,
         args.max_workers,
diff --git a/mentat/sampler/utils.py b/mentat/sampler/utils.py
index 730307825..0cc579b11 100644
--- a/mentat/sampler/utils.py
+++ b/mentat/sampler/utils.py
@@ -9,7 +9,7 @@
 from mentat.git_handler import get_non_gitignored_files
 from mentat.utils import is_file_text_encoded
 
-CLONE_TO_DIR = Path("benchmark_repos")
+CLONE_TO_DIR = Path("benchmarks/benchmark_repos")
 
 
 def clone_repo(
diff --git a/pyproject.toml b/pyproject.toml
index 3fe9e55c8..ca62663ff 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,14 +1,14 @@
 [tool.isort]
 profile = "black"
 known_first_party = "mentat"
-skip = ["vscode/bundled", "benchmark_repos", "testbed/exercism-python"]
+skip = ["vscode/bundled", "benchmarks/benchmark_repos", "testbed/exercism-python"]
 
 [tool.ruff]
 line-length = 120
 ignore = ["E731"]
 
 [tool.pytest.ini_options]
-addopts = "--ignore=vscode/bundled --ignore=benchmark_repos --ignore=testbed/exercism-python"
+addopts = "--ignore=vscode/bundled --ignore=benchmarks/benchmark_repos --ignore=testbed/exercism-python"
 
 [tool.black]
 preview = "true"
diff --git a/pyrightconfig.json b/pyrightconfig.json
index f584d379b..c68bae2e5 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -1,6 +1,13 @@
 {
     "include": ["mentat"],
-    "ignore": ["testbed", "tests", "scripts", "benchmark_repos", "build"],
+    "ignore": [
+        "testbed",
+        "tests",
+        "scripts",
+        "benchmark_repos",
+        "build",
+        "benchmarks/benchmark_repos",
+    ],
     "typeCheckingMode": "strict",
     "reportMissingTypeStubs": false,
 }
diff --git a/scripts/run_and_upload_benchmarks.sh b/scripts/run_and_upload_benchmarks.sh
index 91ea71e4a..4643b7d1f 100755
--- a/scripts/run_and_upload_benchmarks.sh
+++ b/scripts/run_and_upload_benchmarks.sh
@@ -11,12 +11,12 @@ TIMESTAMP=$(date +%Y%m%d%H%M%S)
     --max_benchmarks 200 \
     --language javascript
 
-SUMMARY=$(jq '.summary_string' benchmark_repos/exercism-javascript/results.json)
+SUMMARY=$(jq '.summary_string' benchmarks/benchmark_repos/exercism-javascript/results.json)
 BUCKET="benchmarks.mentat.ai"
 
 # Upload results to S3
-aws s3 cp benchmark_repos/exercism-javascript/results.html s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html
-aws s3 cp benchmark_repos/exercism-javascript/results.json s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.json
+aws s3 cp benchmarks/benchmark_repos/exercism-javascript/results.html s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html
+aws s3 cp benchmarks/benchmark_repos/exercism-javascript/results.json s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.json
 
 # Send slack notification
 JAVASCRIPT_RESULTS_URL="http://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html"
@@ -32,11 +32,11 @@ curl -X POST -H "Content-Type: application/json" -d "{\"benchmark_report\": \"${
     --max_benchmarks 200 \
     --language python
 
-SUMMARY=$(jq '.summary_string' benchmark_repos/exercism-python/results.json)
+SUMMARY=$(jq '.summary_string' benchmarks/benchmark_repos/exercism-python/results.json)
 
 # Upload results to S3
-aws s3 cp benchmark_repos/exercism-python/results.html s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.html
-aws s3 cp benchmark_repos/exercism-python/results.json s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.json
+aws s3 cp benchmarks/benchmark_repos/exercism-python/results.html s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.html
+aws s3 cp benchmarks/benchmark_repos/exercism-python/results.json s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.json
 
 # Send slack notification
 PYTHON_RESULTS_URL="http://${BUCKET}/exercism-python-results-${TIMESTAMP}.html"
diff --git a/testbed/benchmarks/benchmarks/clojure_exercism_runner.py b/testbed/benchmarks/benchmarks/clojure_exercism_runner.py
new file mode 100644
index 000000000..26f40db74
--- /dev/null
+++ b/testbed/benchmarks/benchmarks/clojure_exercism_runner.py
@@ -0,0 +1,19 @@
+from mentat.config import Config
+
+title = "Clojure Exercism Runner"
+
+description = """
+This benchmark tests the ability to write an exercism test runner for the clojure language.
+"""
+
+prompts = [
+    "Write a test runner for the clojure language.",
+]
+
+
+repo = "https://github.com/AbanteAI/mentat"
+commit = "d611e2ff742856c7328d54f6e71c2418f9c5508b"
+minimum_context = ["tests/benchmarks/exercise_runners"]
+paths = ["tests/benchmarks/exercise_runners"]
+
+config = Config()
diff --git a/testbed/exercism-python/exercises/practice/accumulate/accumulate_test.py b/testbed/exercism-python/exercises/practice/accumulate/accumulate_test.py
index a49be036d..08372a6bc 100644
--- a/testbed/exercism-python/exercises/practice/accumulate/accumulate_test.py
+++ b/testbed/exercism-python/exercises/practice/accumulate/accumulate_test.py
@@ -9,31 +9,33 @@ def test_empty_sequence(self):
 
     def test_pow(self):
         self.assertEqual(
-            accumulate([1, 2, 3, 4, 5], lambda x: x * x), [1, 4, 9, 16, 25])
+            accumulate([1, 2, 3, 4, 5], lambda x: x * x), [1, 4, 9, 16, 25]
+        )
 
     def test_divmod(self):
         self.assertEqual(
-            accumulate([10, 17, 23], lambda x: divmod(x, 7)),
-            [(1, 3), (2, 3), (3, 2)])
+            accumulate([10, 17, 23], lambda x: divmod(x, 7)), [(1, 3), (2, 3), (3, 2)]
+        )
 
     def test_composition(self):
         inp = [10, 17, 23]
         self.assertEqual(
             accumulate(
-                accumulate(inp, lambda x: divmod(x, 7)),
-                lambda x: 7 * x[0] + x[1]), inp)
+                accumulate(inp, lambda x: divmod(x, 7)), lambda x: 7 * x[0] + x[1]
+            ),
+            inp,
+        )
 
     def test_capitalize(self):
-        self.assertEqual(
-            accumulate(['hello', 'world'], str.upper), ['HELLO', 'WORLD'])
+        self.assertEqual(accumulate(["hello", "world"], str.upper), ["HELLO", "WORLD"])
 
     def test_recursive(self):
-        inp = ['a', 'b', 'c']
-        out = [['a1', 'a2', 'a3'], ['b1', 'b2', 'b3'], ['c1', 'c2', 'c3']]
+        inp = ["a", "b", "c"]
+        out = [["a1", "a2", "a3"], ["b1", "b2", "b3"], ["c1", "c2", "c3"]]
         self.assertEqual(
-            accumulate(
-                inp, lambda x: accumulate(list('123'), lambda y: x + y)), out)
+            accumulate(inp, lambda x: accumulate(list("123"), lambda y: x + y)), out
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/tests/benchmarks/test_benchmark_runner.py b/tests/benchmarks/test_benchmark_runner.py
new file mode 100644
index 000000000..e8db354e8
--- /dev/null
+++ b/tests/benchmarks/test_benchmark_runner.py
@@ -0,0 +1,95 @@
+import json
+import os
+from textwrap import dedent
+from unittest.mock import patch
+
+import pytest
+
+from benchmarks.benchmark_runner import run_benchmarks
+
+
+@pytest.fixture
+def mock_webbrowser():
+    with patch("webbrowser.open") as mock:
+        yield mock
+
+
+@pytest.mark.asyncio
+async def test_run_exercism_benchmark(mock_webbrowser, mock_call_llm_api):
+    cwd = os.getcwd()
+    mock_call_llm_api.set_return_values(
+        [
+            dedent("""\
+                Here are the code changes:
+
+                @@start
+                {
+                    "file": "tests/benchmarks/exercise_runners/clojure_exercise_runner.py",
+                    "action": "create-file"
+                }
+                @@code
+                from .abstract_exercise_runner import AbstractExerciseRunner
+                import subprocess
+                import os
+
+
+                class ClojureExerciseRunner(AbstractExerciseRunner):
+                    def __init__(self, exercise):
+                        super().__init__(exercise, "clj")
+                        self.file = self.file.with_suffix(".clj")
+                        self.full_path = self.dir / self.file
+
+                    def run_test(self):
+                        self._run_test_command(["lein", "test"], cwd=str(self.dir))
+
+                    def passed(self):
+                        try:
+                            with open(self.test_output_file, "r") as f:
+                                lines = f.readlines()
+                            return "FAIL" not in lines[0] and "PASS" in lines[0]
+                        except FileNotFoundError:
+                            return False
+                @@end
+
+                @@start
+                {
+                    "file": "tests/benchmarks/exercise_runners/exercise_runner_factory.py",
+                    "action": "insert",
+                    "insert-after-line": 2,
+                    "insert-before-line": 3
+                }
+                @@code
+                from .clojure_exercise_runner import ClojureExerciseRunner
+                @@end
+
+                @@start
+                {
+                    "file": "tests/benchmarks/exercise_runners/exercise_runner_factory.py",
+                    "action": "insert",
+                    "insert-after-line": 7,
+                    "insert-before-line": 8
+                }
+                @@code
+                "clojure": ClojureExerciseRunner,
+                @@end"""),
+            dedent("""\
+                {
+                    "indentation": false,
+                    "off_by_one": false,
+                    "syntax": false
+                }"""),
+            dedent("""\
+                {
+                    "referenced_format": true,
+                    "trailing_waffling": false
+                }"""),
+        ]
+    )
+    await run_benchmarks(["Clojure Exercism Runner"])
+    assert os.getcwd() == cwd
+    with open("results.json") as f:
+        results = json.load(f)
+    summary = results["summary"]
+    assert summary["tokens (avg)"] == "0.00 "
+    assert summary["cost"] == "$0 "
+    assert summary["referenced_format"] == "100.00% "
diff --git a/tests/benchmarks/test_exercism_benchmark.py b/tests/benchmarks/test_exercism_benchmark.py
index b84c59af7..951da0aa7 100644
--- a/tests/benchmarks/test_exercism_benchmark.py
+++ b/tests/benchmarks/test_exercism_benchmark.py
@@ -36,6 +36,7 @@ def mock_pool():
 
 def test_run_exercism_benchmark(mock_pool, mock_webbrowser, mock_call_llm_api):
     os.chdir("exercism-python")
+    cwd = os.getcwd()
     mock_call_llm_api.set_return_values(
         [
             dedent("""\
@@ -103,6 +104,7 @@ def personal_top_three(self):
         1,
         "python",
     )
+    assert os.getcwd() == cwd
     with open("results.json") as f:
         results = json.load(f)
     summary = results["summary"]