diff --git a/benchmarks/arg_parser.py b/benchmarks/arg_parser.py
index 2921bf487..edf030b31 100644
--- a/benchmarks/arg_parser.py
+++ b/benchmarks/arg_parser.py
@@ -17,9 +17,8 @@ def common_benchmark_parser():
     )
     parser.add_argument(
         "--benchmarks",
-        action="append",
         nargs="*",
-        default=[[]],
+        default=[],
         help=(
             "Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
             " depends on benchmark."
diff --git a/benchmarks/benchmark_runner.py b/benchmarks/benchmark_runner.py
index cba8cc624..ad68fdc1d 100755
--- a/benchmarks/benchmark_runner.py
+++ b/benchmarks/benchmark_runner.py
@@ -177,6 +177,7 @@ async def evaluate_sample(sample_file, retries=1):
     """Run a sample using Mentat and return the resulting diff"""
     sample = Sample.load(sample_file)
     results = []
+    start_dir = Path.cwd()
     for i in range(retries):
         formatted_title = re.sub(r"[ '\"/\\-^]", "", sample.title).replace(" ", "_")
         result = BenchmarkResult(
@@ -189,20 +190,23 @@
             diff_merge_base=sample.diff_merge_base,
             diff_active=sample.diff_active,
         )
-        cwd = Path(repo.working_dir)
-
-        # Run sample in PythonClient
-        paths = list[Path]()
-        for a in sample.context:
-            paths.append(Path(a))
-        client = PythonClient(cwd=cwd, paths=paths)
-        response = await run_client(
-            client, sample.message_prompt, result, sample.message_history
-        )
-        await grade_and_clean_diff(
-            repo, response, result, comparison_diff=sample.diff_edit
-        )
-        results.append(result)
+        try:
+            cwd = Path(repo.working_dir)
+
+            # Run sample in PythonClient
+            paths = list[Path]()
+            for a in sample.context:
+                paths.append(Path(a))
+            client = PythonClient(cwd=cwd, paths=paths)
+            response = await run_client(
+                client, sample.message_prompt, result, sample.message_history
+            )
+            await grade_and_clean_diff(
+                repo, response, result, comparison_diff=sample.diff_edit
+            )
+            results.append(result)
+        finally:
+            os.chdir(start_dir)
     return results
 
 
@@ -212,36 +216,43 @@ async def evalute_py(path, retries):
     title = benchmark.title
     print("Benchmark:", title)
 
-    repo = setup_repo(
-        url=benchmark.repo,
-        commit=benchmark.commit,
-    )
-    cwd = Path(repo.working_dir)
-
-    if hasattr(benchmark, "comparison_commit"):
-        comparison_commit = benchmark.comparison_commit
-        repo.git.checkout(comparison_commit)
-        comparison_diff = repo.git.diff(benchmark.commit)
-    else:
-        comparison_diff = None
-
-    for i, prompt in enumerate(benchmark.prompts):
-        print("  Prompt:", prompt)
-        for j in range(1, retries + 1):
-            formatted_title = re.sub(r"[ '\"/\\-^]", "", title).replace(" ", "_")
-            result = BenchmarkResult(
-                name=f"{formatted_title}-{i}-{j}",
-                family=formatted_title,
-            )
-            client = PythonClient(cwd=cwd, config=benchmark.config)
-            response = await run_client(client, prompt, result)
+    start_dir = Path.cwd()
+    try:
+        repo = setup_repo(
+            url=benchmark.repo,
+            commit=benchmark.commit,
+        )
+        cwd = Path(repo.working_dir)
 
-            await client.shutdown()
-            if hasattr(benchmark, "verify"):
-                result.verify = benchmark.verify()
+        if hasattr(benchmark, "comparison_commit"):
+            comparison_commit = benchmark.comparison_commit
+            repo.git.checkout(comparison_commit)
+            comparison_diff = repo.git.diff(benchmark.commit)
+        else:
+            comparison_diff = None
+
+        for i, prompt in enumerate(benchmark.prompts):
+            print("  Prompt:", prompt)
+            for j in range(1, retries + 1):
+                formatted_title = re.sub(r"[ '\"/\\-^]", "", title).replace(" ", "_")
+                result = BenchmarkResult(
+                    name=f"{formatted_title}-{i}-{j}",
+                    family=formatted_title,
+                )
+                client = PythonClient(
+                    cwd=cwd, paths=benchmark.paths, config=benchmark.config
+                )
+                response = await run_client(client, prompt, result)
 
-            await grade_and_clean_diff(repo, response, result, comparison_diff)
-            results.append(result)
+                await client.shutdown()
+                if hasattr(benchmark, "verify"):
+                    result.verify = benchmark.verify()
+
+                await grade_and_clean_diff(repo, response, result, comparison_diff)
+                os.chdir("../..")
+                results.append(result)
+    finally:
+        os.chdir(start_dir)
     return results
 
 
@@ -252,9 +263,9 @@ def benchmark_listed(title, benchmarks):
     return False
 
 
-async def run_benchmarks(retries, benchmarks):
+async def run_benchmarks(benchmarks, retries=1):
     print("Running benchmarks")
-    benchmarks_dir = f"{os.path.dirname(__file__)}/benchmarks"
+    benchmarks_dir = Path("benchmarks/benchmarks")
 
     benchmark_paths = []
     for root, dirs, files in os.walk(benchmarks_dir):
@@ -296,7 +307,7 @@
     args = parser.parse_args()
     asyncio.run(
         run_benchmarks(
+            args.benchmarks,
             args.retries,
-            args.benchmarks[0],
         )
     )
diff --git a/benchmarks/benchmarks/mentat/clojure_exercism_runner.py b/benchmarks/benchmarks/mentat/clojure_exercism_runner.py
index b05f70823..e69e9463d 100644
--- a/benchmarks/benchmarks/mentat/clojure_exercism_runner.py
+++ b/benchmarks/benchmarks/mentat/clojure_exercism_runner.py
@@ -14,6 +14,7 @@
 repo = "https://github.com/AbanteAI/mentat"
 commit = "d611e2ff742856c7328d54f6e71c2418f9c5508b"
 minimum_context = ["tests/benchmarks/exercise_runners"]
+paths = []
 
 config = Config(
     auto_context_tokens=8000,
diff --git a/benchmarks/benchmarks/mentat/license_update.py b/benchmarks/benchmarks/mentat/license_update.py
index fc5eb4289..2bc7b2a0f 100644
--- a/benchmarks/benchmarks/mentat/license_update.py
+++ b/benchmarks/benchmarks/mentat/license_update.py
@@ -22,6 +22,7 @@
 repo = "https://github.com/AbanteAI/mentat"
 commit = "b0848711c36e0c2fe9619ebb2b77dc6d27396ff2"
 minimum_context = ["tests/license_check.py:11-22"]
+paths = []
 
 config = Config(
     auto_context_tokens=8000,
diff --git a/benchmarks/benchmarks/mentat/pre_tags.py b/benchmarks/benchmarks/mentat/pre_tags.py
index 8ce733253..d45bbe958 100644
--- a/benchmarks/benchmarks/mentat/pre_tags.py
+++ b/benchmarks/benchmarks/mentat/pre_tags.py
@@ -22,6 +22,7 @@
 
 repo = "https://github.com/AbanteAI/mentat"
 commit = "b8d90b89e4a0d7ad266bf914c4ce99c473dd8dc0"
+paths = []
 
 config = Config(
     auto_context_tokens=8000,
diff --git a/benchmarks/exercism_practice.py b/benchmarks/exercism_practice.py
index d6d70eea3..0aad2f649 100755
--- a/benchmarks/exercism_practice.py
+++ b/benchmarks/exercism_practice.py
@@ -225,7 +225,7 @@ def run_exercism_benchmark(
     args = parser.parse_args()
     clone_exercism_repo(args.refresh_repo, args.language)
     run_exercism_benchmark(
-        args.benchmarks[0],
+        args.benchmarks,
         args.max_benchmarks,
         args.max_iterations,
         args.max_workers,
diff --git a/mentat/sampler/utils.py b/mentat/sampler/utils.py
index 730307825..0cc579b11 100644
--- a/mentat/sampler/utils.py
+++ b/mentat/sampler/utils.py
@@ -9,7 +9,7 @@
 from mentat.git_handler import get_non_gitignored_files
 from mentat.utils import is_file_text_encoded
 
-CLONE_TO_DIR = Path("benchmark_repos")
+CLONE_TO_DIR = Path("benchmarks/benchmark_repos")
 
 
 def clone_repo(
diff --git a/pyproject.toml b/pyproject.toml
index 3fe9e55c8..ca62663ff 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,14 +1,14 @@
 [tool.isort]
 profile = "black"
 known_first_party = "mentat"
-skip = ["vscode/bundled", "benchmark_repos", "testbed/exercism-python"]
+skip = ["vscode/bundled", "benchmarks/benchmark_repos", "testbed/exercism-python"]
 
 [tool.ruff]
 line-length = 120
 ignore = ["E731"]
 
 [tool.pytest.ini_options]
-addopts = "--ignore=vscode/bundled --ignore=benchmark_repos --ignore=testbed/exercism-python"
+addopts = "--ignore=vscode/bundled --ignore=benchmarks/benchmark_repos --ignore=testbed/exercism-python"
 
 [tool.black]
 preview = "true"
diff --git a/pyrightconfig.json b/pyrightconfig.json
index f584d379b..c68bae2e5 100644
--- a/pyrightconfig.json
+++ b/pyrightconfig.json
@@ -1,6 +1,13 @@
 {
     "include": ["mentat"],
-    "ignore": ["testbed", "tests", "scripts", "benchmark_repos", "build"],
+    "ignore": [
+        "testbed",
+        "tests",
+        "scripts",
+        "benchmark_repos",
+        "build",
+        "benchmarks/benchmark_repos",
+    ],
     "typeCheckingMode": "strict",
     "reportMissingTypeStubs": false,
 }
diff --git a/scripts/run_and_upload_benchmarks.sh b/scripts/run_and_upload_benchmarks.sh
index 91ea71e4a..4643b7d1f 100755
--- a/scripts/run_and_upload_benchmarks.sh
+++ b/scripts/run_and_upload_benchmarks.sh
@@ -11,12 +11,12 @@ TIMESTAMP=$(date +%Y%m%d%H%M%S)
     --max_benchmarks 200 \
     --language javascript
 
-SUMMARY=$(jq '.summary_string' benchmark_repos/exercism-javascript/results.json)
+SUMMARY=$(jq '.summary_string' benchmarks/benchmark_repos/exercism-javascript/results.json)
 BUCKET="benchmarks.mentat.ai"
 
 # Upload results to S3
-aws s3 cp benchmark_repos/exercism-javascript/results.html s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html
-aws s3 cp benchmark_repos/exercism-javascript/results.json s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.json
+aws s3 cp benchmarks/benchmark_repos/exercism-javascript/results.html s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html
+aws s3 cp benchmarks/benchmark_repos/exercism-javascript/results.json s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.json
 
 # Send slack notification
 JAVASCRIPT_RESULTS_URL="http://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html"
@@ -32,11 +32,11 @@ curl -X POST -H "Content-Type: application/json" -d "{\"benchmark_report\": \"${
     --max_benchmarks 200 \
     --language python
 
-SUMMARY=$(jq '.summary_string' benchmark_repos/exercism-python/results.json)
+SUMMARY=$(jq '.summary_string' benchmarks/benchmark_repos/exercism-python/results.json)
 
 # Upload results to S3
-aws s3 cp benchmark_repos/exercism-python/results.html s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.html
-aws s3 cp benchmark_repos/exercism-python/results.json s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.json
+aws s3 cp benchmarks/benchmark_repos/exercism-python/results.html s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.html
+aws s3 cp benchmarks/benchmark_repos/exercism-python/results.json s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.json
 
 # Send slack notification
 PYTHON_RESULTS_URL="http://${BUCKET}/exercism-python-results-${TIMESTAMP}.html"
diff --git a/testbed/benchmarks/benchmarks/clojure_exercism_runner.py b/testbed/benchmarks/benchmarks/clojure_exercism_runner.py
new file mode 100644
index 000000000..26f40db74
--- /dev/null
+++ b/testbed/benchmarks/benchmarks/clojure_exercism_runner.py
@@ -0,0 +1,19 @@
+from mentat.config import Config
+
+title = "Clojure Exercism Runner"
+
+description = """
+This benchmark tests the ability to write an exercism test runner for the clojure language.
+"""
+
+prompts = [
+    "Write a test runner for the clojure language.",
+]
+
+
+repo = "https://github.com/AbanteAI/mentat"
+commit = "d611e2ff742856c7328d54f6e71c2418f9c5508b"
+minimum_context = ["tests/benchmarks/exercise_runners"]
+paths = ["tests/benchmarks/exercise_runners"]
+
+config = Config()
diff --git a/testbed/exercism-python/exercises/practice/accumulate/accumulate_test.py b/testbed/exercism-python/exercises/practice/accumulate/accumulate_test.py
index a49be036d..08372a6bc 100644
--- a/testbed/exercism-python/exercises/practice/accumulate/accumulate_test.py
+++ b/testbed/exercism-python/exercises/practice/accumulate/accumulate_test.py
@@ -9,31 +9,33 @@ def test_empty_sequence(self):
 
     def test_pow(self):
         self.assertEqual(
-            accumulate([1, 2, 3, 4, 5], lambda x: x * x), [1, 4, 9, 16, 25])
+            accumulate([1, 2, 3, 4, 5], lambda x: x * x), [1, 4, 9, 16, 25]
+        )
 
     def test_divmod(self):
         self.assertEqual(
-            accumulate([10, 17, 23], lambda x: divmod(x, 7)),
-            [(1, 3), (2, 3), (3, 2)])
+            accumulate([10, 17, 23], lambda x: divmod(x, 7)), [(1, 3), (2, 3), (3, 2)]
+        )
 
     def test_composition(self):
         inp = [10, 17, 23]
         self.assertEqual(
             accumulate(
-                accumulate(inp, lambda x: divmod(x, 7)),
-                lambda x: 7 * x[0] + x[1]), inp)
+                accumulate(inp, lambda x: divmod(x, 7)), lambda x: 7 * x[0] + x[1]
+            ),
+            inp,
+        )
 
     def test_capitalize(self):
-        self.assertEqual(
-            accumulate(['hello', 'world'], str.upper), ['HELLO', 'WORLD'])
+        self.assertEqual(accumulate(["hello", "world"], str.upper), ["HELLO", "WORLD"])
 
     def test_recursive(self):
-        inp = ['a', 'b', 'c']
-        out = [['a1', 'a2', 'a3'], ['b1', 'b2', 'b3'], ['c1', 'c2', 'c3']]
+        inp = ["a", "b", "c"]
+        out = [["a1", "a2", "a3"], ["b1", "b2", "b3"], ["c1", "c2", "c3"]]
         self.assertEqual(
-            accumulate(
-                inp, lambda x: accumulate(list('123'), lambda y: x + y)), out)
+            accumulate(inp, lambda x: accumulate(list("123"), lambda y: x + y)), out
+        )
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     unittest.main()
diff --git a/tests/benchmarks/test_benchmark_runner.py b/tests/benchmarks/test_benchmark_runner.py
new file mode 100644
index 000000000..e8db354e8
--- /dev/null
+++ b/tests/benchmarks/test_benchmark_runner.py
@@ -0,0 +1,95 @@
+import json
+import os
+from textwrap import dedent
+from unittest.mock import patch
+
+import pytest
+
+from benchmarks.benchmark_runner import run_benchmarks
+
+
+@pytest.fixture
+def mock_webbrowser():
+    with patch("webbrowser.open") as mock:
+        yield mock
+
+
+@pytest.mark.asyncio
+async def test_run_exercism_benchmark(mock_webbrowser, mock_call_llm_api):
+    cwd = os.getcwd()
+    mock_call_llm_api.set_return_values(
+        [
+            dedent("""\
+                Here are the code changes:
+
+                @@start
+                {
+                    "file": "tests/benchmarks/exercise_runners/clojure_exercise_runner.py",
+                    "action": "create-file"
+                }
+                @@code
+                from .abstract_exercise_runner import AbstractExerciseRunner
+                import subprocess
+                import os
+
+
+                class ClojureExerciseRunner(AbstractExerciseRunner):
+                    def __init__(self, exercise):
+                        super().__init__(exercise, "clj")
+                        self.file = self.file.with_suffix(".clj")
+                        self.full_path = self.dir / self.file
+
+                    def run_test(self):
+                        self._run_test_command(["lein", "test"], cwd=str(self.dir))
+
+                    def passed(self):
+                        try:
+                            with open(self.test_output_file, "r") as f:
+                                lines = f.readlines()
+                            return "FAIL" not in lines[0] and "PASS" in lines[0]
+                        except FileNotFoundError:
+                            return False
+                @@end
+
+                @@start
+                {
+                    "file": "tests/benchmarks/exercise_runners/exercise_runner_factory.py",
+                    "action": "insert",
+                    "insert-after-line": 2,
+                    "insert-before-line": 3
+                }
+                @@code
+                from .clojure_exercise_runner import ClojureExerciseRunner
+                @@end
+
+                @@start
+                {
+                    "file": "tests/benchmarks/exercise_runners/exercise_runner_factory.py",
+                    "action": "insert",
+                    "insert-after-line": 7,
+                    "insert-before-line": 8
+                }
+                @@code
+                "clojure": ClojureExerciseRunner,
+                @@end"""),
+            dedent("""\
+                {
+                    "indentation": false,
+                    "off_by_one": false,
+                    "syntax": false
+                }"""),
+            dedent("""\
+                {
+                    "referenced_format": true,
+                    "trailing_waffling": false
+                }"""),
+        ]
+    )
+    await run_benchmarks(["Clojure Exercism Runner"])
+    assert os.getcwd() == cwd
+    with open("results.json") as f:
+        results = json.load(f)
+    summary = results["summary"]
+    assert summary["tokens (avg)"] == "0.00 "
+    assert summary["cost"] == "$0 "
+    assert summary["referenced_format"] == "100.00% "
diff --git a/tests/benchmarks/test_exercism_benchmark.py b/tests/benchmarks/test_exercism_benchmark.py
index b84c59af7..951da0aa7 100644
--- a/tests/benchmarks/test_exercism_benchmark.py
+++ b/tests/benchmarks/test_exercism_benchmark.py
@@ -36,6 +36,7 @@ def mock_pool():
 
 def test_run_exercism_benchmark(mock_pool, mock_webbrowser, mock_call_llm_api):
     os.chdir("exercism-python")
+    cwd = os.getcwd()
     mock_call_llm_api.set_return_values(
         [
             dedent("""\
@@ -103,6 +104,7 @@ def personal_top_three(self):
         1,
         "python",
     )
+    assert os.getcwd() == cwd
     with open("results.json") as f:
         results = json.load(f)
     summary = results["summary"]