diff --git a/benchmarks/README.md b/benchmarks/README.md
new file mode 100644
index 000000000..66489470a
--- /dev/null
+++ b/benchmarks/README.md
@@ -0,0 +1,26 @@
+# Benchmarks
+
+This directory contains benchmarks that measure Mentat's performance on different tasks.
+
+## Running Exercism Benchmarks
+
+```
+./benchmarks/exercism_practice.py
+```
+
+Flags that control how the benchmarks run are defined in [arg_parser.py](arg_parser.py) and default to conservative values, so a run without any flags is relatively quick and cheap. To run the exercism benchmarks on all of the Clojure exercises, with two workers and one extra iteration, run the following:
+```
+./benchmarks/exercism_practice.py --max_benchmarks 134 --max_iterations 2 --max_workers 2 --language clojure
+```
+
+Warning: if you increase `max_workers` much beyond this you'll start to get rate limited.
+
+## Running Real World Benchmarks
+
+```
+./benchmarks/benchmark_runner.py
+```
+
+## Making Real World Benchmarks
+
+Real world benchmarks can be either [samples](benchmarks/mentat/sample_15223222005645d08b81f093e51d52fe.json) or [python files](benchmarks/mentat/).
diff --git a/tests/benchmarks/__init__.py b/benchmarks/__init__.py
similarity index 100%
rename from tests/benchmarks/__init__.py
rename to benchmarks/__init__.py
diff --git a/benchmarks/arg_parser.py b/benchmarks/arg_parser.py
new file mode 100644
index 000000000..2921bf487
--- /dev/null
+++ b/benchmarks/arg_parser.py
@@ -0,0 +1,65 @@
+import argparse
+
+
+def common_benchmark_parser():
+    parser = argparse.ArgumentParser(description="Run exercism benchmarks")
+    parser.add_argument(
+        "--refresh_repo",
+        action="store_true",
+        default=False,
+        help="When set, local changes will be discarded.",
+    )
+    parser.add_argument(
+        "--language",
+        default="python",
+        type=str,
+        help="Which exercism language to do exercises for",
+    )
+    parser.add_argument(
+        "--benchmarks",
+        action="append",
+        nargs="*",
+        default=[[]],
+        help=(
+            "Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
+            " depends on benchmark."
+        ),
+    )
+    parser.add_argument(
+        "--max_benchmarks",
+        default=1,
+        type=int,
+        help="The maximum number of exercises to run",
+    )
+    parser.add_argument(
+        "--max_iterations",
+        default=1,
+        type=int,
+        help="Number of times to rerun mentat with error messages",
+    )
+    parser.add_argument(
+        "--max_workers",
+        default=1,
+        type=int,
+        help="Number of workers to use for multiprocessing",
+    )
+    parser.add_argument(
+        "--retries",
+        action="store",
+        default=1,
+        type=int,
+        help="Number of times to retry a benchmark",
+    )
+    parser.add_argument(
+        "--repo",
+        action="store",
+        default="mentat",
+        help="For benchmarks that are evaluated against a repo",
+    )
+    parser.add_argument(
+        "--evaluate_baseline",
+        action="store_true",
+        help="Evaluate the baseline for the benchmark",
+    )
+
+    return parser
diff --git a/tests/benchmarks/benchmark_result.py b/benchmarks/benchmark_result.py
similarity index 100%
rename from tests/benchmarks/benchmark_result.py
rename to benchmarks/benchmark_result.py
diff --git a/tests/benchmarks/benchmark_result_summary.py b/benchmarks/benchmark_result_summary.py
similarity index 96%
rename from tests/benchmarks/benchmark_result_summary.py
rename to benchmarks/benchmark_result_summary.py
index 9dfb24fc1..5e02ed584 100644
--- a/tests/benchmarks/benchmark_result_summary.py
+++ b/benchmarks/benchmark_result_summary.py
@@ -6,7 +6,7 @@
 import attr
 from jinja2 import Environment, FileSystemLoader, select_autoescape
 
-from tests.benchmarks.benchmark_result import BenchmarkResult
+from benchmarks.benchmark_result import BenchmarkResult
 
 
 class BenchmarkResultSummary:
@@ -124,9 +124,7 @@ def summary_string(self) -> str:
     def render_results(self):
         env = Environment(
             loader=FileSystemLoader(
-                os.path.join(
-                    os.path.dirname(__file__), "../../mentat/resources/templates"
-                )
+                os.path.join(os.path.dirname(__file__), "../mentat/resources/templates")
             ),
             autoescape=select_autoescape(["html", "xml"]),
         )
diff --git a/tests/benchmarks/benchmark_runner.py b/benchmarks/benchmark_runner.py
old mode 100644
new mode 100755
similarity index 95%
rename from tests/benchmarks/benchmark_runner.py
rename to benchmarks/benchmark_runner.py
index 1d10be6ef..b53250249
--- a/tests/benchmarks/benchmark_runner.py
+++ b/benchmarks/benchmark_runner.py
@@ -1,26 +1,26 @@
+#!/usr/bin/env python
+import asyncio
 import importlib.util
 import json
 import os
 import re
 from pathlib import Path
 
-import pytest
 from openai.types.chat import (
     ChatCompletionAssistantMessageParam,
     ChatCompletionUserMessageParam,
 )
 from openai.types.chat.completion_create_params import ResponseFormat
 
+from benchmarks.arg_parser import common_benchmark_parser
+from benchmarks.benchmark_result import BenchmarkResult
+from benchmarks.benchmark_result_summary import BenchmarkResultSummary
 from mentat.errors import SampleError
 from mentat.llm_api_handler import model_context_size, prompt_tokens
 from mentat.python_client.client import PythonClient
 from mentat.sampler.sample import Sample
 from mentat.sampler.utils import setup_repo
 from mentat.session_context import SESSION_CONTEXT
-from tests.benchmarks.benchmark_result import BenchmarkResult
-from tests.benchmarks.benchmark_result_summary import BenchmarkResultSummary
-
-pytestmark = pytest.mark.benchmark
 
 
 def dynamic_import(path_to_module, module_name):
@@ -30,11 +30,6 @@ def dynamic_import(path_to_module, module_name):
     return module
 
 
-@pytest.fixture
-def retries(request):
-    return int(request.config.getoption("--retries"))
-
-
 async def grade(to_grade, prompt, model="gpt-4-1106-preview"):
     try:
         messages = [
@@ -257,8 +252,7 @@ def benchmark_listed(title, benchmarks):
     return False
 
 
-@pytest.mark.asyncio
-async def test_benchmark(retries, benchmarks):
+async def run_benchmarks(retries, benchmarks):
     print("Running benchmarks")
     benchmarks_dir = f"{os.path.dirname(__file__)}/benchmarks"
 
@@ -296,3 +290,14 @@ async def test_benchmark(retries, benchmarks):
     with open("results.json", "w") as f:
         f.write(summary.to_json())
     summary.render_results()
+
+
+if __name__ == "__main__":
+    parser = common_benchmark_parser()
+    args = parser.parse_args()
+    asyncio.run(
+        run_benchmarks(
+            args.retries,
+            args.benchmarks[0],
+        )
+    )
diff --git a/tests/benchmarks/benchmarks/mentat/clojure_exercism_runner.py b/benchmarks/benchmarks/mentat/clojure_exercism_runner.py
similarity index 100%
rename from tests/benchmarks/benchmarks/mentat/clojure_exercism_runner.py
rename to benchmarks/benchmarks/mentat/clojure_exercism_runner.py
diff --git a/tests/benchmarks/benchmarks/mentat/license_update.py b/benchmarks/benchmarks/mentat/license_update.py
similarity index 100%
rename from tests/benchmarks/benchmarks/mentat/license_update.py
rename to benchmarks/benchmarks/mentat/license_update.py
diff --git a/tests/benchmarks/benchmarks/mentat/pre_tags.py b/benchmarks/benchmarks/mentat/pre_tags.py
similarity index 100%
rename from tests/benchmarks/benchmarks/mentat/pre_tags.py
rename to benchmarks/benchmarks/mentat/pre_tags.py
diff --git a/tests/benchmarks/benchmarks/mentat/sample_15223222005645d08b81f093e51d52fe.json b/benchmarks/benchmarks/mentat/sample_15223222005645d08b81f093e51d52fe.json
similarity index 100%
rename from tests/benchmarks/benchmarks/mentat/sample_15223222005645d08b81f093e51d52fe.json
rename to benchmarks/benchmarks/mentat/sample_15223222005645d08b81f093e51d52fe.json
diff --git a/tests/benchmarks/context_benchmark.py b/benchmarks/context_benchmark.py
old mode 100644
new mode 100755
similarity index 87%
rename from tests/benchmarks/context_benchmark.py
rename to benchmarks/context_benchmark.py
index ef67d0792..191085f43
--- a/tests/benchmarks/context_benchmark.py
+++ b/benchmarks/context_benchmark.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+import asyncio
 import json
 import os
 from collections import defaultdict
@@ -5,20 +7,18 @@
 from pathlib import Path
 from typing import Any
 
-import pytest
 from git import Repo
 
+from benchmarks.arg_parser import common_benchmark_parser
 from mentat.code_context import CodeContext
-from mentat.code_feature import CodeFeature, CodeMessageLevel
+from mentat.code_feature import CodeFeature
 from mentat.code_file_manager import CodeFileManager
 from mentat.config import Config
-from mentat.interval import Interval
-from mentat.llm_api import CostTracker, count_tokens, model_context_size, setup_api_key
+from mentat.cost_tracker import CostTracker
+from mentat.llm_api_handler import count_tokens, model_context_size
 from mentat.sampler.utils import clone_repo
 from mentat.session_context import SESSION_CONTEXT, SessionContext
 
-pytestmark = pytest.mark.benchmark
-
 
 class MockStream:
     def send(self, message, **kwargs):
@@ -29,7 +29,7 @@ def send(self, message, **kwargs):
 def _load_benchmarks() -> dict[str, dict[str, Any]]:
     """Load all benchmarks found in benchmark_repos"""
     benchmarks = {}
-    benchmarks_dir = Path(__file__).parent / "../../benchmark_repos"
+    benchmarks_dir = Path(__file__).parent / "../benchmark_repos"
     for repo_dir in benchmarks_dir.iterdir():
         benchmarks_path = repo_dir / "benchmarks.json"
         if benchmarks_path.exists():
@@ -46,18 +46,9 @@ def _convert_features_to_line_sets(
     for feature in features:
         # Non-explicit features (e.g. CodeMaps) are considered false positives.
         # Using negative numbers here as that affect.
-        if feature.level not in (CodeMessageLevel.CODE, CodeMessageLevel.INTERVAL):
-            n_lines = len(feature.get_code_message())
-            lines[feature.path].update(range(-1, -n_lines - 1, -1))
-            continue
-        # Otherwise match specific lines
         path = feature.path.relative_to(git_root)
-        if feature.level == CodeMessageLevel.INTERVAL:
-            interval = feature.interval
-        else:
-            n_lines = len(feature.get_code_message())
-            interval = Interval(1, n_lines + 1)
+        interval = feature.interval
         lines[path].update(range(interval.start, interval.end + 1))
     return lines
 
 
@@ -129,15 +120,13 @@ async def select_features_for_benchmark(
     return {"features": selected_features, "score": selector_performance}
 
 
-@pytest.mark.asyncio
 async def test_code_context_performance(benchmarks, max_benchmarks=10):
     """Run a set of benchmarks and evaluate performance
 
     Run standalone:
-        `pytest -s tests/benchmarks/context_benchmark.py --benchmark`
+        `./benchmarks/context_benchmark.py`
     """
     # Load applicable benchmarks
-    setup_api_key()
    all_benchmarks = _load_benchmarks()
     if len(benchmarks) > 0:
         benchmarks_to_run = {k: v for k, v in all_benchmarks.items() if k in benchmarks}
@@ -203,3 +192,14 @@ async def test_code_context_performance(benchmarks, max_benchmarks=10):
             print(f"Error: '{e}'; skipping")
 
     return scores
+
+
+if __name__ == "__main__":
+    parser = common_benchmark_parser()
+    args = parser.parse_args()
+    asyncio.run(
+        test_code_context_performance(
+            args.benchmarks,
+            args.max_benchmarks,
+        )
+    )
diff --git a/tests/benchmarks/edit_rubric_benchmark.py b/benchmarks/edit_rubric_benchmark.py
old mode 100644
new mode 100755
similarity index 91%
rename from tests/benchmarks/edit_rubric_benchmark.py
rename to benchmarks/edit_rubric_benchmark.py
index 3f37aae31..c283f7ec7
--- a/tests/benchmarks/edit_rubric_benchmark.py
+++ b/benchmarks/edit_rubric_benchmark.py
@@ -1,3 +1,5 @@
+#!/usr/bin/env python
+import asyncio
 import json
 import os
 import subprocess
@@ -5,25 +7,13 @@
 from pathlib import Path
 from textwrap import dedent
 
-import pytest
 from git import Repo
 from openai import OpenAI
 
+from benchmarks.arg_parser import common_benchmark_parser
 from mentat.python_client.client import PythonClient
 from mentat.sampler.utils import clone_repo
 
-pytestmark = pytest.mark.benchmark
-
-
-@pytest.fixture
-def evaluate_baseline(request):
-    return bool(request.config.getoption("--evaluate_baseline"))
-
-
-@pytest.fixture
-def repo(request):
-    return request.config.getoption("--repo")
-
 
 def load_tests(benchmarks_dir):
     tests = {}
@@ -78,7 +68,6 @@ def evaluate_diff(diff: str) -> dict[str, int]:
     return json.loads(message)
 
 
-@pytest.mark.asyncio
 async def test_edit_quality(
     benchmarks, max_benchmarks, evaluate_baseline, repo, refresh_repo
 ):
@@ -152,3 +141,17 @@ async def test_edit_quality(
         repo.git.clean("-fd")
         repo.git.checkout(start_commit)
         await client.shutdown()
+
+
+if __name__ == "__main__":
+    parser = common_benchmark_parser()
+    args = parser.parse_args()
+    asyncio.run(
+        test_edit_quality(
+            args.benchmarks,
+            args.max_benchmarks,
+            args.evaluate_baseline,
+            args.repo,
+            args.refresh_repo,
+        )
+    )
diff --git a/tests/benchmarks/exercise_runners/__init__.py b/benchmarks/exercise_runners/__init__.py
similarity index 100%
rename from tests/benchmarks/exercise_runners/__init__.py
rename to benchmarks/exercise_runners/__init__.py
diff --git a/tests/benchmarks/exercise_runners/abstract_exercise_runner.py b/benchmarks/exercise_runners/abstract_exercise_runner.py
similarity index 97%
rename from tests/benchmarks/exercise_runners/abstract_exercise_runner.py
rename to benchmarks/exercise_runners/abstract_exercise_runner.py
index c6ea1f785..3c1ebc1ac 100644
--- a/tests/benchmarks/exercise_runners/abstract_exercise_runner.py
+++ b/benchmarks/exercise_runners/abstract_exercise_runner.py
@@ -2,7 +2,7 @@
 import subprocess
 from pathlib import Path
 
-from tests.benchmarks.benchmark_result import BenchmarkResult
+from benchmarks.benchmark_result import BenchmarkResult
 
 
 class AbstractExerciseRunner:
diff --git a/tests/benchmarks/exercise_runners/clojure_exercise_runner.py b/benchmarks/exercise_runners/clojure_exercise_runner.py
similarity index 86%
rename from tests/benchmarks/exercise_runners/clojure_exercise_runner.py
rename to benchmarks/exercise_runners/clojure_exercise_runner.py
index c645aef68..80a994dd0 100644
--- a/tests/benchmarks/exercise_runners/clojure_exercise_runner.py
+++ b/benchmarks/exercise_runners/clojure_exercise_runner.py
@@ -1,8 +1,6 @@
 from pathlib import Path
 
-from tests.benchmarks.exercise_runners.abstract_exercise_runner import (
-    AbstractExerciseRunner,
-)
+from benchmarks.exercise_runners.abstract_exercise_runner import AbstractExerciseRunner
 
 
 class ClojureExerciseRunner(AbstractExerciseRunner):
diff --git a/tests/benchmarks/exercise_runners/exercise_runner_factory.py b/benchmarks/exercise_runners/exercise_runner_factory.py
similarity index 53%
rename from tests/benchmarks/exercise_runners/exercise_runner_factory.py
rename to benchmarks/exercise_runners/exercise_runner_factory.py
index f72b9aeaf..59afa2a67 100644
--- a/tests/benchmarks/exercise_runners/exercise_runner_factory.py
+++ b/benchmarks/exercise_runners/exercise_runner_factory.py
@@ -1,12 +1,8 @@
-from tests.benchmarks.exercise_runners.clojure_exercise_runner import (
-    ClojureExerciseRunner,
-)
-from tests.benchmarks.exercise_runners.javascript_exercise_runner import (
+from benchmarks.exercise_runners.clojure_exercise_runner import ClojureExerciseRunner
+from benchmarks.exercise_runners.javascript_exercise_runner import (
     JavascriptExerciseRunner,
 )
-from tests.benchmarks.exercise_runners.python_exercise_runner import (
-    PythonExerciseRunner,
-)
+from benchmarks.exercise_runners.python_exercise_runner import PythonExerciseRunner
 
 
 class ExerciseRunnerFactory:
diff --git a/tests/benchmarks/exercise_runners/javascript_exercise_runner.py b/benchmarks/exercise_runners/javascript_exercise_runner.py
similarity index 89%
rename from tests/benchmarks/exercise_runners/javascript_exercise_runner.py
rename to benchmarks/exercise_runners/javascript_exercise_runner.py
index aa32fcc20..5dec86a01 100644
--- a/tests/benchmarks/exercise_runners/javascript_exercise_runner.py
+++ b/benchmarks/exercise_runners/javascript_exercise_runner.py
@@ -1,9 +1,7 @@
 import os
 import subprocess
 
-from tests.benchmarks.exercise_runners.abstract_exercise_runner import (
-    AbstractExerciseRunner,
-)
+from benchmarks.exercise_runners.abstract_exercise_runner import AbstractExerciseRunner
 
 
 class JavascriptExerciseRunner(AbstractExerciseRunner):
diff --git a/tests/benchmarks/exercise_runners/python_exercise_runner.py b/benchmarks/exercise_runners/python_exercise_runner.py
similarity index 85%
rename from tests/benchmarks/exercise_runners/python_exercise_runner.py
rename to benchmarks/exercise_runners/python_exercise_runner.py
index 8c2ce958c..742458823 100644
--- a/tests/benchmarks/exercise_runners/python_exercise_runner.py
+++ b/benchmarks/exercise_runners/python_exercise_runner.py
@@ -1,8 +1,6 @@
 from pathlib import Path
 
-from tests.benchmarks.exercise_runners.abstract_exercise_runner import (
-    AbstractExerciseRunner,
-)
+from benchmarks.exercise_runners.abstract_exercise_runner import AbstractExerciseRunner
 
 
 class PythonExerciseRunner(AbstractExerciseRunner):
diff --git a/tests/benchmarks/exercism_practice.py b/benchmarks/exercism_practice.py
old mode 100644
new mode 100755
similarity index 92%
rename from tests/benchmarks/exercism_practice.py
rename to benchmarks/exercism_practice.py
index e3df91e9e..5a0407b1f
--- a/tests/benchmarks/exercism_practice.py
+++ b/benchmarks/exercism_practice.py
@@ -1,48 +1,29 @@
+#!/usr/bin/env python
 import asyncio
 import os
 from functools import partial
 from multiprocessing import Pool
 from pathlib import Path
 
-import pytest
 import tqdm
 from openai import BadRequestError
 
+from benchmarks.arg_parser import common_benchmark_parser
+from benchmarks.benchmark_result import BenchmarkResult
+from benchmarks.benchmark_result_summary import BenchmarkResultSummary
+from benchmarks.exercise_runners.exercise_runner_factory import ExerciseRunnerFactory
 from mentat.config import Config
 from mentat.python_client.client import PythonClient
 from mentat.sampler.utils import clone_repo
 from mentat.session_context import SESSION_CONTEXT
-from tests.benchmarks.benchmark_result import BenchmarkResult
-from tests.benchmarks.benchmark_result_summary import BenchmarkResultSummary
-from tests.benchmarks.exercise_runners.exercise_runner_factory import (
-    ExerciseRunnerFactory,
-)
-
-pytestmark = pytest.mark.benchmark
 
 
-@pytest.fixture
 def clone_exercism_repo(refresh_repo, language):
     exercism_url = f"https://github.com/exercism/{language}.git"
     local_dir = clone_repo(exercism_url, f"exercism-{language}", refresh_repo)
     os.chdir(local_dir)
 
 
-@pytest.fixture
-def max_iterations(request):
-    return int(request.config.getoption("--max_iterations"))
-
-
-@pytest.fixture
-def max_workers(request):
-    return int(request.config.getoption("--max_workers"))
-
-
-@pytest.fixture
-def language(request):
-    return request.config.getoption("--language")
-
-
 prompt = (
     "You are a professional code reviewer who helps other coders improve their skills."
" You recently assigned a coder a small coding test to assess their level, with a" @@ -200,8 +181,7 @@ def tqdm_summary(results): return "Passed: " + str(passed_in_n)[1:-1] + "| Failed: " + str(failed) -def test_practice_directory_performance( - clone_exercism_repo, +def run_exercism_benchmark( benchmarks, max_benchmarks, max_iterations, @@ -237,3 +217,16 @@ def test_practice_directory_performance( with open("results.json", "w") as f: f.write(summary.to_json()) summary.render_results() + + +if __name__ == "__main__": + parser = common_benchmark_parser() + args = parser.parse_args() + clone_exercism_repo(args.refresh_repo, args.language) + run_exercism_benchmark( + args.benchmarks, + args.max_benchmarks, + args.max_iterations, + args.max_workers, + args.language, + ) diff --git a/scripts/evolve_llm_feature_selector.py b/scripts/evolve_llm_feature_selector.py index b8a139c13..24adf9a05 100644 --- a/scripts/evolve_llm_feature_selector.py +++ b/scripts/evolve_llm_feature_selector.py @@ -1,3 +1,4 @@ +# ruff: noqa: E501 import argparse import asyncio import json @@ -8,9 +9,9 @@ from openai import AsyncOpenAI +from benchmarks.context_benchmark import test_code_context_performance from mentat.errors import ModelError from mentat.prompts.prompts import read_prompt -from tests.benchmarks.context_benchmark import test_code_context_performance prompts_dir = Path(__file__).parent.parent / "mentat/resources/prompts" @@ -59,7 +60,7 @@ async def generate_variations( 3. To also identify relevant context to the query, such as the type-definitions of variables which will be edited, or functions which would be directly affected by the edits. \ 4. To NOT select irrelevant files or lines of code. \ 5. It's critical respond to this with a JSON-parsable list of strings (one for each prompt). \ - """).format(population=population) # ruff: noqa: E501 + """).format(population=population) scores = [(prompt, recall_weighted_mean(scores[prompt])) for prompt in scores] top_scores = sorted(scores, key=lambda x: x[1], reverse=True)[:population] messages = [ diff --git a/scripts/git_log_to_transcripts.py b/scripts/git_log_to_transcripts.py index cdc7ea2a2..f3cf5e1c9 100755 --- a/scripts/git_log_to_transcripts.py +++ b/scripts/git_log_to_transcripts.py @@ -11,6 +11,7 @@ from git import Repo from openai import OpenAI +from benchmarks.context_benchmark import MockStream, select_features_for_benchmark from mentat.code_context import CodeContext from mentat.code_file_manager import CodeFileManager from mentat.config import Config @@ -18,7 +19,6 @@ from mentat.parsers.git_parser import GitParser from mentat.sampler.utils import clone_repo from mentat.session_context import SESSION_CONTEXT, SessionContext -from tests.benchmarks.context_benchmark import MockStream, select_features_for_benchmark system_prompt = dedent("""\ You are part of an automated system for making synthetic data. 
diff --git a/scripts/run_and_upload_benchmarks.sh b/scripts/run_and_upload_benchmarks.sh
index 6a7d86aee..314ecb498 100755
--- a/scripts/run_and_upload_benchmarks.sh
+++ b/scripts/run_and_upload_benchmarks.sh
@@ -5,7 +5,7 @@ TIMESTAMP=$(date +%Y%m%d%H%M%S)
 #####################
 # JAVASCRIPT EXERCISM
 #####################
-pytest -s tests/benchmarks/exercism_practice.py \
+./benchmarks/exercism_practice.py \
     --max_iterations 2 \
     --max_workers 1 \
     --max_benchmarks 200 \
@@ -26,7 +26,7 @@ curl -X POST -H "Content-Type: application/json" -d "{\"benchmark_report\": \"${
 #################
 # PYTHON EXERCISM
 #################
-pytest -s tests/benchmarks/exercism_practice.py \
+./benchmarks/exercism_practice.py \
     --max_iterations 2 \
     --max_workers 1 \
     --max_benchmarks 200 \
@@ -47,7 +47,7 @@ curl -X POST -H "Content-Type: application/json" -d "{\"benchmark_report\": \"${
 #######################
 # REAL WORLD BENCHMARKS
 #######################
-pytest tests/benchmarks/benchmark_runner.py --benchmark -s --retries 2
+./benchmarks/benchmark_runner.py --retries 2
 SUMMARY=$(jq '.summary_string' results.json)
 
 # Upload results to S3
diff --git a/scripts/sampler/__main__.py b/scripts/sampler/__main__.py
index 7e7f7ab65..04fd9c082 100644
--- a/scripts/sampler/__main__.py
+++ b/scripts/sampler/__main__.py
@@ -13,13 +13,13 @@
 from run import run_sample
 from validate import validate_sample
 
-from mentat.sampler.sample import Sample
-from mentat.utils import mentat_dir_path
-from tests.benchmarks.benchmark_runner import (
+from benchmarks.benchmark_runner import (
     compare_diffs,
     grade_diff_syntax,
     grade_model_response,
 )
+from mentat.sampler.sample import Sample
+from mentat.utils import mentat_dir_path
 
 
 def warn(msg: Any):
diff --git a/setup.py b/setup.py
index b397fb811..a4ca409f1 100644
--- a/setup.py
+++ b/setup.py
@@ -15,7 +15,7 @@
     name="mentat",
     version=__version__,
     python_requires=">=3.10",
-    packages=find_packages(include=["mentat", "mentat.*", "tests"]),
+    packages=find_packages(include=["mentat", "mentat.*", "benchmarks"]),
     install_requires=[
         str(r)
         for r in pkg_resources.parse_requirements(
diff --git a/tests/benchmarks/README.md b/tests/benchmarks/README.md
deleted file mode 100644
index 1535c2cc8..000000000
--- a/tests/benchmarks/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# Benchmarks
-
-In this directory we write benchmarks for Mentat's performance on different tasks.
-
-## Running Benchmarks
-
-Benchmarks are run with pytest e.g.
-```
-pytest -s tests/benchmarks/exercism_practice.py --benchmark
-```
-Note you need the `-s` to see the printed results and `--benchmark` is necessary for tests that actually call gpt.
-
-They should not start with `test_` or end with `_test.py` so they will not be automatically collected and ran by pytest.
-
-Flags that control the performance of the benchmarks are defined in [conftest](/conftest.py) and set conservatively so benchmarks without flags will run relatively quickly and cheaply. To run the exercism benchmark with multiple workers on all the tests with one retry for the clojure language run the following:
-```
-pytest -s tests/benchmarks/exercism_practice.py --benchmark --max_benchmarks 134 --max_iterations 2 --max_workers 2 --language clojure --benchmark
-```
-
-Warning: If you increase max_workers much higher you'll start to get rate limited.
diff --git a/tests/conftest.py b/tests/conftest.py
index 6b41b878a..bcd768a8c 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -47,69 +47,6 @@ def filter_mark(items, mark, exists):
 
 def pytest_addoption(parser):
     parser.addoption("--benchmark", action="store_true")
     parser.addoption("--uitest", action="store_true")
-    # The following flags are used by benchmark tests
-    parser.addoption(
-        "--max_benchmarks",
-        action="store",
-        default="1",
-        help="The maximum number of exercises to run",
-    )
-    parser.addoption(
-        "--retries",
-        action="store",
-        default="1",
-        help="Number of times to retry a benchmark",
-    )
-    parser.addoption(
-        "--max_iterations",
-        action="store",
-        default="1",
-        help="Number of times to rerun mentat with error messages",
-    )
-    parser.addoption(
-        "--language",
-        action="store",
-        default="python",
-        help="Which exercism language to do exercises for",
-    )
-    parser.addoption(
-        "--max_workers",
-        action="store",
-        default="1",
-        help="Number of workers to use for multiprocessing",
-    )
-    parser.addoption(
-        "--refresh_repo",
-        action="store_true",
-        default=False,
-        help="When set local changes will be discarded.",
-    )
-    parser.addoption(
-        "--benchmarks",
-        action="append",
-        nargs="*",
-        default=[],
-        help=(
-            "Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
-            " depends on benchmark."
-        ),
-    )
-    parser.addoption(
-        "--repo",
-        action="store",
-        default="mentat",
-        help="For benchmarks that are evaluated against a repo",
-    )
-    parser.addoption(
-        "--evaluate_baseline",
-        action="store_true",
-        help="Evaluate the baseline for the benchmark",
-    )
-
-
-@pytest.fixture
-def refresh_repo(request):
-    return request.config.getoption("--refresh_repo")
 
 
 @pytest.fixture
@@ -120,11 +57,6 @@ def benchmarks(request):
     return benchmarks
 
 
-@pytest.fixture
-def max_benchmarks(request):
-    return int(request.config.getoption("--max_benchmarks"))
-
-
 def pytest_configure(config):
     config.addinivalue_line("markers", "benchmark: run benchmarks that call openai")
     config.addinivalue_line(
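Usage note on the new entry points: every converted runner in this diff follows the same pattern of building `common_benchmark_parser()`, parsing flags, and dispatching to an async function under `if __name__ == "__main__"`. A new standalone benchmark script would hook in the same way. The sketch below mirrors the `__main__` block of `benchmark_runner.py` above; the module name `my_benchmark.py` and the `run_my_benchmark` coroutine are hypothetical and not part of this diff.

```python
#!/usr/bin/env python
# Hypothetical benchmarks/my_benchmark.py, mirroring benchmark_runner.py above.
import asyncio

from benchmarks.arg_parser import common_benchmark_parser


async def run_my_benchmark(benchmarks, max_benchmarks):
    # Placeholder body: a real benchmark would drive Mentat's PythonClient here.
    print(f"Would run {benchmarks or 'all benchmarks'} (max {max_benchmarks})")


if __name__ == "__main__":
    args = common_benchmark_parser().parse_args()
    # args.benchmarks is a list of lists because the flag uses action="append"
    # with nargs="*"; the existing runners read the first group.
    asyncio.run(run_my_benchmark(args.benchmarks[0], args.max_benchmarks))
```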