This repository has been archived by the owner on Jan 7, 2025. It is now read-only.

Benchmark runner test #488

Merged (6 commits) on Jan 18, 2024
3 changes: 1 addition & 2 deletions benchmarks/arg_parser.py
@@ -17,9 +17,8 @@ def common_benchmark_parser():
)
parser.add_argument(
"--benchmarks",
action="append",
nargs="*",
default=[[]],
default=[],
help=(
"Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
" depends on benchmark."
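A note on what this parser change does (a minimal sketch, not code from the PR): with the old action="append" plus default=[[]] combination, parsed values ended up nested inside an outer list, which is why call sites indexed args.benchmarks[0]; with plain nargs="*" and default=[], args.benchmarks is a single flat list. The benchmark names below are taken from files touched in this PR.

import argparse

parser = argparse.ArgumentParser()
# New form from this PR: one optional flag collecting zero or more benchmark names.
parser.add_argument("--benchmarks", nargs="*", default=[])

args = parser.parse_args(["--benchmarks", "license_update", "pre_tags"])
print(args.benchmarks)  # ['license_update', 'pre_tags'] -- flat list, no [0] indexing

args = parser.parse_args([])
print(args.benchmarks)  # [] -- nothing selected, so max_benchmarks applies

This is why the call sites in benchmark_runner.py and exercism_practice.py below switch from args.benchmarks[0] to args.benchmarks.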
101 changes: 56 additions & 45 deletions benchmarks/benchmark_runner.py
@@ -177,6 +177,7 @@ async def evaluate_sample(sample_file, retries=1):
"""Run a sample using Mentat and return the resulting diff"""
sample = Sample.load(sample_file)
results = []
start_dir = Path.cwd()
for i in range(retries):
formatted_title = re.sub(r"[ '\"/\\-^]", "", sample.title).replace(" ", "_")
result = BenchmarkResult(
@@ -189,20 +189,23 @@
diff_merge_base=sample.diff_merge_base,
diff_active=sample.diff_active,
)
cwd = Path(repo.working_dir)

# Run sample in PythonClient
paths = list[Path]()
for a in sample.context:
paths.append(Path(a))
client = PythonClient(cwd=cwd, paths=paths)
response = await run_client(
client, sample.message_prompt, result, sample.message_history
)
await grade_and_clean_diff(
repo, response, result, comparison_diff=sample.diff_edit
)
results.append(result)
try:
cwd = Path(repo.working_dir)

# Run sample in PythonClient
paths = list[Path]()
for a in sample.context:
paths.append(Path(a))
client = PythonClient(cwd=cwd, paths=paths)
response = await run_client(
client, sample.message_prompt, result, sample.message_history
)
await grade_and_clean_diff(
repo, response, result, comparison_diff=sample.diff_edit
)
results.append(result)
finally:
os.chdir(start_dir)
return results


@@ -212,36 +216,43 @@ async def evalute_py(path, retries):
title = benchmark.title

print("Benchmark:", title)
repo = setup_repo(
url=benchmark.repo,
commit=benchmark.commit,
)
cwd = Path(repo.working_dir)

if hasattr(benchmark, "comparison_commit"):
comparison_commit = benchmark.comparison_commit
repo.git.checkout(comparison_commit)
comparison_diff = repo.git.diff(benchmark.commit)
else:
comparison_diff = None

for i, prompt in enumerate(benchmark.prompts):
print(" Prompt:", prompt)
for j in range(1, retries + 1):
formatted_title = re.sub(r"[ '\"/\\-^]", "", title).replace(" ", "_")
result = BenchmarkResult(
name=f"{formatted_title}-{i}-{j}",
family=formatted_title,
)
client = PythonClient(cwd=cwd, config=benchmark.config)
response = await run_client(client, prompt, result)
start_dir = Path.cwd()
try:
repo = setup_repo(
url=benchmark.repo,
commit=benchmark.commit,
)
cwd = Path(repo.working_dir)

await client.shutdown()
if hasattr(benchmark, "verify"):
result.verify = benchmark.verify()
if hasattr(benchmark, "comparison_commit"):
comparison_commit = benchmark.comparison_commit
repo.git.checkout(comparison_commit)
comparison_diff = repo.git.diff(benchmark.commit)
else:
comparison_diff = None

for i, prompt in enumerate(benchmark.prompts):
print(" Prompt:", prompt)
for j in range(1, retries + 1):
formatted_title = re.sub(r"[ '\"/\\-^]", "", title).replace(" ", "_")
result = BenchmarkResult(
name=f"{formatted_title}-{i}-{j}",
family=formatted_title,
)
client = PythonClient(
cwd=cwd, paths=benchmark.paths, config=benchmark.config
)
response = await run_client(client, prompt, result)

await grade_and_clean_diff(repo, response, result, comparison_diff)
results.append(result)
await client.shutdown()
if hasattr(benchmark, "verify"):
result.verify = benchmark.verify()

await grade_and_clean_diff(repo, response, result, comparison_diff)
os.chdir("../..")
results.append(result)
finally:
os.chdir(start_dir)
return results
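Both hunks above make the same fix: rather than climbing back out of the cloned repo with a relative os.chdir("../..") at the end of a successful run, the runner records its starting directory up front and restores it in a finally block, so the working directory is correct even when setup, the client, or grading raises. A minimal sketch of the pattern (illustrative names; in the real code the change of directory into the clone presumably happens inside setup_repo or the client):

import os
from pathlib import Path

def run_one_benchmark(repo_dir: Path):
    start_dir = Path.cwd()   # remember where the runner was launched from
    try:
        os.chdir(repo_dir)   # benchmark work happens inside the cloned repo
        ...                  # run the client, grade the resulting diff, etc.
    finally:
        os.chdir(start_dir)  # restored even if anything in the try body raises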


@@ -252,9 +263,9 @@ def benchmark_listed(title, benchmarks):
return False


async def run_benchmarks(retries, benchmarks):
async def run_benchmarks(benchmarks, retries=1):
print("Running benchmarks")
benchmarks_dir = f"{os.path.dirname(__file__)}/benchmarks"
benchmarks_dir = Path("benchmarks/benchmarks")

benchmark_paths = []
for root, dirs, files in os.walk(benchmarks_dir):
@@ -296,7 +307,7 @@ async def run_benchmarks(retries, benchmarks):
args = parser.parse_args()
asyncio.run(
run_benchmarks(
args.benchmarks,
args.retries,
args.benchmarks[0],
)
)
1 change: 1 addition & 0 deletions benchmarks/benchmarks/mentat/clojure_exercism_runner.py
@@ -14,6 +14,7 @@
repo = "https://github.com/AbanteAI/mentat"
commit = "d611e2ff742856c7328d54f6e71c2418f9c5508b"
minimum_context = ["tests/benchmarks/exercise_runners"]
paths = []

config = Config(
auto_context_tokens=8000,
1 change: 1 addition & 0 deletions benchmarks/benchmarks/mentat/license_update.py
@@ -22,6 +22,7 @@
repo = "https://github.com/AbanteAI/mentat"
commit = "b0848711c36e0c2fe9619ebb2b77dc6d27396ff2"
minimum_context = ["tests/license_check.py:11-22"]
paths = []

config = Config(
auto_context_tokens=8000,
1 change: 1 addition & 0 deletions benchmarks/benchmarks/mentat/pre_tags.py
@@ -22,6 +22,7 @@

repo = "https://github.com/AbanteAI/mentat"
commit = "b8d90b89e4a0d7ad266bf914c4ce99c473dd8dc0"
paths = []

config = Config(
auto_context_tokens=8000,
2 changes: 1 addition & 1 deletion benchmarks/exercism_practice.py
@@ -225,7 +225,7 @@ def run_exercism_benchmark(
args = parser.parse_args()
clone_exercism_repo(args.refresh_repo, args.language)
run_exercism_benchmark(
args.benchmarks[0],
args.benchmarks,
args.max_benchmarks,
args.max_iterations,
args.max_workers,
2 changes: 1 addition & 1 deletion mentat/sampler/utils.py
@@ -9,7 +9,7 @@
from mentat.git_handler import get_non_gitignored_files
from mentat.utils import is_file_text_encoded

CLONE_TO_DIR = Path("benchmark_repos")
CLONE_TO_DIR = Path("benchmarks/benchmark_repos")


def clone_repo(
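The CLONE_TO_DIR change moves cloned benchmark repos from benchmark_repos/ to benchmarks/benchmark_repos/, which is what drives the path updates in pyproject.toml, pyrightconfig.json, and the upload script below. The clone_repo body is collapsed in this diff, so the helper here is only an illustrative sketch of how such a constant is typically used, not the repo's implementation:

import subprocess
from pathlib import Path

CLONE_TO_DIR = Path("benchmarks/benchmark_repos")

def clone_repo_sketch(url: str, name: str) -> Path:
    # Hypothetical helper: clone into benchmarks/benchmark_repos/<name> if absent.
    dest = CLONE_TO_DIR / name
    if not dest.exists():
        CLONE_TO_DIR.mkdir(parents=True, exist_ok=True)
        subprocess.run(["git", "clone", url, str(dest)], check=True)
    return dest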
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,14 +1,14 @@
[tool.isort]
profile = "black"
known_first_party = "mentat"
skip = ["vscode/bundled", "benchmark_repos", "testbed/exercism-python"]
skip = ["vscode/bundled", "benchmarks/benchmark_repos", "testbed/exercism-python"]

[tool.ruff]
line-length = 120
ignore = ["E731"]

[tool.pytest.ini_options]
addopts = "--ignore=vscode/bundled --ignore=benchmark_repos --ignore=testbed/exercism-python"
addopts = "--ignore=vscode/bundled --ignore=benchmarks/benchmark_repos --ignore=testbed/exercism-python"
Review comment (Member):
Should also update pyrightconfig.json, change benchmark_repos to benchmarks.


[tool.black]
preview = "true"
9 changes: 8 additions & 1 deletion pyrightconfig.json
@@ -1,6 +1,13 @@
{
"include": ["mentat"],
"ignore": ["testbed", "tests", "scripts", "benchmark_repos", "build"],
"ignore": [
"testbed",
"tests",
"scripts",
"benchmark_repos",
"build",
"benchmarks/benchmark_repos",
],
"typeCheckingMode": "strict",
"reportMissingTypeStubs": false,
}
12 changes: 6 additions & 6 deletions scripts/run_and_upload_benchmarks.sh
@@ -11,12 +11,12 @@ TIMESTAMP=$(date +%Y%m%d%H%M%S)
--max_benchmarks 200 \
--language javascript

SUMMARY=$(jq '.summary_string' benchmark_repos/exercism-javascript/results.json)
SUMMARY=$(jq '.summary_string' benchmarks/benchmark_repos/exercism-javascript/results.json)
BUCKET="benchmarks.mentat.ai"

# Upload results to S3
aws s3 cp benchmark_repos/exercism-javascript/results.html s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html
aws s3 cp benchmark_repos/exercism-javascript/results.json s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.json
aws s3 cp benchmarks/benchmark_repos/exercism-javascript/results.html s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html
aws s3 cp benchmarks/benchmark_repos/exercism-javascript/results.json s3://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.json

# Send slack notification
JAVASCRIPT_RESULTS_URL="http://${BUCKET}/exercism-javascript-results-${TIMESTAMP}.html"
@@ -32,11 +32,11 @@ curl -X POST -H "Content-Type: application/json" -d "{\"benchmark_report\": \"${
--max_benchmarks 200 \
--language python

SUMMARY=$(jq '.summary_string' benchmark_repos/exercism-python/results.json)
SUMMARY=$(jq '.summary_string' benchmarks/benchmark_repos/exercism-python/results.json)

# Upload results to S3
aws s3 cp benchmark_repos/exercism-python/results.html s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.html
aws s3 cp benchmark_repos/exercism-python/results.json s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.json
aws s3 cp benchmarks/benchmark_repos/exercism-python/results.html s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.html
aws s3 cp benchmarks/benchmark_repos/exercism-python/results.json s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.json

# Send slack notification
PYTHON_RESULTS_URL="http://${BUCKET}/exercism-python-results-${TIMESTAMP}.html"
19 changes: 19 additions & 0 deletions testbed/benchmarks/benchmarks/clojure_exercism_runner.py
@@ -0,0 +1,19 @@
from mentat.config import Config

title = "Clojure Exercism Runner"

description = """
This benchmark tests the ability to write an exercism test runner for the clojure language.
"""

prompts = [
"Write a test runner for the clojure language.",
]


repo = "https://github.com/AbanteAI/mentat"
commit = "d611e2ff742856c7328d54f6e71c2418f9c5508b"
minimum_context = ["tests/benchmarks/exercise_runners"]
paths = ["tests/benchmarks/exercise_runners"]

config = Config()
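This new file is a benchmark definition of the kind evalute_py iterates over: per the runner diff above, it reads title, repo, commit, prompts, paths, and config, plus optional comparison_commit and verify. How the modules are discovered and loaded is not shown in this diff, so the loader below is an assumption for illustration only, and it requires mentat to be importable since the benchmark file imports Config:

import importlib.util

# Hypothetical loader sketch; the runner's actual discovery/loading code is not
# shown in this diff.
spec = importlib.util.spec_from_file_location(
    "clojure_exercism_runner",
    "testbed/benchmarks/benchmarks/clojure_exercism_runner.py",
)
benchmark = importlib.util.module_from_spec(spec)
spec.loader.exec_module(benchmark)

print(benchmark.title)                   # printed and used to name BenchmarkResults
print(benchmark.repo, benchmark.commit)  # passed to setup_repo()
print(benchmark.prompts)                 # each prompt is run `retries` times
print(benchmark.paths)                   # forwarded to PythonClient with benchmark.config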
@@ -9,31 +9,33 @@ def test_empty_sequence(self):

def test_pow(self):
Review comment (Member Author):
We can't let the actual accumulate get formatted because it could make the saved GPT output invalid, but I'm inclined to let our linter change the test.

self.assertEqual(
accumulate([1, 2, 3, 4, 5], lambda x: x * x), [1, 4, 9, 16, 25])
accumulate([1, 2, 3, 4, 5], lambda x: x * x), [1, 4, 9, 16, 25]
)

def test_divmod(self):
self.assertEqual(
accumulate([10, 17, 23], lambda x: divmod(x, 7)),
[(1, 3), (2, 3), (3, 2)])
accumulate([10, 17, 23], lambda x: divmod(x, 7)), [(1, 3), (2, 3), (3, 2)]
)

def test_composition(self):
inp = [10, 17, 23]
self.assertEqual(
accumulate(
accumulate(inp, lambda x: divmod(x, 7)),
lambda x: 7 * x[0] + x[1]), inp)
accumulate(inp, lambda x: divmod(x, 7)), lambda x: 7 * x[0] + x[1]
),
inp,
)

def test_capitalize(self):
self.assertEqual(
accumulate(['hello', 'world'], str.upper), ['HELLO', 'WORLD'])
self.assertEqual(accumulate(["hello", "world"], str.upper), ["HELLO", "WORLD"])

def test_recursive(self):
inp = ['a', 'b', 'c']
out = [['a1', 'a2', 'a3'], ['b1', 'b2', 'b3'], ['c1', 'c2', 'c3']]
inp = ["a", "b", "c"]
out = [["a1", "a2", "a3"], ["b1", "b2", "b3"], ["c1", "c2", "c3"]]
self.assertEqual(
accumulate(
inp, lambda x: accumulate(list('123'), lambda y: x + y)), out)
accumulate(inp, lambda x: accumulate(list("123"), lambda y: x + y)), out
)


if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()