Benchmark runner test #488

Merged 6 commits on Jan 18, 2024
Changes from 3 commits
3 changes: 1 addition & 2 deletions benchmarks/arg_parser.py
@@ -17,9 +17,8 @@ def common_benchmark_parser():
)
parser.add_argument(
"--benchmarks",
action="append",
nargs="*",
default=[[]],
default=[],
help=(
"Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
" depends on benchmark."
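For context on the arg_parser change: with nargs="*" the flag now collects every value that follows it into a single flat list, whereas the previous action="append" + nargs="*" combination produced a nested list that callers had to unwrap via args.benchmarks[0]. A minimal sketch of the new behavior (the parser construction below is illustrative, not the project's helper):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--benchmarks", nargs="*", default=[])

    # All values after the flag land in one flat list:
    args = parser.parse_args(["--benchmarks", "license_update", "pre_tags"])
    print(args.benchmarks)  # ['license_update', 'pre_tags']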
101 changes: 56 additions & 45 deletions benchmarks/benchmark_runner.py
@@ -177,6 +177,7 @@ async def evaluate_sample(sample_file, retries=1):
"""Run a sample using Mentat and return the resulting diff"""
sample = Sample.load(sample_file)
results = []
start_dir = Path.cwd()
for i in range(retries):
formatted_title = re.sub(r"[ '\"/\\-^]", "", sample.title).replace(" ", "_")
result = BenchmarkResult(
@@ -189,20 +190,23 @@
diff_merge_base=sample.diff_merge_base,
diff_active=sample.diff_active,
)
cwd = Path(repo.working_dir)

# Run sample in PythonClient
paths = list[Path]()
for a in sample.context:
paths.append(Path(a))
client = PythonClient(cwd=cwd, paths=paths)
response = await run_client(
client, sample.message_prompt, result, sample.message_history
)
await grade_and_clean_diff(
repo, response, result, comparison_diff=sample.diff_edit
)
results.append(result)
try:
cwd = Path(repo.working_dir)

# Run sample in PythonClient
paths = list[Path]()
for a in sample.context:
paths.append(Path(a))
client = PythonClient(cwd=cwd, paths=paths)
response = await run_client(
client, sample.message_prompt, result, sample.message_history
)
await grade_and_clean_diff(
repo, response, result, comparison_diff=sample.diff_edit
)
results.append(result)
finally:
os.chdir(start_dir)
return results
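The new try/finally guarantees the process working directory is restored even when a sample raises, which is what the added test asserts afterwards (os.getcwd() unchanged once run_benchmarks returns). A standalone sketch of the same pattern, with illustrative names:

    import os
    from pathlib import Path

    def run_in_repo(repo_dir: Path) -> None:
        start_dir = Path.cwd()
        try:
            os.chdir(repo_dir)   # work that changes the process cwd
            ...                  # run the sample / benchmark here
        finally:
            os.chdir(start_dir)  # restored even if the body raises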


@@ -212,36 +216,43 @@ async def evalute_py(path, retries):
title = benchmark.title

print("Benchmark:", title)
repo = setup_repo(
url=benchmark.repo,
commit=benchmark.commit,
)
cwd = Path(repo.working_dir)

if hasattr(benchmark, "comparison_commit"):
comparison_commit = benchmark.comparison_commit
repo.git.checkout(comparison_commit)
comparison_diff = repo.git.diff(benchmark.commit)
else:
comparison_diff = None

for i, prompt in enumerate(benchmark.prompts):
print(" Prompt:", prompt)
for j in range(1, retries + 1):
formatted_title = re.sub(r"[ '\"/\\-^]", "", title).replace(" ", "_")
result = BenchmarkResult(
name=f"{formatted_title}-{i}-{j}",
family=formatted_title,
)
client = PythonClient(cwd=cwd, config=benchmark.config)
response = await run_client(client, prompt, result)
start_dir = Path.cwd()
try:
repo = setup_repo(
url=benchmark.repo,
commit=benchmark.commit,
)
cwd = Path(repo.working_dir)

await client.shutdown()
if hasattr(benchmark, "verify"):
result.verify = benchmark.verify()
if hasattr(benchmark, "comparison_commit"):
comparison_commit = benchmark.comparison_commit
repo.git.checkout(comparison_commit)
comparison_diff = repo.git.diff(benchmark.commit)
else:
comparison_diff = None

for i, prompt in enumerate(benchmark.prompts):
print(" Prompt:", prompt)
for j in range(1, retries + 1):
formatted_title = re.sub(r"[ '\"/\\-^]", "", title).replace(" ", "_")
result = BenchmarkResult(
name=f"{formatted_title}-{i}-{j}",
family=formatted_title,
)
client = PythonClient(
cwd=cwd, paths=benchmark.paths, config=benchmark.config
)
response = await run_client(client, prompt, result)

await grade_and_clean_diff(repo, response, result, comparison_diff)
results.append(result)
await client.shutdown()
if hasattr(benchmark, "verify"):
result.verify = benchmark.verify()

await grade_and_clean_diff(repo, response, result, comparison_diff)
os.chdir("../..")
results.append(result)
finally:
os.chdir(start_dir)
return results


@@ -252,9 +263,9 @@ def benchmark_listed(title, benchmarks):
return False


async def run_benchmarks(retries, benchmarks):
async def run_benchmarks(benchmarks, retries=1):
print("Running benchmarks")
benchmarks_dir = f"{os.path.dirname(__file__)}/benchmarks"
benchmarks_dir = Path("benchmarks/benchmarks")

benchmark_paths = []
for root, dirs, files in os.walk(benchmarks_dir):
@@ -296,7 +307,7 @@ async def run_benchmarks(retries, benchmarks):
args = parser.parse_args()
asyncio.run(
run_benchmarks(
args.benchmarks,
args.retries,
args.benchmarks[0],
)
)
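With the reordered signature, the benchmark list comes first and retries defaults to 1, so callers can pass args.benchmarks directly instead of unwrapping args.benchmarks[0]. A usage sketch (the benchmark title is the one used by the new test):

    import asyncio

    from benchmarks.benchmark_runner import run_benchmarks

    # Run one named benchmark once (retries defaults to 1) ...
    asyncio.run(run_benchmarks(["Clojure Exercism Runner"]))

    # ... or retry each prompt twice.
    asyncio.run(run_benchmarks(["Clojure Exercism Runner"], retries=2))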
1 change: 1 addition & 0 deletions benchmarks/benchmarks/mentat/clojure_exercism_runner.py
@@ -14,6 +14,7 @@
repo = "https://github.com/AbanteAI/mentat"
commit = "d611e2ff742856c7328d54f6e71c2418f9c5508b"
minimum_context = ["tests/benchmarks/exercise_runners"]
paths = []

config = Config(
auto_context_tokens=8000,
1 change: 1 addition & 0 deletions benchmarks/benchmarks/mentat/license_update.py
@@ -22,6 +22,7 @@
repo = "https://github.com/AbanteAI/mentat"
commit = "b0848711c36e0c2fe9619ebb2b77dc6d27396ff2"
minimum_context = ["tests/license_check.py:11-22"]
paths = []

config = Config(
auto_context_tokens=8000,
1 change: 1 addition & 0 deletions benchmarks/benchmarks/mentat/pre_tags.py
@@ -22,6 +22,7 @@

repo = "https://github.com/AbanteAI/mentat"
commit = "b8d90b89e4a0d7ad266bf914c4ce99c473dd8dc0"
paths = []

config = Config(
auto_context_tokens=8000,
3 changes: 2 additions & 1 deletion benchmarks/exercism_practice.py
@@ -224,8 +224,9 @@ def run_exercism_benchmark(
parser = common_benchmark_parser()
args = parser.parse_args()
clone_exercism_repo(args.refresh_repo, args.language)
print(args)
run_exercism_benchmark(
args.benchmarks[0],
args.benchmarks,
args.max_benchmarks,
args.max_iterations,
args.max_workers,
2 changes: 1 addition & 1 deletion mentat/sampler/utils.py
@@ -9,7 +9,7 @@
from mentat.git_handler import get_non_gitignored_files
from mentat.utils import is_file_text_encoded

CLONE_TO_DIR = Path("benchmark_repos")
CLONE_TO_DIR = Path("benchmarks/benchmark_repos")


def clone_repo(
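Because the runner now restores its starting directory (see the try/finally changes above), clone targets resolve consistently relative to the repository root and land under benchmarks/benchmark_repos, matching the updated ignore paths in pyproject.toml. A tiny illustration (the repo name matches the one used in the upload script):

    from pathlib import Path

    CLONE_TO_DIR = Path("benchmarks/benchmark_repos")

    clone_target = CLONE_TO_DIR / "exercism-python"
    print(clone_target)  # benchmarks/benchmark_repos/exercism-python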
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -1,14 +1,14 @@
[tool.isort]
profile = "black"
known_first_party = "mentat"
skip = ["vscode/bundled", "benchmark_repos", "testbed/exercism-python"]
skip = ["vscode/bundled", "benchmarks/benchmark_repos", "testbed/exercism-python"]

[tool.ruff]
line-length = 120
ignore = ["E731"]

[tool.pytest.ini_options]
addopts = "--ignore=vscode/bundled --ignore=benchmark_repos --ignore=testbed/exercism-python"
addopts = "--ignore=vscode/bundled --ignore=benchmarks/benchmark_repos --ignore=testbed/exercism-python"
Review comment (Member):
Should also update pyrightconfig.json, change benchmark_repos to benchmarks.


[tool.black]
preview = "true"
4 changes: 2 additions & 2 deletions scripts/run_and_upload_benchmarks.sh
@@ -11,7 +11,7 @@ TIMESTAMP=$(date +%Y%m%d%H%M%S)
--max_benchmarks 200 \
--language javascript

SUMMARY=$(jq '.summary_string' benchmark_repos/exercism-javascript/results.json)
SUMMARY=$(jq '.summary_string' benchmarks/benchmark_repos/exercism-javascript/results.json)
BUCKET="benchmarks.mentat.ai"

# Upload results to S3
@@ -32,7 +32,7 @@ curl -X POST -H "Content-Type: application/json" -d "{\"benchmark_report\": \"${
--max_benchmarks 200 \
--language python

SUMMARY=$(jq '.summary_string' benchmark_repos/exercism-python/results.json)
SUMMARY=$(jq '.summary_string' benchmarks/benchmark_repos/exercism-python/results.json)

# Upload results to S3
aws s3 cp benchmark_repos/exercism-python/results.html s3://${BUCKET}/exercism-python-results-${TIMESTAMP}.html
19 changes: 19 additions & 0 deletions testbed/benchmarks/benchmarks/clojure_exercism_runner.py
@@ -0,0 +1,19 @@
from mentat.config import Config

title = "Clojure Exercism Runner"

description = """
This benchmark tests the ability to write an exercism test runner for the clojure language.
"""

prompts = [
"Write a test runner for the clojure language.",
]


repo = "https://github.com/AbanteAI/mentat"
commit = "d611e2ff742856c7328d54f6e71c2418f9c5508b"
minimum_context = ["tests/benchmarks/exercise_runners"]
paths = ["tests/benchmarks/exercise_runners"]

config = Config()
@@ -9,31 +9,33 @@ def test_empty_sequence(self):

def test_pow(self):
Review comment (Member Author):
We can't let the actual accumulate get formatted because it could make the saved GPT output invalid, but I'm inclined to let our linter change the test.

self.assertEqual(
accumulate([1, 2, 3, 4, 5], lambda x: x * x), [1, 4, 9, 16, 25])
accumulate([1, 2, 3, 4, 5], lambda x: x * x), [1, 4, 9, 16, 25]
)

def test_divmod(self):
self.assertEqual(
accumulate([10, 17, 23], lambda x: divmod(x, 7)),
[(1, 3), (2, 3), (3, 2)])
accumulate([10, 17, 23], lambda x: divmod(x, 7)), [(1, 3), (2, 3), (3, 2)]
)

def test_composition(self):
inp = [10, 17, 23]
self.assertEqual(
accumulate(
accumulate(inp, lambda x: divmod(x, 7)),
lambda x: 7 * x[0] + x[1]), inp)
accumulate(inp, lambda x: divmod(x, 7)), lambda x: 7 * x[0] + x[1]
),
inp,
)

def test_capitalize(self):
self.assertEqual(
accumulate(['hello', 'world'], str.upper), ['HELLO', 'WORLD'])
self.assertEqual(accumulate(["hello", "world"], str.upper), ["HELLO", "WORLD"])

def test_recursive(self):
inp = ['a', 'b', 'c']
out = [['a1', 'a2', 'a3'], ['b1', 'b2', 'b3'], ['c1', 'c2', 'c3']]
inp = ["a", "b", "c"]
out = [["a1", "a2", "a3"], ["b1", "b2", "b3"], ["c1", "c2", "c3"]]
self.assertEqual(
accumulate(
inp, lambda x: accumulate(list('123'), lambda y: x + y)), out)
accumulate(inp, lambda x: accumulate(list("123"), lambda y: x + y)), out
)


if __name__ == '__main__':
if __name__ == "__main__":
unittest.main()
95 changes: 95 additions & 0 deletions tests/benchmarks/test_benchmark_runner.py
@@ -0,0 +1,95 @@
import json
import os
from textwrap import dedent
from unittest.mock import patch

import pytest

from benchmarks.benchmark_runner import run_benchmarks


@pytest.fixture
def mock_webbrowser():
with patch("webbrowser.open") as mock:
yield mock


@pytest.mark.asyncio
async def test_run_exercism_benchmark(mock_webbrowser, mock_call_llm_api):
cwd = os.getcwd()
mock_call_llm_api.set_return_values(
[
dedent("""\
Here are the code changes:

@@start
{
"file": "tests/benchmarks/exercise_runners/clojure_exercise_runner.py",
"action": "create-file"
}
@@code
from .abstract_exercise_runner import AbstractExerciseRunner
import subprocess
import os


class ClojureExerciseRunner(AbstractExerciseRunner):
def __init__(self, exercise):
super().__init__(exercise, "clj")
self.file = self.file.with_suffix(".clj")
self.full_path = self.dir / self.file

def run_test(self):
self._run_test_command(["lein", "test"], cwd=str(self.dir))

def passed(self):
try:
with open(self.test_output_file, "r") as f:
lines = f.readlines()
return "FAIL" not in lines[0] and "PASS" in lines[0]
except FileNotFoundError:
return False
@@end

@@start
{
"file": "tests/benchmarks/exercise_runners/exercise_runner_factory.py",
"action": "insert",
"insert-after-line": 2,
"insert-before-line": 3
}
@@code
from .clojure_exercise_runner import ClojureExerciseRunner
@@end

@@start
{
"file": "tests/benchmarks/exercise_runners/exercise_runner_factory.py",
"action": "insert",
"insert-after-line": 7,
"insert-before-line": 8
}
@@code
"clojure": ClojureExerciseRunner,
@@end"""),
dedent("""\
{
"indentation": false,
"off_by_one": false,
"syntax": false
}"""),
dedent("""\
{
"referenced_format": true,
"trailing_waffling": false
}"""),
]
)
await run_benchmarks(["Clojure Exercism Runner"])
assert os.getcwd() == cwd
with open("results.json") as f:
results = json.load(f)
summary = results["summary"]
assert summary["tokens (avg)"] == "0.00 "
assert summary["cost"] == "$0 "
assert summary["referenced_format"] == "100.00% "