Commit
ruff format
biobootloader committed Mar 19, 2024
1 parent e6d47da commit 2379430
Showing 113 changed files with 1,298 additions and 1,782 deletions.
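Every hunk in this diff makes the same mechanical change: statements that the previous formatter settings had wrapped across several short lines are collapsed onto single, longer lines, with no behavioral edits. As a rough sketch, a commit like this is typically produced by raising ruff's line-length limit and rerunning its formatter over the repository; the concrete value of 120 and the pyproject.toml placement below are assumptions for illustration, not taken from this commit.

    # pyproject.toml -- assumed configuration; the repo's actual setting is not shown in this diff
    [tool.ruff]
    line-length = 120   # long enough that the previously wrapped calls below fit on one line

    # reformat the whole repository in place, then commit the result
    ruff format .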
5 changes: 1 addition & 4 deletions benchmarks/arg_parser.py
@@ -19,10 +19,7 @@ def common_benchmark_parser():
"--benchmarks",
nargs="*",
default=[],
help=(
"Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
" depends on benchmark."
),
help=("Which benchmarks to run. max_benchmarks ignored when set. Exact meaning" " depends on benchmark."),
)
parser.add_argument(
"--directory",
54 changes: 13 additions & 41 deletions benchmarks/benchmark_result.py
@@ -15,61 +15,33 @@ class BenchmarkResult:
cost: Optional[float] = attr.ib(default=None, metadata={"aggregation": "sum"})
tokens: Optional[int] = attr.ib(default=None, metadata={"aggregation": "average"})
count: int = attr.ib(default=1, metadata={"aggregation": "sum"})
iterations: Optional[int] = attr.ib(
default=None, metadata={"aggregation": "histogram"}
)
transcript: Optional[Transcript] = attr.ib(
default=None, metadata={"display": "transcript"}
)
iterations: Optional[int] = attr.ib(default=None, metadata={"aggregation": "histogram"})
transcript: Optional[Transcript] = attr.ib(default=None, metadata={"display": "transcript"})
instructions: Optional[str] = attr.ib(default=None, metadata={"display": "text"})
code: Optional[str] = attr.ib(default=None, metadata={"display": "code"})
test_output: Optional[str] = attr.ib(
default=None, metadata={"formatted_name": "Test output", "display": "code"}
)
run_error: Optional[str] = attr.ib(
default=None, metadata={"formatted_name": "Run Error", "display": "code"}
)
response: Optional[str] = attr.ib(
default=None, metadata={"formatted_name": "Analysis", "display": "text"}
)
test_output: Optional[str] = attr.ib(default=None, metadata={"formatted_name": "Test output", "display": "code"})
run_error: Optional[str] = attr.ib(default=None, metadata={"formatted_name": "Run Error", "display": "code"})
response: Optional[str] = attr.ib(default=None, metadata={"formatted_name": "Analysis", "display": "text"})
reason: Optional[str] = attr.ib(default=None, metadata={"aggregation": "histogram"})
# For exercism benchmarks
passed: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
# New optional fields for benchmark results
diff_grade: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
response_grade: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
comparison_grade: Optional[dict] = attr.ib(
default=None, metadata={"display": "json"}
)
comparison_grade: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
verify: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
off_by_one: Optional[bool] = attr.ib(
default=None, metadata={"aggregation": "percent"}
)
indentation_error: Optional[bool] = attr.ib(
default=None, metadata={"aggregation": "percent"}
)
syntax_error: Optional[bool] = attr.ib(
default=None, metadata={"aggregation": "percent"}
)
missing_functionality: Optional[bool] = attr.ib(
default=None, metadata={"aggregation": "percent"}
)
extra_functionality: Optional[bool] = attr.ib(
default=None, metadata={"aggregation": "percent"}
)
referenced_format: Optional[bool] = attr.ib(
default=None, metadata={"aggregation": "percent"}
)
off_by_one: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
indentation_error: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
syntax_error: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
missing_functionality: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
extra_functionality: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
referenced_format: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})

def display_color(self) -> str:
if self.passed is None:
if self.indentation_error or self.off_by_one or self.syntax_error:
return "grey"
if (
self.missing_functionality
or self.extra_functionality
or self.referenced_format
):
if self.missing_functionality or self.extra_functionality or self.referenced_format:
return "yellow"
if self.verify is not None:
if self.verify:
8 changes: 2 additions & 6 deletions benchmarks/benchmark_result_list.py
@@ -37,11 +37,7 @@ def generate_list(path: Path, output: Path):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
"path", type=Path, help="Path to the benchmark result directory"
)
parser.add_argument(
"output", type=Path, help="Path to the benchmark result directory"
)
parser.add_argument("path", type=Path, help="Path to the benchmark result directory")
parser.add_argument("output", type=Path, help="Path to the benchmark result directory")
args = parser.parse_args()
generate_list(args.path, args.output)
10 changes: 2 additions & 8 deletions benchmarks/benchmark_run.py
@@ -39,11 +39,7 @@ def aggregate_results(self) -> BenchmarkRunSummary:
for field in attr.fields(BenchmarkResult):
if "aggregation" in field.metadata:
name = field.name
values = [
getattr(result, name)
for result in self.results
if getattr(result, name) is not None
]
values = [getattr(result, name) for result in self.results if getattr(result, name) is not None]
if len(values) == 0:
summary[name] = (0, 0)
else:
@@ -83,9 +79,7 @@ def make_html_report(self, output_path: Path = Path("results.html")):
env = Environment(
loader=FileSystemLoader(
[
os.path.join(
os.path.dirname(__file__), "../mentat/resources/templates"
),
os.path.join(os.path.dirname(__file__), "../mentat/resources/templates"),
os.path.join(os.path.dirname(__file__), "resources/templates"),
]
),
16 changes: 4 additions & 12 deletions benchmarks/benchmark_run_summary.py
@@ -46,24 +46,16 @@ def formatted_summary(self) -> dict[str, str]:

# Add units based on aggregation type
if aggregation_type == "sum" and "cost" in formatted_name:
formatted[formatted_name] = (
f"${formatted_value} {percent_set_display}"
)
formatted[formatted_name] = f"${formatted_value} {percent_set_display}"
elif aggregation_type == "percent":
formatted[formatted_name] = (
f"{formatted_value}% {percent_set_display}"
)
formatted[formatted_name] = f"{formatted_value}% {percent_set_display}"
else:
formatted[formatted_name] = (
f"{formatted_value} {percent_set_display}"
)
formatted[formatted_name] = f"{formatted_value} {percent_set_display}"

return formatted

def display_string(self) -> str:
return ", ".join(
f"{name}: {value}" for name, value in self.formatted_summary().items()
)
return ", ".join(f"{name}: {value}" for name, value in self.formatted_summary().items())

def to_json(self) -> str:
return json.dumps(
16 changes: 4 additions & 12 deletions benchmarks/benchmark_runner.py
@@ -55,9 +55,7 @@ async def grade(to_grade, prompt, model="gpt-4-1106-preview"):
messages[1]["content"] = messages[1]["content"][:-chars_to_remove]

llm_api_handler = SESSION_CONTEXT.get().llm_api_handler
llm_grade = await llm_api_handler.call_llm_api(
messages, model, False, ResponseFormat(type="json_object")
)
llm_grade = await llm_api_handler.call_llm_api(messages, model, False, ResponseFormat(type="json_object"))
content = llm_grade.choices[0].message.content
return json.loads(content)
except Exception as e:
@@ -196,9 +194,7 @@ def from_module(cls, path_to_module: Path, module_name: str) -> Benchmark:
],
)
if hasattr(module, "comparison_commit"):
diff_edit = git_diff_from_comparison_commit(
output.samples[0], module.comparison_commit
)
diff_edit = git_diff_from_comparison_commit(output.samples[0], module.comparison_commit)
for sample in output.samples:
if not sample.diff_edit:
sample.diff_edit = diff_edit
@@ -220,9 +216,7 @@ async def run(self, retries: int = 1) -> list[BenchmarkResult]:
for i, sample in enumerate(self.samples):
print(" Prompt:", sample.message_prompt)
for j in range(1, retries + 1):
formatted_title = re.sub(r"[ '\"/\\-^]", "", sample.title).replace(
" ", "_"
)
formatted_title = re.sub(r"[ '\"/\\-^]", "", sample.title).replace(" ", "_")
result = BenchmarkResult(
name=f"{formatted_title}-{i}-{j}",
family=formatted_title,
@@ -272,9 +266,7 @@ def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1)
else:
continue

if len(user_benchmarks) > 0 and not benchmark_listed(
benchmark.title, user_benchmarks
):
if len(user_benchmarks) > 0 and not benchmark_listed(benchmark.title, user_benchmarks):
continue
benchmarks.append(benchmark)
print("Found benchmarks:\n" + "\n".join(b.title for b in benchmarks))
4 changes: 1 addition & 3 deletions benchmarks/benchmarks/mentat/clojure_exercism_runner.py
@@ -31,9 +31,7 @@ def verify():
ExerciseRunnerFactory,
)

added_to_factory = (
ExerciseRunnerFactory.RUNNERS["clojure"] == ClojureExerciseRunner
)
added_to_factory = ExerciseRunnerFactory.RUNNERS["clojure"] == ClojureExerciseRunner

made_runner = hasattr(ClojureExerciseRunner, "run_test")
made_runner = made_runner and hasattr(ClojureExerciseRunner, "passed")
24 changes: 6 additions & 18 deletions benchmarks/context_benchmark.py
@@ -38,9 +38,7 @@ def _load_benchmarks() -> dict[str, dict[str, Any]]:
return benchmarks


def _convert_features_to_line_sets(
git_root: Path, features: list[CodeFeature]
) -> defaultdict[set]:
def _convert_features_to_line_sets(git_root: Path, features: list[CodeFeature]) -> defaultdict[set]:
"""Convert a list of features to a dict of {path: set(lines)} for comparison"""
lines = defaultdict(set)
for feature in features:
@@ -81,9 +79,7 @@ def evaluate(
return {"precision": precision, "recall": recall, "f1": f1}


async def select_features_for_benchmark(
session_context, benchmark, eval=True, use_expected=False, use_llm=True
):
async def select_features_for_benchmark(session_context, benchmark, eval=True, use_expected=False, use_llm=True):
"""Select features for benchmark using expected edits as a guide"""
git_root = session_context.git_root
config = session_context.config
@@ -97,26 +93,18 @@ async def select_features_for_benchmark(
if use_expected:
expected_edits = benchmark["expected_edits"]
expected_edits_tokens = count_tokens(expected_edits, model)
max_context_tokens = (
model_context_size(model) - mentat_prompt_tokens - expected_edits_tokens
)
max_context_tokens = model_context_size(model) - mentat_prompt_tokens - expected_edits_tokens
# Fill-in available context
config.auto_context_tokens = 8000
code_context.use_llm = use_llm
await code_context.get_code_message(
benchmark["prompt"], max_context_tokens, expected_edits
)
await code_context.get_code_message(benchmark["prompt"], max_context_tokens, expected_edits)
git_root_length = len(str(git_root)) + 1
selected_features = [f.ref()[git_root_length:] for f in code_context.features]

selector_performance = {}
if eval:
edited_features = [
CodeFeature(git_root / f) for f in benchmark["edited_features"]
]
selector_performance = evaluate(
git_root, code_context.features, edited_features
)
edited_features = [CodeFeature(git_root / f) for f in benchmark["edited_features"]]
selector_performance = evaluate(git_root, code_context.features, edited_features)
return {"features": selected_features, "score": selector_performance}


18 changes: 11 additions & 7 deletions benchmarks/edit_rubric_benchmark.py
@@ -40,7 +40,8 @@ def write_result(commit, result, repo_path):
json.dump(results, f, indent=4)


grader_prompt = dedent("""\
grader_prompt = dedent(
"""\
Please grade the following diff on the following metrics:
- correctness
- readability
@@ -49,7 +50,8 @@ def write_result(commit, result, repo_path):
Please reply with only a json object rating the diff from 1
to 5 on each of those dimensions. For example:
{"correctness": 4, "readability": 3, "style": 2, "surprisingness": 1}
""")
"""
)


def evaluate_diff(diff: str) -> dict[str, int]:
@@ -68,9 +70,7 @@ def evaluate_diff(diff: str) -> dict[str, int]:
return json.loads(message)


async def test_edit_quality(
benchmarks, max_benchmarks, evaluate_baseline, repo, refresh_repo
):
async def test_edit_quality(benchmarks, max_benchmarks, evaluate_baseline, repo, refresh_repo):
repo_path = Path(__file__).parent / f"../../benchmark_repos/{repo}"
tests = load_tests(repo_path)
results = load_results(repo_path)
@@ -92,13 +92,17 @@ async def test_edit_quality(
codebase = clone_repo(repo_url, repo_name)
os.chdir(codebase)
with open(".git/info/exclude", "w") as f:
f.write(dedent("""\
f.write(
dedent(
"""\
commit_information.json
benchmarks.json
benchmark_results.json
transcripts*.jsonl
gpt-output-cache.json
"""))
"""
)
)
repo = Repo(".")
start_commit = repo.commit()
repo.git.checkout(test["commit"] + "^1")
4 changes: 1 addition & 3 deletions benchmarks/exercise_runners/clojure_exercise_runner.py
@@ -10,9 +10,7 @@ def __init__(self, exercise):
self.full_path = self.dir / "src" / self.file

def run_test(self):
self._run_test_command(
["lein", "test", ":only", self.name + "-test"], cwd=self.dir
)
self._run_test_command(["lein", "test", ":only", self.name + "-test"], cwd=self.dir)

def passed(self):
try:
24 changes: 6 additions & 18 deletions benchmarks/exercism_practice.py
@@ -38,14 +38,11 @@ def clone_exercism_repo(refresh_repo, language):
+ "reason: <reason_failed>\n"
+ "Your response will be parsed programmatically, so you MUST follow the format for"
" the final line! The possible responses for the final line and what they mean"
" are as follows:\n"
+ "blank (the coder didn't change the file at all from the stub you provided"
" them)\n"
+ "wording (everything was correct, but the coder messed up the wording or spacing"
" are as follows:\n" + "blank (the coder didn't change the file at all from the stub you provided"
" them)\n" + "wording (everything was correct, but the coder messed up the wording or spacing"
" which caused it to be rejected)\n"
+ "duplication (the coder had a random duplicated line that caused the code to not"
" be compiled/interpreted)\n"
+ "syntax (the coder messed up their syntax, meaning their code couldn't be"
" be compiled/interpreted)\n" + "syntax (the coder messed up their syntax, meaning their code couldn't be"
" compiled/interpreted)\n"
+ "logic (the coder messed up the logic)\n"
+ "other (some other reason caused it to fail)\n"
@@ -58,10 +55,7 @@ async def failure_analysis(exercise_runner, language):
code = exercise_runner.read_code(language)
test_results = exercise_runner.read_test_results()

final_message = (
f"All instructions:\n{instructions}\nCode to review:\n{code}\nTest"
f" results:\n{test_results}"
)
final_message = f"All instructions:\n{instructions}\nCode to review:\n{code}\nTest" f" results:\n{test_results}"
messages = [
{"role": "system", "content": prompt},
{"role": "user", "content": final_message},
@@ -108,11 +102,7 @@ async def run_exercise(problem_dir, language="python", max_iterations=2):
while iterations < max_iterations and not client.stopped.is_set():
if exercise_runner.passed():
break
message = (
prompt_1
if iterations == 0
else exercise_runner.get_error_message() + prompt_2
)
message = prompt_1 if iterations == 0 else exercise_runner.get_error_message() + prompt_2
await client.call_mentat_auto_accept(message)

exercise_runner.run_test()
@@ -204,9 +194,7 @@ def run_exercism_benchmark(
pbar = tqdm.tqdm(total=num_exercises)

result_map = pool.imap(
partial(
run_exercise_sync, language=language, max_iterations=max_iterations
),
partial(run_exercise_sync, language=language, max_iterations=max_iterations),
exercises,
)
results = []
8 changes: 2 additions & 6 deletions benchmarks/migrations/added_metadata.py
@@ -32,15 +32,11 @@ def migration(path: Path):
benchmark_run.metadata["file"] = file.name

benchmark_run.save(folder=full_result_path, name=file.name)
benchmark_run.make_html_report(
html_path / file.name.replace(".json", ".html")
)
benchmark_run.make_html_report(html_path / file.name.replace(".json", ".html"))


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"path", type=Path, help="Path to the benchmark result directory"
)
parser.add_argument("path", type=Path, help="Path to the benchmark result directory")
args = parser.parse_args()
migration(args.path)
(Diffs for the remaining changed files are not shown.)
