Commit
ruff format
biobootloader committed Mar 19, 2024
1 parent e6d47da commit 2379430
Showing 113 changed files with 1,298 additions and 1,782 deletions.
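Every hunk in this diff makes the same mechanical change: statements that the previous formatter settings had wrapped across several short lines are collapsed onto single, longer lines, with no behavioral edits. As a rough sketch, a commit like this is typically produced by raising ruff's line-length limit and rerunning its formatter over the repository; the concrete value of 120 and the pyproject.toml placement below are assumptions for illustration, not taken from this commit.

    # pyproject.toml -- assumed configuration; the repo's actual setting is not shown in this diff
    [tool.ruff]
    line-length = 120   # long enough that the previously wrapped calls below fit on one line

    # reformat the whole repository in place, then commit the result
    ruff format .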
5 changes: 1 addition & 4 deletions benchmarks/arg_parser.py
@@ -19,10 +19,7 @@ def common_benchmark_parser():
"--benchmarks",
nargs="*",
default=[],
help=(
"Which benchmarks to run. max_benchmarks ignored when set. Exact meaning"
" depends on benchmark."
),
help=("Which benchmarks to run. max_benchmarks ignored when set. Exact meaning" " depends on benchmark."),
)
parser.add_argument(
"--directory",
54 changes: 13 additions & 41 deletions benchmarks/benchmark_result.py
@@ -15,61 +15,33 @@ class BenchmarkResult:
cost: Optional[float] = attr.ib(default=None, metadata={"aggregation": "sum"})
tokens: Optional[int] = attr.ib(default=None, metadata={"aggregation": "average"})
count: int = attr.ib(default=1, metadata={"aggregation": "sum"})
iterations: Optional[int] = attr.ib(
default=None, metadata={"aggregation": "histogram"}
)
transcript: Optional[Transcript] = attr.ib(
default=None, metadata={"display": "transcript"}
)
iterations: Optional[int] = attr.ib(default=None, metadata={"aggregation": "histogram"})
transcript: Optional[Transcript] = attr.ib(default=None, metadata={"display": "transcript"})
instructions: Optional[str] = attr.ib(default=None, metadata={"display": "text"})
code: Optional[str] = attr.ib(default=None, metadata={"display": "code"})
test_output: Optional[str] = attr.ib(
default=None, metadata={"formatted_name": "Test output", "display": "code"}
)
run_error: Optional[str] = attr.ib(
default=None, metadata={"formatted_name": "Run Error", "display": "code"}
)
response: Optional[str] = attr.ib(
default=None, metadata={"formatted_name": "Analysis", "display": "text"}
)
test_output: Optional[str] = attr.ib(default=None, metadata={"formatted_name": "Test output", "display": "code"})
run_error: Optional[str] = attr.ib(default=None, metadata={"formatted_name": "Run Error", "display": "code"})
response: Optional[str] = attr.ib(default=None, metadata={"formatted_name": "Analysis", "display": "text"})
reason: Optional[str] = attr.ib(default=None, metadata={"aggregation": "histogram"})
# For exercism benchmarks
passed: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
# New optional fields for benchmark results
diff_grade: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
response_grade: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
comparison_grade: Optional[dict] = attr.ib(
default=None, metadata={"display": "json"}
)
comparison_grade: Optional[dict] = attr.ib(default=None, metadata={"display": "json"})
verify: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
off_by_one: Optional[bool] = attr.ib(
default=None, metadata={"aggregation": "percent"}
)
indentation_error: Optional[bool] = attr.ib(
default=None, metadata={"aggregation": "percent"}
)
syntax_error: Optional[bool] = attr.ib(
default=None, metadata={"aggregation": "percent"}
)
missing_functionality: Optional[bool] = attr.ib(
default=None, metadata={"aggregation": "percent"}
)
extra_functionality: Optional[bool] = attr.ib(
default=None, metadata={"aggregation": "percent"}
)
referenced_format: Optional[bool] = attr.ib(
default=None, metadata={"aggregation": "percent"}
)
off_by_one: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
indentation_error: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
syntax_error: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
missing_functionality: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
extra_functionality: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})
referenced_format: Optional[bool] = attr.ib(default=None, metadata={"aggregation": "percent"})

def display_color(self) -> str:
if self.passed is None:
if self.indentation_error or self.off_by_one or self.syntax_error:
return "grey"
if (
self.missing_functionality
or self.extra_functionality
or self.referenced_format
):
if self.missing_functionality or self.extra_functionality or self.referenced_format:
return "yellow"
if self.verify is not None:
if self.verify:
8 changes: 2 additions & 6 deletions benchmarks/benchmark_result_list.py
@@ -37,11 +37,7 @@ def generate_list(path: Path, output: Path):
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
"path", type=Path, help="Path to the benchmark result directory"
)
parser.add_argument(
"output", type=Path, help="Path to the benchmark result directory"
)
parser.add_argument("path", type=Path, help="Path to the benchmark result directory")
parser.add_argument("output", type=Path, help="Path to the benchmark result directory")
args = parser.parse_args()
generate_list(args.path, args.output)
10 changes: 2 additions & 8 deletions benchmarks/benchmark_run.py
@@ -39,11 +39,7 @@ def aggregate_results(self) -> BenchmarkRunSummary:
for field in attr.fields(BenchmarkResult):
if "aggregation" in field.metadata:
name = field.name
values = [
getattr(result, name)
for result in self.results
if getattr(result, name) is not None
]
values = [getattr(result, name) for result in self.results if getattr(result, name) is not None]
if len(values) == 0:
summary[name] = (0, 0)
else:
@@ -83,9 +79,7 @@ def make_html_report(self, output_path: Path = Path("results.html")):
env = Environment(
loader=FileSystemLoader(
[
os.path.join(
os.path.dirname(__file__), "../mentat/resources/templates"
),
os.path.join(os.path.dirname(__file__), "../mentat/resources/templates"),
os.path.join(os.path.dirname(__file__), "resources/templates"),
]
),
16 changes: 4 additions & 12 deletions benchmarks/benchmark_run_summary.py
@@ -46,24 +46,16 @@ def formatted_summary(self) -> dict[str, str]:

# Add units based on aggregation type
if aggregation_type == "sum" and "cost" in formatted_name:
formatted[formatted_name] = (
f"${formatted_value} {percent_set_display}"
)
formatted[formatted_name] = f"${formatted_value} {percent_set_display}"
elif aggregation_type == "percent":
formatted[formatted_name] = (
f"{formatted_value}% {percent_set_display}"
)
formatted[formatted_name] = f"{formatted_value}% {percent_set_display}"
else:
formatted[formatted_name] = (
f"{formatted_value} {percent_set_display}"
)
formatted[formatted_name] = f"{formatted_value} {percent_set_display}"

return formatted

def display_string(self) -> str:
return ", ".join(
f"{name}: {value}" for name, value in self.formatted_summary().items()
)
return ", ".join(f"{name}: {value}" for name, value in self.formatted_summary().items())

def to_json(self) -> str:
return json.dumps(
16 changes: 4 additions & 12 deletions benchmarks/benchmark_runner.py
@@ -55,9 +55,7 @@ async def grade(to_grade, prompt, model="gpt-4-1106-preview"):
messages[1]["content"] = messages[1]["content"][:-chars_to_remove]

llm_api_handler = SESSION_CONTEXT.get().llm_api_handler
llm_grade = await llm_api_handler.call_llm_api(
messages, model, False, ResponseFormat(type="json_object")
)
llm_grade = await llm_api_handler.call_llm_api(messages, model, False, ResponseFormat(type="json_object"))
content = llm_grade.choices[0].message.content
return json.loads(content)
except Exception as e:
@@ -196,9 +194,7 @@ def from_module(cls, path_to_module: Path, module_name: str) -> Benchmark:
],
)
if hasattr(module, "comparison_commit"):
diff_edit = git_diff_from_comparison_commit(
output.samples[0], module.comparison_commit
)
diff_edit = git_diff_from_comparison_commit(output.samples[0], module.comparison_commit)
for sample in output.samples:
if not sample.diff_edit:
sample.diff_edit = diff_edit
@@ -220,9 +216,7 @@ async def run(self, retries: int = 1) -> list[BenchmarkResult]:
for i, sample in enumerate(self.samples):
print(" Prompt:", sample.message_prompt)
for j in range(1, retries + 1):
formatted_title = re.sub(r"[ '\"/\\-^]", "", sample.title).replace(
" ", "_"
)
formatted_title = re.sub(r"[ '\"/\\-^]", "", sample.title).replace(" ", "_")
result = BenchmarkResult(
name=f"{formatted_title}-{i}-{j}",
family=formatted_title,
@@ -272,9 +266,7 @@ def run_benchmarks(user_benchmarks: list[str], directory: str, retries: int = 1)
else:
continue

if len(user_benchmarks) > 0 and not benchmark_listed(
benchmark.title, user_benchmarks
):
if len(user_benchmarks) > 0 and not benchmark_listed(benchmark.title, user_benchmarks):
continue
benchmarks.append(benchmark)
print("Found benchmarks:\n" + "\n".join(b.title for b in benchmarks))
4 changes: 1 addition & 3 deletions benchmarks/benchmarks/mentat/clojure_exercism_runner.py
@@ -31,9 +31,7 @@ def verify():
ExerciseRunnerFactory,
)

added_to_factory = (
ExerciseRunnerFactory.RUNNERS["clojure"] == ClojureExerciseRunner
)
added_to_factory = ExerciseRunnerFactory.RUNNERS["clojure"] == ClojureExerciseRunner

made_runner = hasattr(ClojureExerciseRunner, "run_test")
made_runner = made_runner and hasattr(ClojureExerciseRunner, "passed")
24 changes: 6 additions & 18 deletions benchmarks/context_benchmark.py
@@ -38,9 +38,7 @@ def _load_benchmarks() -> dict[str, dict[str, Any]]:
return benchmarks


def _convert_features_to_line_sets(
git_root: Path, features: list[CodeFeature]
) -> defaultdict[set]:
def _convert_features_to_line_sets(git_root: Path, features: list[CodeFeature]) -> defaultdict[set]:
"""Convert a list of features to a dict of {path: set(lines)} for comparison"""
lines = defaultdict(set)
for feature in features:
@@ -81,9 +79,7 @@ def evaluate(
return {"precision": precision, "recall": recall, "f1": f1}


async def select_features_for_benchmark(
session_context, benchmark, eval=True, use_expected=False, use_llm=True
):
async def select_features_for_benchmark(session_context, benchmark, eval=True, use_expected=False, use_llm=True):
"""Select features for benchmark using expected edits as a guide"""
git_root = session_context.git_root
config = session_context.config
@@ -97,26 +93,18 @@ async def select_features_for_benchmark(
if use_expected:
expected_edits = benchmark["expected_edits"]
expected_edits_tokens = count_tokens(expected_edits, model)
max_context_tokens = (
model_context_size(model) - mentat_prompt_tokens - expected_edits_tokens
)
max_context_tokens = model_context_size(model) - mentat_prompt_tokens - expected_edits_tokens
# Fill-in available context
config.auto_context_tokens = 8000
code_context.use_llm = use_llm
await code_context.get_code_message(
benchmark["prompt"], max_context_tokens, expected_edits
)
await code_context.get_code_message(benchmark["prompt"], max_context_tokens, expected_edits)
git_root_length = len(str(git_root)) + 1
selected_features = [f.ref()[git_root_length:] for f in code_context.features]

selector_performance = {}
if eval:
edited_features = [
CodeFeature(git_root / f) for f in benchmark["edited_features"]
]
selector_performance = evaluate(
git_root, code_context.features, edited_features
)
edited_features = [CodeFeature(git_root / f) for f in benchmark["edited_features"]]
selector_performance = evaluate(git_root, code_context.features, edited_features)
return {"features": selected_features, "score": selector_performance}


18 changes: 11 additions & 7 deletions benchmarks/edit_rubric_benchmark.py
@@ -40,7 +40,8 @@ def write_result(commit, result, repo_path):
json.dump(results, f, indent=4)


grader_prompt = dedent("""\
grader_prompt = dedent(
"""\
Please grade the following diff on the following metrics:
- correctness
- readability
@@ -49,7 +50,8 @@ def write_result(commit, result, repo_path):
Please reply with only a json object rating the diff from 1
to 5 on each of those dimensions. For example:
{"correctness": 4, "readability": 3, "style": 2, "surprisingness": 1}
""")
"""
)


def evaluate_diff(diff: str) -> dict[str, int]:
@@ -68,9 +70,7 @@ def evaluate_diff(diff: str) -> dict[str, int]:
return json.loads(message)


async def test_edit_quality(
benchmarks, max_benchmarks, evaluate_baseline, repo, refresh_repo
):
async def test_edit_quality(benchmarks, max_benchmarks, evaluate_baseline, repo, refresh_repo):
repo_path = Path(__file__).parent / f"../../benchmark_repos/{repo}"
tests = load_tests(repo_path)
results = load_results(repo_path)
@@ -92,13 +92,17 @@ async def test_edit_quality(
codebase = clone_repo(repo_url, repo_name)
os.chdir(codebase)
with open(".git/info/exclude", "w") as f:
f.write(dedent("""\
f.write(
dedent(
"""\
commit_information.json
benchmarks.json
benchmark_results.json
transcripts*.jsonl
gpt-output-cache.json
"""))
"""
)
)
repo = Repo(".")
start_commit = repo.commit()
repo.git.checkout(test["commit"] + "^1")
4 changes: 1 addition & 3 deletions benchmarks/exercise_runners/clojure_exercise_runner.py
@@ -10,9 +10,7 @@ def __init__(self, exercise):
self.full_path = self.dir / "src" / self.file

def run_test(self):
self._run_test_command(
["lein", "test", ":only", self.name + "-test"], cwd=self.dir
)
self._run_test_command(["lein", "test", ":only", self.name + "-test"], cwd=self.dir)

def passed(self):
try:
24 changes: 6 additions & 18 deletions benchmarks/exercism_practice.py
@@ -38,14 +38,11 @@ def clone_exercism_repo(refresh_repo, language):
+ "reason: <reason_failed>\n"
+ "Your response will be parsed programmatically, so you MUST follow the format for"
" the final line! The possible responses for the final line and what they mean"
" are as follows:\n"
+ "blank (the coder didn't change the file at all from the stub you provided"
" them)\n"
+ "wording (everything was correct, but the coder messed up the wording or spacing"
" are as follows:\n" + "blank (the coder didn't change the file at all from the stub you provided"
" them)\n" + "wording (everything was correct, but the coder messed up the wording or spacing"
" which caused it to be rejected)\n"
+ "duplication (the coder had a random duplicated line that caused the code to not"
" be compiled/interpreted)\n"
+ "syntax (the coder messed up their syntax, meaning their code couldn't be"
" be compiled/interpreted)\n" + "syntax (the coder messed up their syntax, meaning their code couldn't be"
" compiled/interpreted)\n"
+ "logic (the coder messed up the logic)\n"
+ "other (some other reason caused it to fail)\n"
@@ -58,10 +55,7 @@ async def failure_analysis(exercise_runner, language):
code = exercise_runner.read_code(language)
test_results = exercise_runner.read_test_results()

final_message = (
f"All instructions:\n{instructions}\nCode to review:\n{code}\nTest"
f" results:\n{test_results}"
)
final_message = f"All instructions:\n{instructions}\nCode to review:\n{code}\nTest" f" results:\n{test_results}"
messages = [
{"role": "system", "content": prompt},
{"role": "user", "content": final_message},
@@ -108,11 +102,7 @@ async def run_exercise(problem_dir, language="python", max_iterations=2):
while iterations < max_iterations and not client.stopped.is_set():
if exercise_runner.passed():
break
message = (
prompt_1
if iterations == 0
else exercise_runner.get_error_message() + prompt_2
)
message = prompt_1 if iterations == 0 else exercise_runner.get_error_message() + prompt_2
await client.call_mentat_auto_accept(message)

exercise_runner.run_test()
@@ -204,9 +194,7 @@ def run_exercism_benchmark(
pbar = tqdm.tqdm(total=num_exercises)

result_map = pool.imap(
partial(
run_exercise_sync, language=language, max_iterations=max_iterations
),
partial(run_exercise_sync, language=language, max_iterations=max_iterations),
exercises,
)
results = []
8 changes: 2 additions & 6 deletions benchmarks/migrations/added_metadata.py
@@ -32,15 +32,11 @@ def migration(path: Path):
benchmark_run.metadata["file"] = file.name

benchmark_run.save(folder=full_result_path, name=file.name)
benchmark_run.make_html_report(
html_path / file.name.replace(".json", ".html")
)
benchmark_run.make_html_report(html_path / file.name.replace(".json", ".html"))


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"path", type=Path, help="Path to the benchmark result directory"
)
parser.add_argument("path", type=Path, help="Path to the benchmark result directory")
args = parser.parse_args()
migration(args.path)
(Diffs for the remaining changed files are not shown.)
