-
Notifications
You must be signed in to change notification settings - Fork 4.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #72 from SamparkAI/shubhra/fix/improve-swe
adding benchmark code
- Loading branch information
Showing
11 changed files
with
378 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
from .cmds import ( | ||
CreateFileCmd, | ||
CreateFileRequest, | ||
GoToLineNumInOpenFile, | ||
GoToRequest, | ||
OpenCmdRequest, | ||
OpenFile, | ||
) | ||
from .clone_github import GithubCloneCmd, GithubCloneRequest | ||
from .edit_cmd import EditFile, EditFileRequest | ||
from .run_cmd import RunCommandOnWorkspace, RunCommandOnWorkspaceRequest | ||
from .scroll_cmds import Scroll, ScrollRequest | ||
from .search_cmds import ( | ||
FindFileCmd, | ||
FindFileRequest, | ||
GetCurrentDirCmd, | ||
SearchDirCmd, | ||
SearchDirRequest, | ||
SearchFileCmd, | ||
SearchFileRequest, | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -22,6 +22,7 @@ | |
FindFileCmd, | ||
SearchDirCmd, | ||
SearchFileCmd, | ||
GithubCloneCmd, | ||
GetCurrentDirCmd, | ||
) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
|
||
# Running the benchmark on the SWE-bench Lite Dataset | ||
|
||
This guide provides instructions on how to download and use the SWE-bench Lite dataset from Hugging Face. | ||
|
||
## Dataset Description | ||
|
||
The SWE-bench Lite dataset is a curated collection of software engineering problems and their solutions. More details about the dataset can be found at the dataset's Hugging Face page: | ||
- [SWE-bench Lite Dataset on Hugging Face](https://huggingface.co/datasets/princeton-nlp/SWE-bench_Lite) | ||
|
||
## Getting Started | ||
|
||
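To use the dataset, you will need to install the `datasets` library from Hugging Face. You can install it using pip (`pip install datasets`). The following script then loads the dataset and keeps only the longer problem statements: | ||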
```python | ||
from datasets import load_dataset | ||
|
||
def filter_short_problem_statements(instance):
    """
    Filter predicate that keeps only problem statements longer than 40 words.

    The statement is split on whitespace; instances with 40 or fewer words
    are rejected (the function returns False for them).
    """
    word_count = len(instance["problem_statement"].split())
    return word_count > 40
|
||
def main():
    """
    Fetch the SWE-bench Lite splits and print a sample of the test split
    after filtering it with filter_short_problem_statements.
    """
    dataset_id = "princeton-nlp/SWE-bench_Lite"

    # Both splits are fetched; only the test split is used below.
    dev_dataset = load_dataset(dataset_id, split="dev")
    test_dataset = load_dataset(dataset_id, split="test")

    # Drop every instance the predicate rejects.
    filtered_test_dataset = test_dataset.filter(filter_short_problem_statements)

    # Show the first five entries of what remains.
    print(filtered_test_dataset[:5])


if __name__ == "__main__":
    main()
``` | ||
## Dataset Fields | ||
|
||
The SWE-bench Lite dataset includes the following fields: | ||
|
||
- **instance_id**: A formatted instance identifier, usually as repo_owner__repo_name-PR-number. | ||
- **patch**: The gold patch, the patch generated by the PR (minus test-related code), that resolved the issue. | ||
- **repo**: The repository owner/name identifier from GitHub. | ||
- **base_commit**: The commit hash of the repository representing the HEAD of the repository before the solution PR is applied. | ||
- **hints_text**: Comments made on the issue prior to the creation date of the solution PR’s first commit. | ||
- **created_at**: The creation date of the pull request. | ||
- **test_patch**: A test-file patch that was contributed by the solution PR. | ||
- **problem_statement**: The issue title and body. | ||
- **version**: Installation version to use for running evaluation. | ||
- **environment_setup_commit**: The commit hash to use for environment setup and installation. | ||
- **FAIL_TO_PASS**: A json list of strings that represent the set of tests resolved by the PR and tied to the issue resolution. | ||
- **PASS_TO_PASS**: A json list of strings that represent tests that should pass before and after the PR application. | ||
|
||
## Running the Task | ||
|
||
To run the task and save the history of the run, use the following command: | ||
|
||
```bash | ||
python swe/try-swe.yaml --config_path <path/to/config> | ||
``` | ||
|
||
|
||
### Task Configuration | ||
|
||
To facilitate the use of the dataset in practical scenarios, a `task_config.yaml` file is used to configure the specifics of the task: | ||
```yaml | ||
backstory: |- | ||
issue_description: |- | ||
repo_name: "" | ||
instance_id: "" | ||
``` | ||
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
import os | ||
import json | ||
from datetime import datetime, timedelta | ||
|
||
|
||
def evaluate_accuracy_and_check_files(base_path="submit_logs", days_back=1):
    """Summarise the recent submission logs found under ``base_path``.

    ``base_path`` is walked recursively. A directory whose name is a
    ``%Y-%m-%d_%H-%M-%S`` timestamp older than ``days_back`` days is skipped
    together with everything beneath it. Directories whose name is not a
    timestamp are always inspected. In every inspected directory each
    ``*.json`` file counts as one submission, which is successful when its
    ``"output"`` value contains ``"success"``, and each ``*.patch`` file is
    counted once.

    Args:
        base_path: Root directory that holds the submission logs.
        days_back: How many days before "now" the cutoff lies.

    Returns:
        A dict with the keys ``accuracy`` (successful / total, or 0 when no
        submission was found), ``total_submissions``,
        ``successful_submissions`` and ``patch_files_found``.

    Raises:
        json.JSONDecodeError: If a ``*.json`` file does not contain valid JSON.
    """
    timestamp_format = "%Y-%m-%d_%H-%M-%S"

    # Folder names only carry second resolution, so drop the microseconds to
    # keep a folder created in the cutoff second on the "recent" side.
    start_date = (datetime.now() - timedelta(days=days_back)).replace(microsecond=0)

    successful_submissions = 0
    total_submissions = 0
    patch_files_found = 0

    for root, dirs, files in os.walk(base_path):
        dir_name = os.path.basename(root)

        # Compare real datetimes instead of raw strings: a plain string
        # comparison lets non-timestamp names pass or fail depending on their
        # first character.
        try:
            dir_date = datetime.strptime(dir_name, timestamp_format)
        except ValueError:
            dir_date = None

        if dir_date is not None and dir_date < start_date:
            # Too old: prune the subtree so nested folders are not counted.
            dirs[:] = []
            continue

        for file_name in files:
            if file_name.endswith(".patch"):
                patch_files_found += 1
                continue
            if not file_name.endswith(".json"):
                continue

            file_path = os.path.join(root, file_name)
            with open(file_path, "r", encoding="utf-8") as log_file:
                data = json.load(log_file)

            total_submissions += 1
            # A log without an "output" entry counts as an unsuccessful
            # submission instead of aborting the whole evaluation.
            output = data.get("output") if isinstance(data, dict) else None
            if "success" in (output or ""):
                successful_submissions += 1

    # Guard against division by zero when nothing was found.
    accuracy = successful_submissions / total_submissions if total_submissions > 0 else 0

    return {
        "accuracy": accuracy,
        "total_submissions": total_submissions,
        "successful_submissions": successful_submissions,
        "patch_files_found": patch_files_found,
    }
|
||
|
||
if __name__ == "__main__":
    # Evaluate with the default log directory and look-back window, then
    # print a short human-readable report.
    summary = evaluate_accuracy_and_check_files()
    report_lines = [
        "Evaluation Results:",
        f"Accuracy: {summary['accuracy']:.2f}",
        f"Total Submissions: {summary['total_submissions']}",
        f"Successful Submissions: {summary['successful_submissions']}",
        f"Patch Files Found: {summary['patch_files_found']}",
    ]
    print("\n".join(report_lines))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
datasets==2.19.1 | ||
crewai==0.30.11 |
Oops, something went wrong.