Skip to content

Commit

Permalink
Merge pull request #72 from SamparkAI/shubhra/fix/improve-swe
Browse files Browse the repository at this point in the history
adding benchmark code
  • Loading branch information
shubhras01 authored May 31, 2024
2 parents 6de6dc3 + 837e601 commit 20af347
Show file tree
Hide file tree
Showing 11 changed files with 378 additions and 11 deletions.
1 change: 1 addition & 0 deletions composio/client/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -2436,6 +2436,7 @@ def from_app_and_action(cls, app: str, name: str) -> "Action":
GETCURRENTDIRCMD = ("cmdmanagertool", "cmdmanagertool_getcurrentdircmd", True, True)
GITHUBCLONECMD = ("cmdmanagertool", "cmdmanagertool_githubclonecmd", True, True)
GETWORKSPACEHISTORY = ("historykeeper", "historykeeper_getworkspacehistory", True, True)
GithubCloneCmd = ("cmdmanagertool", "cmdmanagertool_githubclonecmd", True, True)



Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from .cmds import (
CreateFileCmd,
CreateFileRequest,
GoToLineNumInOpenFile,
GoToRequest,
OpenCmdRequest,
OpenFile,
)
from .clone_github import GithubCloneCmd, GithubCloneRequest
from .edit_cmd import EditFile, EditFileRequest
from .run_cmd import RunCommandOnWorkspace, RunCommandOnWorkspaceRequest
from .scroll_cmds import Scroll, ScrollRequest
from .search_cmds import (
FindFileCmd,
FindFileRequest,
GetCurrentDirCmd,
SearchDirCmd,
SearchDirRequest,
SearchFileCmd,
SearchFileRequest,
)
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class GithubCloneCmd(BaseAction):
"""
Clones a github repository
"""

_history_maintains: bool = True
_display_name = "Clone Github Repository Action"
_request_schema = GithubCloneRequest
_response_schema = GithubCloneResponse
Expand All @@ -45,7 +45,7 @@ def execute(
) -> GithubCloneResponse:
if not request_data.repo_name or not request_data.repo_name.strip():
raise ValueError(
"repo_name can not be null. Give a directory-name in which to search"
"repo_name can not be null. Give a repo_name to clone"
)

if not request_data.github_token or not request_data.github_token.strip():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
FindFileCmd,
SearchDirCmd,
SearchFileCmd,
GithubCloneCmd,
GetCurrentDirCmd,
)

Expand Down
30 changes: 30 additions & 0 deletions composio/local_tools/local_workspace/commons/history_processor.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,31 @@
import os
from collections import defaultdict
from functools import wraps
import json
from datetime import datetime
from pathlib import Path

from composio.local_tools.local_workspace.commons.get_logger import get_logger


logger = get_logger()
# Resolve paths relative to this module so the history logs land in a
# stable location regardless of the current working directory.
script_path = Path(__file__)
script_dir = script_path.parent
# Intended destination for submitted-run logs (relative to the repo layout).
# NOTE(review): submit_logs_dir is not referenced anywhere in the visible
# code — HistoryProcessor.__init__ creates its folders under script_dir
# instead; confirm which location is intended.
submit_logs_dir = script_dir / Path("../../../examples/swe/submit_logs/")


class HistoryProcessor:
def __init__(self):
self.history = defaultdict(list)
# make submit_path directory
try:
date_time_folder = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
base_dir = script_dir / Path(date_time_folder)
if not os.path.exists(base_dir):
os.makedirs(base_dir)
self.base_dir = base_dir
except Exception as e:
raise Exception("error in making submit-path directory") from e

def log_command(self, workspace_id, command, output, state):
entry = {"command": command, "output": output, "state": state}
Expand All @@ -19,15 +35,24 @@ def get_history(self, workspace_id, n=5):
all_history = self.history.get(workspace_id, [])
return all_history[-n:]

def save_history_to_file(self, workspace_id: str, instance_id: str):
# Define the file path using instance-id and ensure it's unique per workspace
file_path = self.base_dir / Path(f"{workspace_id}_instance_{instance_id}.json")
history_logs = self.history.get(workspace_id, [])
with open(file_path, 'w') as file:
json.dump(history_logs, file)


def history_recorder():
def decorator(func):
@wraps(func)
def wrapper(self, *args, **kwargs):
output, return_code = func(self, *args, **kwargs)
is_submit_command = False
if hasattr(self, "history_processor") and hasattr(self, "workspace_id"):
command = ""
if hasattr(self, "command"):
is_submit_command = "submit" in self.command
command = self.command + " " + args[0].json()
else:
logger.error(
Expand All @@ -39,6 +64,11 @@ def wrapper(self, *args, **kwargs):
self.history_processor.log_command(
self.workspace_id, command, output, state
)

# save history to file-path once submit command is submitted
if is_submit_command:
self.history_processor.save_history_to_file(self.workspace_id, self.instance_id)

return output, return_code

return wrapper
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
EditFileRequest,
RunCommandOnWorkspace,
RunCommandOnWorkspaceRequest,
GithubCloneCmd,
GithubCloneRequest
)
from composio.local_tools.local_workspace.commons.history_processor import (
HistoryProcessor,
Expand All @@ -29,6 +31,9 @@ def check_simple_implementation():
h = HistoryProcessor()
workspace_id = w.get_workspace_manager(args)

# clone git repo
git_clone = GithubCloneRequest()

# create file
create_file_cmd = CreateFileCmd()
create_file_cmd.set_workspace_and_history(w, h)
Expand Down
76 changes: 76 additions & 0 deletions examples/benchmark/Readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@

# Running benchmark on the SWE-bench Lite Dataset

This guide provides instructions on how to download and use the SWE-bench Lite dataset from Hugging Face.

## Dataset Description

The SWE-bench Lite dataset is a curated collection of software engineering problems and their solutions. More details about the dataset can be found at the dataset's Hugging Face page:
- [SWE-bench Lite Dataset on Hugging Face](https://huggingface.co/datasets/princeton-nlp/SWE-bench_Lite)

## Getting Started

To use the dataset, you will need to install the `datasets` library from Hugging Face (`pip install datasets`). The following example then loads and filters the dataset:
```python
from datasets import load_dataset

def filter_short_problem_statements(instance):
"""
Filter function to exclude problem statements with fewer than 40 words.
"""
return len(instance["problem_statement"].split()) > 40

def main():
"""
Main function to load and display entries from the SWE-bench dataset.
"""
# Load the SWE-bench dataset
dev_dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="dev")
test_dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")

# Filter the dataset to include only longer problem statements
filtered_test_dataset = test_dataset.filter(filter_short_problem_statements)

# Display the first few entries of the filtered dataset
print(filtered_test_dataset[:5])

if __name__ == "__main__":
main()
```
## Dataset Fields

The SWE-bench Lite dataset includes the following fields:

- **instance_id**: A formatted instance identifier, usually as repo_owner__repo_name-PR-number.
- **patch**: The gold patch, the patch generated by the PR (minus test-related code), that resolved the issue.
- **repo**: The repository owner/name identifier from GitHub.
- **base_commit**: The commit hash of the repository representing the HEAD of the repository before the solution PR is applied.
- **hints_text**: Comments made on the issue prior to the creation of the solution PR’s first commit creation date.
- **created_at**: The creation date of the pull request.
- **test_patch**: A test-file patch that was contributed by the solution PR.
- **problem_statement**: The issue title and body.
- **version**: Installation version to use for running evaluation.
- **environment_setup_commit**: The commit hash to use for environment setup and installation.
- **FAIL_TO_PASS**: A json list of strings that represent the set of tests resolved by the PR and tied to the issue resolution.
- **PASS_TO_PASS**: A json list of strings that represent tests that should pass before and after the PR application.

## Running the Task

To run the task and save the history of the run, use the following command:

```sh
python swe/try-swe.py --config_path <path/to/config>
```


### Task Configuration

To facilitate the use of the dataset in practical scenarios, a `task_config.yaml` file is used to configure the specifics of the task:
```yaml
backstory: |-
issue_description: |-
repo_name: ""
instance_id: ""
```
51 changes: 51 additions & 0 deletions examples/benchmark/evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import os
import json
from datetime import datetime, timedelta


def evaluate_accuracy_and_check_files(base_path="submit_logs", days_back=1):
    """Walk timestamped submission-log folders and summarise the results.

    Scans every directory under *base_path* whose name sorts at or after
    ``now - days_back`` (the folder names use the lexicographically
    sortable ``YYYY-MM-DD_HH-MM-SS`` format), reads each ``.json`` history
    log, and counts submissions whose output mentions ``"success"``.

    Args:
        base_path: root directory containing the timestamped run folders.
        days_back: how many days back to include runs from.

    Returns:
        dict with ``accuracy``, ``total_submissions``,
        ``successful_submissions`` and ``patch_files_found``.
    """
    # Cut-off folder name; string comparison below is chronological because
    # the timestamp format is zero-padded most-significant-first.
    start_date = datetime.now() - timedelta(days=days_back)
    start_folder = start_date.strftime("%Y-%m-%d_%H-%M-%S")

    successful_submissions = 0
    total_submissions = 0
    patch_files_found = 0

    for root, _dirs, files in os.walk(base_path):
        dir_name = os.path.basename(root)
        if dir_name < start_folder:
            continue
        for file in files:
            if not file.endswith(".json"):
                continue
            file_path = os.path.join(root, file)
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            # History logs are dumped as a *list* of
            # {"command", "output", "state"} entries (see
            # HistoryProcessor.save_history_to_file); the original
            # `data["output"]` lookup raised TypeError on such files.
            # A plain dict payload is still accepted for robustness.
            entries = data if isinstance(data, list) else [data]
            if any(
                "success" in str(entry.get("output", ""))
                for entry in entries
                if isinstance(entry, dict)
            ):
                successful_submissions += 1
            total_submissions += 1
        # Count patch files once per qualifying directory; `files` is the
        # listing os.walk already produced (the original re-listed with
        # os.listdir, which also counted any directory named *.patch).
        patch_files_found += sum(1 for f in files if f.endswith(".patch"))

    # Guard against division by zero when no submissions were found.
    accuracy = successful_submissions / total_submissions if total_submissions > 0 else 0

    return {
        "accuracy": accuracy,
        "total_submissions": total_submissions,
        "successful_submissions": successful_submissions,
        "patch_files_found": patch_files_found
    }


if __name__ == "__main__":
    # Run the default evaluation and print a human-readable summary.
    summary = evaluate_accuracy_and_check_files()
    print("Evaluation Results:")
    print(f"Accuracy: {summary['accuracy']:.2f}")
    # Remaining fields are plain integers; print them from a label table.
    for label, key in (
        ("Total Submissions", "total_submissions"),
        ("Successful Submissions", "successful_submissions"),
        ("Patch Files Found", "patch_files_found"),
    ):
        print(f"{label}: {summary[key]}")
2 changes: 2 additions & 0 deletions examples/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
datasets==2.19.1
crewai==0.30.11
Loading

0 comments on commit 20af347

Please sign in to comment.