Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

adding benchmark code #72

Merged
merged 14 commits into from
May 31, 2024
1 change: 1 addition & 0 deletions composio/client/enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -2436,6 +2436,7 @@ def from_app_and_action(cls, app: str, name: str) -> "Action":
GETCURRENTDIRCMD = ("cmdmanagertool", "cmdmanagertool_getcurrentdircmd", True, True)
GITHUBCLONECMD = ("cmdmanagertool", "cmdmanagertool_githubclonecmd", True, True)
GETWORKSPACEHISTORY = ("historykeeper", "historykeeper_getworkspacehistory", True, True)
GithubCloneCmd = ("cmdmanagertool", "cmdmanagertool_githubclonecmd", True, True)



Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
from .cmds import (
CreateFileCmd,
CreateFileRequest,
GoToLineNumInOpenFile,
GoToRequest,
OpenCmdRequest,
OpenFile,
)
from .clone_github import GithubCloneCmd, GithubCloneRequest
from .edit_cmd import EditFile, EditFileRequest
from .run_cmd import RunCommandOnWorkspace, RunCommandOnWorkspaceRequest
from .scroll_cmds import Scroll, ScrollRequest
from .search_cmds import (
FindFileCmd,
FindFileRequest,
GetCurrentDirCmd,
SearchDirCmd,
SearchDirRequest,
SearchFileCmd,
SearchFileRequest,
)
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class GithubCloneCmd(BaseAction):
"""
Clones a github repository
"""

_history_maintains: bool = True
_display_name = "Clone Github Repository Action"
_request_schema = GithubCloneRequest
_response_schema = GithubCloneResponse
Expand All @@ -45,7 +45,7 @@ def execute(
) -> GithubCloneResponse:
if not request_data.repo_name or not request_data.repo_name.strip():
raise ValueError(
"repo_name can not be null. Give a directory-name in which to search"
"repo_name can not be null. Give a repo_name to clone"
)

if not request_data.github_token or not request_data.github_token.strip():
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
FindFileCmd,
SearchDirCmd,
SearchFileCmd,
GithubCloneCmd,
GetCurrentDirCmd,
)

Expand Down
30 changes: 30 additions & 0 deletions composio/local_tools/local_workspace/commons/history_processor.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,31 @@
import os
from collections import defaultdict
from functools import wraps
import json
from datetime import datetime
from pathlib import Path

from composio.local_tools.local_workspace.commons.get_logger import get_logger


logger = get_logger()
script_path = Path(__file__)
script_dir = script_path.parent
submit_logs_dir = script_dir / Path("../../../examples/swe/submit_logs/")


class HistoryProcessor:
def __init__(self):
    """Create an in-memory history store and a timestamped log directory.

    One directory per ``HistoryProcessor`` instance is created next to this
    module, named after the current wall-clock time; it is used later by
    ``save_history_to_file``.

    Raises:
        Exception: if the log directory cannot be created (chained from the
            underlying ``OSError``).
    """
    # Per-workspace command history: workspace_id -> list of log entries.
    self.history = defaultdict(list)
    try:
        date_time_folder = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        # NOTE(review): this writes under script_dir, not the module-level
        # submit_logs_dir constant defined above — confirm which location
        # is intended.
        base_dir = script_dir / Path(date_time_folder)
        # mkdir(parents=True, exist_ok=True) is race-free, unlike the
        # exists()/makedirs() check-then-act pair it replaces.
        base_dir.mkdir(parents=True, exist_ok=True)
        self.base_dir = base_dir
    except OSError as e:
        # Keep raising the broad Exception type for backward compatibility
        # with existing callers.
        raise Exception("error in making submit-path directory") from e

def log_command(self, workspace_id, command, output, state):
entry = {"command": command, "output": output, "state": state}
Expand All @@ -19,15 +35,24 @@ def get_history(self, workspace_id, n=5):
all_history = self.history.get(workspace_id, [])
return all_history[-n:]

def save_history_to_file(self, workspace_id: str, instance_id: str):
    """Persist the recorded history for *workspace_id* as a JSON file.

    The file is written into ``self.base_dir`` and named
    ``{workspace_id}_instance_{instance_id}.json`` so each
    workspace/instance pair gets its own log file. Workspaces with no
    recorded history produce an empty JSON list.
    """
    # Path supports the / operator with a plain string — no extra Path() needed.
    file_path = self.base_dir / f"{workspace_id}_instance_{instance_id}.json"
    history_logs = self.history.get(workspace_id, [])
    # Explicit utf-8 keeps output stable regardless of the platform default encoding.
    with open(file_path, "w", encoding="utf-8") as file:
        json.dump(history_logs, file)


def history_recorder():
def decorator(func):
@wraps(func)
def wrapper(self, *args, **kwargs):
output, return_code = func(self, *args, **kwargs)
is_submit_command = False
if hasattr(self, "history_processor") and hasattr(self, "workspace_id"):
command = ""
if hasattr(self, "command"):
is_submit_command = "submit" in self.command
command = self.command + " " + args[0].json()
else:
logger.error(
Expand All @@ -39,6 +64,11 @@ def wrapper(self, *args, **kwargs):
self.history_processor.log_command(
self.workspace_id, command, output, state
)

# save history to file-path once submit command is submitted
if is_submit_command:
self.history_processor.save_history_to_file(self.workspace_id, self.instance_id)

return output, return_code

return wrapper
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
EditFileRequest,
RunCommandOnWorkspace,
RunCommandOnWorkspaceRequest,
GithubCloneCmd,
GithubCloneRequest
)
from composio.local_tools.local_workspace.commons.history_processor import (
HistoryProcessor,
Expand All @@ -29,6 +31,9 @@ def check_simple_implementation():
h = HistoryProcessor()
workspace_id = w.get_workspace_manager(args)

# clone git repo
git_clone = GithubCloneRequest()

# create file
create_file_cmd = CreateFileCmd()
create_file_cmd.set_workspace_and_history(w, h)
Expand Down
76 changes: 76 additions & 0 deletions examples/benchmark/Readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@

# Running a benchmark on the SWE-bench Lite Dataset

This guide provides instructions on how to download and use the SWE-bench Lite dataset from Hugging Face.

## Dataset Description

The SWE-bench Lite dataset is a curated collection of software engineering problems and their solutions. More details about the dataset can be found at the dataset's Hugging Face page:
- [SWE-bench Lite Dataset on Hugging Face](https://huggingface.co/datasets/princeton-nlp/SWE-bench_Lite)

## Getting Started

To use the dataset, you will need to install the `datasets` library from Hugging Face (`pip install datasets`). You can then load and filter the dataset as follows:
```python
from datasets import load_dataset

def filter_short_problem_statements(instance):
    """
    Keep only instances whose problem statement is longer than 40 words.
    """
    word_count = len(instance["problem_statement"].split())
    return word_count > 40

def main():
    """
    Load the SWE-bench Lite dataset and print a sample of filtered entries.
    """
    # Both splits are fetched; only the test split is filtered and shown.
    dev_dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="dev")
    test_dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")

    # Drop instances whose problem statements are too short to be useful.
    filtered_test_dataset = test_dataset.filter(filter_short_problem_statements)

    # Show the first five surviving entries.
    print(filtered_test_dataset[:5])
```
## Dataset Fields

The SWE-bench Lite dataset includes the following fields:

- **instance_id**: A formatted instance identifier, usually as repo_owner__repo_name-PR-number.
- **patch**: The gold patch, the patch generated by the PR (minus test-related code), that resolved the issue.
- **repo**: The repository owner/name identifier from GitHub.
- **base_commit**: The commit hash of the repository representing the HEAD of the repository before the solution PR is applied.
- **hints_text**: Comments made on the issue prior to the creation of the solution PR’s first commit creation date.
- **created_at**: The creation date of the pull request.
- **test_patch**: A test-file patch that was contributed by the solution PR.
- **problem_statement**: The issue title and body.
- **version**: Installation version to use for running evaluation.
- **environment_setup_commit**: The commit hash to use for environment setup and installation.
- **FAIL_TO_PASS**: A json list of strings that represent the set of tests resolved by the PR and tied to the issue resolution.
- **PASS_TO_PASS**: A json list of strings that represent tests that should pass before and after the PR application.

## Running the Task

To run the task and save the history of the run, use the following command:

```shell
python swe/try-swe.yaml --config_path <path/to/config>
```


### Task Configuration

To facilitate the use of the dataset in practical scenarios, a `task_config.yaml` file is used to configure the specifics of the task:
```yaml
backstory: |-
issue_description: |-
repo_name: ""
instance_id: ""
```


51 changes: 51 additions & 0 deletions examples/benchmark/evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import os
import json
from datetime import datetime, timedelta


def evaluate_accuracy_and_check_files(base_path="submit_logs", days_back=1):
    """Scan recent submit-log folders and summarize submission results.

    Walks ``base_path`` for run folders whose timestamp-formatted names sort
    on/after ``days_back`` days ago, counts one submission per ``.json``
    history file (successful when any logged entry's output mentions
    "success"), and tallies ``.patch`` files found alongside them.

    Returns:
        dict with keys ``accuracy``, ``total_submissions``,
        ``successful_submissions`` and ``patch_files_found``.
    """
    # Folder names use the fixed-width %Y-%m-%d_%H-%M-%S format, so plain
    # string comparison orders them chronologically.
    start_date = datetime.now() - timedelta(days=days_back)
    start_folder = start_date.strftime("%Y-%m-%d_%H-%M-%S")

    successful_submissions = 0
    total_submissions = 0
    patch_files_found = 0

    # Walk through the base directory
    for root, _dirs, files in os.walk(base_path):
        # NOTE(review): the base directory's own name is also compared here
        # and may pass the >= test — confirm that only timestamped run
        # folders are expected to contain history files.
        dir_name = os.path.basename(root)
        if dir_name >= start_folder:
            for file in files:
                if file.endswith(".json"):
                    file_path = os.path.join(root, file)
                    with open(file_path, "r", encoding="utf-8") as f:
                        data = json.load(f)
                    # History files are written as a *list* of command entries
                    # (see HistoryProcessor.save_history_to_file); the old
                    # data["output"] lookup raised TypeError on them. A bare
                    # dict is still accepted for robustness.
                    entries = data if isinstance(data, list) else [data]
                    if any(
                        "success" in str(entry.get("output", ""))
                        for entry in entries
                        if isinstance(entry, dict)
                    ):
                        successful_submissions += 1
                    total_submissions += 1
            # Count patch artifacts sitting next to the history files.
            patch_files = [f for f in os.listdir(root) if f.endswith(".patch")]
            patch_files_found += len(patch_files)

    # Guard against division by zero when nothing was submitted.
    accuracy = successful_submissions / total_submissions if total_submissions > 0 else 0

    return {
        "accuracy": accuracy,
        "total_submissions": total_submissions,
        "successful_submissions": successful_submissions,
        "patch_files_found": patch_files_found,
    }


if __name__ == "__main__":
    # Evaluate the default submit_logs directory and report a summary.
    summary = evaluate_accuracy_and_check_files()
    print("Evaluation Results:")
    print(f"Accuracy: {summary['accuracy']:.2f}")
    print(f"Total Submissions: {summary['total_submissions']}")
    print(f"Successful Submissions: {summary['successful_submissions']}")
    print(f"Patch Files Found: {summary['patch_files_found']}")
2 changes: 2 additions & 0 deletions examples/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
datasets==2.19.1
crewai==0.30.11
Loading
Loading