Merge pull request #72 from SamparkAI/shubhra/fix/improve-swe

adding benchmark code
ComposioHQ · May 31, 2024 · 20af347 · 20af347
2 parents 6de6dc3 + 837e601
commit 20af347
Show file tree

Hide file tree

Showing 11 changed files with 378 additions and 11 deletions.
diff --git a/composio/client/enums.py b/composio/client/enums.py
@@ -2436,6 +2436,7 @@ def from_app_and_action(cls, app: str, name: str) -> "Action":
     GETCURRENTDIRCMD = ("cmdmanagertool", "cmdmanagertool_getcurrentdircmd", True, True)
     GITHUBCLONECMD = ("cmdmanagertool", "cmdmanagertool_githubclonecmd", True, True)
     GETWORKSPACEHISTORY = ("historykeeper", "historykeeper_getworkspacehistory", True, True)
+    GithubCloneCmd = ("cmdmanagertool", "cmdmanagertool_githubclonecmd", True, True)
 
 
 

diff --git a/composio/local_tools/local_workspace/cmd_manager/actions/__init__.py b/composio/local_tools/local_workspace/cmd_manager/actions/__init__.py
@@ -0,0 +1,21 @@
+from .cmds import (
+    CreateFileCmd,
+    CreateFileRequest,
+    GoToLineNumInOpenFile,
+    GoToRequest,
+    OpenCmdRequest,
+    OpenFile,
+)
+from .clone_github import GithubCloneCmd, GithubCloneRequest
+from .edit_cmd import EditFile, EditFileRequest
+from .run_cmd import RunCommandOnWorkspace, RunCommandOnWorkspaceRequest
+from .scroll_cmds import Scroll, ScrollRequest
+from .search_cmds import (
+    FindFileCmd,
+    FindFileRequest,
+    GetCurrentDirCmd,
+    SearchDirCmd,
+    SearchDirRequest,
+    SearchFileCmd,
+    SearchFileRequest,
+)
diff --git a/composio/local_tools/local_workspace/cmd_manager/actions/clone_github.py b/composio/local_tools/local_workspace/cmd_manager/actions/clone_github.py
@@ -34,7 +34,7 @@ class GithubCloneCmd(BaseAction):
     """
     Clones a github repository
     """
-
+    _history_maintains: bool = True
     _display_name = "Clone Github Repository Action"
     _request_schema = GithubCloneRequest
     _response_schema = GithubCloneResponse
@@ -45,7 +45,7 @@ def execute(
     ) -> GithubCloneResponse:
         if not request_data.repo_name or not request_data.repo_name.strip():
             raise ValueError(
-                "repo_name can not be null. Give a directory-name in which to search"
+                "repo_name can not be null. Give a repo_name to clone"
             )
 
         if not request_data.github_token or not request_data.github_token.strip():

diff --git a/composio/local_tools/local_workspace/cmd_manager/cmd_manager_tool.py b/composio/local_tools/local_workspace/cmd_manager/cmd_manager_tool.py
@@ -22,6 +22,7 @@
     FindFileCmd,
     SearchDirCmd,
     SearchFileCmd,
+    GithubCloneCmd,
     GetCurrentDirCmd,
 )
 

diff --git a/composio/local_tools/local_workspace/commons/history_processor.py b/composio/local_tools/local_workspace/commons/history_processor.py
@@ -1,15 +1,31 @@
+import os
 from collections import defaultdict
 from functools import wraps
+import json
+from datetime import datetime
+from pathlib import Path
 
 from composio.local_tools.local_workspace.commons.get_logger import get_logger
 
 
 logger = get_logger()
+# Location of this module on disk and the directory that contains it.
+# HistoryProcessor.__init__ creates its per-run timestamped folder directly
+# under script_dir.
+script_path = Path(__file__)
+script_dir = script_path.parent
+# Presumably the intended destination for submit logs, expressed relative to
+# this module's directory (the ".." segments are not resolved here).
+# NOTE(review): nothing in the visible code reads this constant - confirm
+# whether HistoryProcessor should be writing here instead of script_dir.
+submit_logs_dir = script_dir / Path("../../../examples/swe/submit_logs/")
 
 
 class HistoryProcessor:
     def __init__(self):
+        """Set up the in-memory history and a per-run output directory.
+
+        ``self.history`` maps a workspace id to the list of entries logged
+        for it. ``self.base_dir`` is a folder named after the current local
+        time (``%Y-%m-%d_%H-%M-%S``) created under this module's directory;
+        ``save_history_to_file`` writes its JSON files there.
+
+        Raises:
+            Exception: if the directory cannot be created; the underlying
+                error is chained as the cause.
+        """
         self.history = defaultdict(list)
+        # make submit_path directory
+        date_time_folder = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+        base_dir = script_dir / Path(date_time_folder)
+        try:
+            # exist_ok=True replaces the former exists()/makedirs() pair, which
+            # could fail if another process created the same folder between
+            # the check and the creation.
+            os.makedirs(base_dir, exist_ok=True)
+        except Exception as e:
+            raise Exception("error in making submit-path directory") from e
+        self.base_dir = base_dir
 
     def log_command(self, workspace_id, command, output, state):
         entry = {"command": command, "output": output, "state": state}
@@ -19,15 +35,24 @@ def get_history(self, workspace_id, n=5):
         all_history = self.history.get(workspace_id, [])
         return all_history[-n:]
 
+    def save_history_to_file(self, workspace_id: str, instance_id: str):
+        """Dump the recorded history of one workspace to a JSON file.
+
+        The file is named ``<workspace_id>_instance_<instance_id>.json`` and
+        is placed in ``self.base_dir``; an existing file of that name is
+        overwritten. A workspace with no recorded entries yields ``[]``.
+        """
+        target_name = f"{workspace_id}_instance_{instance_id}.json"
+        target = self.base_dir / Path(target_name)
+        entries = self.history.get(workspace_id, [])
+        with open(target, 'w') as handle:
+            handle.write(json.dumps(entries))
+
 
 def history_recorder():
     def decorator(func):
         @wraps(func)
         def wrapper(self, *args, **kwargs):
             output, return_code = func(self, *args, **kwargs)
+            is_submit_command = False
             if hasattr(self, "history_processor") and hasattr(self, "workspace_id"):
                 command = ""
                 if hasattr(self, "command"):
+                    is_submit_command = "submit" in self.command
                     command = self.command + " " + args[0].json()
                 else:
                     logger.error(
@@ -39,6 +64,11 @@ def wrapper(self, *args, **kwargs):
                 self.history_processor.log_command(
                     self.workspace_id, command, output, state
                 )
+
+                # save history to file-path once submit command is submitted
+                if is_submit_command:
+                    self.history_processor.save_history_to_file(self.workspace_id, self.instance_id)
+
             return output, return_code
 
         return wrapper

diff --git a/composio/local_tools/local_workspace/test/check_implementation.py b/composio/local_tools/local_workspace/test/check_implementation.py
@@ -9,6 +9,8 @@
     EditFileRequest,
     RunCommandOnWorkspace,
     RunCommandOnWorkspaceRequest,
+    GithubCloneCmd,
+    GithubCloneRequest
 )
 from composio.local_tools.local_workspace.commons.history_processor import (
     HistoryProcessor,
@@ -29,6 +31,9 @@ def check_simple_implementation():
     h = HistoryProcessor()
     workspace_id = w.get_workspace_manager(args)
 
+    # clone git repo
+    git_clone = GithubCloneRequest()
+
     # create file
     create_file_cmd = CreateFileCmd()
     create_file_cmd.set_workspace_and_history(w, h)

diff --git a/examples/benchmark/Readme.md b/examples/benchmark/Readme.md
@@ -0,0 +1,76 @@
+
+# Running the benchmark on the SWE-bench Lite Dataset
+
+This guide provides instructions on how to download and use the SWE-bench Lite dataset from Hugging Face.
+
+## Dataset Description
+
+The SWE-bench Lite dataset is a curated collection of software engineering problems and their solutions. More details about the dataset can be found at the dataset's Hugging Face page:
+- [SWE-bench Lite Dataset on Hugging Face](https://huggingface.co/datasets/princeton-nlp/SWE-bench_Lite)
+
+## Getting Started
+
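+To use the dataset, you will need to install the `datasets` library from Hugging Face. You can install it using pip:
+```bash
+pip install datasets
+```
+
+Once it is installed, the following script loads the dataset and filters out short problem statements:
+```python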
+from datasets import load_dataset
+
+def filter_short_problem_statements(instance):
+    """
+    Filter predicate that keeps only instances whose problem statement has
+    more than 40 whitespace-separated words (40 words or fewer are excluded).
+    """
+    return len(instance["problem_statement"].split()) > 40
+
+def main():
+    """
+    Main function to load and display entries from the SWE-bench dataset.
+    """
+    # Load the SWE-bench dataset
+    # NOTE: the dev split is loaded for illustration only; nothing below uses it.
+    dev_dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="dev")
+    test_dataset = load_dataset("princeton-nlp/SWE-bench_Lite", split="test")
+
+    # Filter the dataset to include only longer problem statements
+    filtered_test_dataset = test_dataset.filter(filter_short_problem_statements)
+
+    # Display the first few entries of the filtered dataset
+    print(filtered_test_dataset[:5])
+
+if __name__ == "__main__":
+    main()
+```
+## Dataset Fields
+
+The SWE-bench Lite dataset includes the following fields:
+
+- **instance_id**: A formatted instance identifier, usually as repo_owner__repo_name-PR-number.
+- **patch**: The gold patch, the patch generated by the PR (minus test-related code), that resolved the issue.
+- **repo**: The repository owner/name identifier from GitHub.
+- **base_commit**: The commit hash of the repository representing the HEAD of the repository before the solution PR is applied.
+- **hints_text**: Comments made on the issue prior to the creation date of the solution PR’s first commit.
+- **created_at**: The creation date of the pull request.
+- **test_patch**: A test-file patch that was contributed by the solution PR.
+- **problem_statement**: The issue title and body.
+- **version**: Installation version to use for running evaluation.
+- **environment_setup_commit**: The commit hash to use for environment setup and installation.
+- **FAIL_TO_PASS**: A json list of strings that represent the set of tests resolved by the PR and tied to the issue resolution.
+- **PASS_TO_PASS**: A json list of strings that represent tests that should pass before and after the PR application.
+
+## Running the Task
+
+To run the task and save the history of the run, use the following command:
+
+```bash
+python swe/try-swe.yaml --config_path <path/to/config>
+```
+
+
+### Task Configuration
+
+To facilitate the use of the dataset in practical scenarios, a `task_config.yaml` file is used to configure the specifics of the task:
+```yaml
+backstory: |-
+issue_description: |-
+repo_name: ""
+instance_id: ""
+```
+
+
diff --git a/examples/benchmark/evaluation.py b/examples/benchmark/evaluation.py
@@ -0,0 +1,51 @@
+import os
+import json
+from datetime import datetime, timedelta
+
+
+def _submission_succeeded(data):
+    """Return True if a loaded history file contains a "success" output.
+
+    ``data`` may be a single ``{"output": ...}`` mapping or a list of such
+    entries (the shape produced by dumping a list of history entries).
+    Entries that are not mappings, or whose output is not a string, are
+    ignored instead of raising.
+    """
+    entries = data if isinstance(data, list) else [data]
+    for entry in entries:
+        if not isinstance(entry, dict):
+            continue
+        output = entry.get("output")
+        if isinstance(output, str) and "success" in output:
+            return True
+    return False
+
+
+def evaluate_accuracy_and_check_files(base_path="submit_logs", days_back=1):
+    """Summarise the submission logs found under ``base_path``.
+
+    Walks ``base_path`` and inspects every directory whose name sorts at or
+    after the ``%Y-%m-%d_%H-%M-%S`` timestamp of ``days_back`` days ago (a
+    plain string comparison, so it is only meaningful for folders named in
+    that format). Each ``.json`` file in such a directory counts as one
+    submission; it is successful when any of its outputs contains the text
+    "success" (a heuristic carried over from the original implementation).
+
+    Args:
+        base_path: Root directory holding the timestamped log folders.
+        days_back: How many days back from now to include.
+
+    Returns:
+        dict with ``accuracy`` (successful / total, or 0 when there are no
+        submissions), ``total_submissions``, ``successful_submissions`` and
+        ``patch_files_found``.
+    """
+    # Calculate the starting point for checking directories
+    start_date = datetime.now() - timedelta(days=days_back)
+    start_folder = start_date.strftime("%Y-%m-%d_%H-%M-%S")
+
+    successful_submissions = 0
+    total_submissions = 0
+    patch_files_found = 0
+
+    # Walk through the base directory
+    for root, _dirs, files in os.walk(base_path):
+        # Skip directories that sort before the start timestamp
+        if os.path.basename(root) < start_folder:
+            continue
+
+        json_files = [name for name in files if name.endswith('.json')]
+        if not json_files:
+            continue
+
+        for name in json_files:
+            with open(os.path.join(root, name), 'r') as f:
+                data = json.load(f)
+            total_submissions += 1
+            if _submission_succeeded(data):
+                successful_submissions += 1
+
+        # Count patch files once per directory; counting them inside the
+        # per-JSON loop multiplied the total by the number of JSON files.
+        patch_files_found += sum(1 for name in files if name.endswith('.patch'))
+
+    # Calculate accuracy
+    accuracy = successful_submissions / total_submissions if total_submissions > 0 else 0
+
+    # Output results
+    return {
+        "accuracy": accuracy,
+        "total_submissions": total_submissions,
+        "successful_submissions": successful_submissions,
+        "patch_files_found": patch_files_found
+    }
+
+
+if __name__ == "__main__":
+    # Evaluate with the default log location and look-back window, then
+    # print a short human-readable report.
+    summary = evaluate_accuracy_and_check_files()
+    report_lines = [
+        "Evaluation Results:",
+        f"Accuracy: {summary['accuracy']:.2f}",
+        f"Total Submissions: {summary['total_submissions']}",
+        f"Successful Submissions: {summary['successful_submissions']}",
+        f"Patch Files Found: {summary['patch_files_found']}",
+    ]
+    for line in report_lines:
+        print(line)
diff --git a/examples/requirements.txt b/examples/requirements.txt
@@ -0,0 +1,2 @@
+datasets==2.19.1
+crewai==0.30.11