-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into rysweet-dotnet-ai-ext-abstractions
- Loading branch information
Showing
100 changed files
with
4,663 additions
and
140 deletions.
There are no files selected for viewing
85 changes: 85 additions & 0 deletions
85
dotnet/src/Microsoft.AutoGen/Agents/Agents/AIAgent/AiAgent.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.Globalization; | ||
using System.Text; | ||
using Microsoft.SemanticKernel; | ||
using Microsoft.SemanticKernel.Connectors.OpenAI; | ||
using Microsoft.SemanticKernel.Memory; | ||
|
||
namespace Microsoft.AutoGen.Agents; | ||
/// <summary>
/// Base class for AI agents that keep a typed state, a chat history, and access
/// to a Semantic Kernel instance plus a semantic text memory for retrieval.
/// </summary>
/// <typeparam name="T">The agent-specific state payload type.</typeparam>
public abstract class AiAgent<T> : AgentBase where T : class, new()
{
    protected AgentState<T> _state;
    protected Kernel _kernel;
    private readonly ISemanticTextMemory _memory;

    public AiAgent(IAgentContext context, ISemanticTextMemory memory, Kernel kernel, EventTypes typeRegistry) : base(context, typeRegistry)
    {
        _state = new();
        _memory = memory;
        _kernel = kernel;
    }

    /// <summary>
    /// Appends a message to the chat history, stamped with a 1-based order.
    /// </summary>
    /// <param name="message">The message text to record.</param>
    /// <param name="userType">Which party produced the message.</param>
    public void AddToHistory(string message, ChatUserType userType) => _state.History.Add(new ChatHistoryItem
    {
        Message = message,
        Order = _state.History.Count + 1,
        UserType = userType
    });

    /// <summary>
    /// Records <paramref name="ask"/> as a user turn and returns the entire history
    /// rendered as newline-joined "UserType: Message" lines.
    /// </summary>
    public string AppendChatHistory(string ask)
    {
        AddToHistory(ask, ChatUserType.User);
        return string.Join("\n", _state.History.Select(message => $"{message.UserType}: {message.Message}"));
    }

    /// <summary>
    /// Builds a kernel function from the prompt template, invokes it with the given
    /// arguments, records the completion as an agent turn, and returns it.
    /// </summary>
    /// <param name="template">The prompt template to render and execute.</param>
    /// <param name="arguments">Arguments substituted into the template.</param>
    /// <param name="settings">Optional execution settings; a default (MaxTokens 4096, Temperature 0.8, TopP 1) is used when null.</param>
    public virtual async Task<string> CallFunction(string template, KernelArguments arguments, OpenAIPromptExecutionSettings? settings = null)
    {
        // TODO: extract this to be configurable
        var promptSettings = settings ?? new OpenAIPromptExecutionSettings { MaxTokens = 4096, Temperature = 0.8, TopP = 1 };
        var function = _kernel.CreateFunctionFromPrompt(template, promptSettings);
        // Library code: do not capture the caller's synchronization context.
        // (Was ConfigureAwait(true), which risks deadlocks when callers block.)
        var result = (await _kernel.InvokeAsync(function, arguments).ConfigureAwait(false)).ToString();
        AddToHistory(result, ChatUserType.Agent);
        return result;
    }

    /// <summary>
    /// Searches semantic memory for documents under <paramref name="index"/> and injects
    /// the concatenated results into <paramref name="arguments"/> by expanding the
    /// <c>!index!</c> placeholder inside <paramref name="instruction"/>.
    /// </summary>
    /// <param name="instruction">The instruction string that uses <c>!index!</c> as a placeholder to inject the data. Example: "Consider the following architectural guidelines: !waf!"</param>
    /// <param name="index">Knowledge index (memory collection name); also used as the argument key for the expanded text.</param>
    /// <param name="arguments">The SK arguments; "input" is used as the search query.</param>
    /// <returns>The same <see cref="KernelArguments"/> instance with the knowledge entry added.</returns>
    public async Task<KernelArguments> AddKnowledge(string instruction, string index, KernelArguments arguments)
    {
        // NOTE(review): arguments["input"] may be null; the null-forgiving "!" would then
        // pass a null query to SearchAsync — confirm callers always supply "input".
        var documents = _memory.SearchAsync(index, arguments["input"]?.ToString()!, 5);
        var kbStringBuilder = new StringBuilder();
        await foreach (var doc in documents)
        {
            kbStringBuilder.AppendLine(CultureInfo.InvariantCulture, $"{doc.Metadata.Text}");
        }
        arguments[index] = instruction.Replace($"!{index}!", $"{kbStringBuilder}");
        return arguments;
    }
}
|
||
// TODO Remove history when we introduce memory banks
/// <summary>
/// Serializable container for an agent's conversational history and typed payload.
/// </summary>
/// <typeparam name="T">The agent-specific data type; must be default-constructible.</typeparam>
public class AgentState<T> where T : class, new()
{
    /// <summary>Chat turns accumulated so far, in insertion order.</summary>
    public List<ChatHistoryItem> History { get; set; } = new();

    /// <summary>Agent-specific state payload; initialized to a fresh instance.</summary>
    public T Data { get; set; } = new();
}
|
||
/// <summary>
/// A single turn in an agent's chat history.
/// </summary>
public class ChatHistoryItem
{
    /// <summary>The message text; must be set at construction.</summary>
    public required string Message { get; set; }

    /// <summary>Which party (system, user, or agent) produced this message.</summary>
    public ChatUserType UserType { get; set; }

    /// <summary>1-based position of this message within the history.</summary>
    public int Order { get; set; }
}
|
||
/// <summary>
/// Identifies the author of a chat history entry.
/// </summary>
public enum ChatUserType
{
    /// <summary>System-originated message (e.g. instructions).</summary>
    System,

    /// <summary>Message supplied by the end user.</summary>
    User,

    /// <summary>Message produced by the agent.</summary>
    Agent
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
*/Results/ | ||
*/Tasks/ | ||
*/Downloads/ | ||
*/ENV.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
ENV.json |
5 changes: 5 additions & 0 deletions
5
python/packages/agbench/benchmarks/AssistantBench/ENV.json.sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
{ | ||
"BING_API_KEY": "YOUR_KEY_KEY", | ||
"HOMEPAGE": "https://www.bing.com/", | ||
"WEB_SURFER_DEBUG_DIR": "/autogen/debug" | ||
} |
78 changes: 78 additions & 0 deletions
78
python/packages/agbench/benchmarks/AssistantBench/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# AssistantBench Benchmark | ||
|
||
This scenario implements the [AssistantBench](https://assistantbench.github.io/) agent benchmark. Before you begin, make sure you have followed the instructions in `../README.md` to prepare your environment. We modify the evaluation code from AssistantBench in [Scripts](Scripts) and retain the license including it here [LICENSE](Scripts/evaluate_utils/LICENSE). Please find the original AssistantBench evaluation code here [https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation](https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation). | ||
|
||
### Setup Environment Variables for AgBench | ||
|
||
Navigate to AssistantBench | ||
|
||
```bash | ||
cd benchmarks/AssistantBench | ||
``` | ||
|
||
Create a file called ENV.json with the following (required) contents (If you're using MagenticOne) | ||
|
||
```json | ||
{ | ||
"BING_API_KEY": "REPLACE_WITH_YOUR_BING_API_KEY", | ||
"HOMEPAGE": "https://www.bing.com/", | ||
"WEB_SURFER_DEBUG_DIR": "/autogen/debug", | ||
"CHAT_COMPLETION_KWARGS_JSON": "{\"api_version\": \"2024-02-15-preview\", \"azure_endpoint\": \"YOUR_ENDPOINT/\", \"model_capabilities\": {\"function_calling\": true, \"json_output\": true, \"vision\": true}, \"azure_ad_token_provider\": \"DEFAULT\", \"model\": \"gpt-4o-2024-05-13\"}", | ||
"CHAT_COMPLETION_PROVIDER": "azure" | ||
} | ||
``` | ||
|
||
You can also use the openai client by replacing the last two entries in the ENV file by: | ||
|
||
- `CHAT_COMPLETION_PROVIDER='openai'` | ||
- `CHAT_COMPLETION_KWARGS_JSON` with the following JSON structure: | ||
|
||
```json | ||
{ | ||
"api_key": "REPLACE_WITH_YOUR_API", | ||
"model": "gpt-4o-2024-05-13" | ||
} | ||
``` | ||
|
||
Now initialize the tasks. | ||
|
||
```bash | ||
python Scripts/init_tasks.py | ||
``` | ||
|
||
Note: This will attempt to download AssistantBench from Huggingface, but this requires authentication. | ||
|
||
After running the script, you should see the following new folders and files: | ||
|
||
``` | ||
. | ||
./Downloads | ||
./Downloads/AssistantBench | ||
./Downloads/AssistantBench/assistant_bench_v1.0_dev.jsonl | ||
./Downloads/AssistantBench/assistant_bench_v1.0_test.jsonl | ||
./Tasks | ||
./Tasks/assistant_bench_v1.0_dev.jsonl | ||
./Tasks/assistant_bench_v1.0_test.jsonl | ||
``` | ||
|
||
Then run `Scripts/init_tasks.py` again. | ||
|
||
Once the script completes, you should now see a folder in your current directory called `Tasks` that contains one JSONL file per template in `Templates`. | ||
|
||
### Running AssistantBench | ||
|
||
Now to run a specific subset of AssistantBench use: | ||
|
||
```bash | ||
agbench run Tasks/assistant_bench_v1.0_dev__MagenticOne.jsonl | ||
``` | ||
|
||
You should see the command line print the raw logs that show the agents in action. To see a summary of the results (e.g., task completion rates), in a new terminal run the following: | ||
|
||
```bash | ||
agbench tabulate Results/assistant_bench_v1.0_dev__MagenticOne | ||
``` | ||
|
||
## References | ||
|
||
Yoran, Ori, Samuel Joseph Amouyal, Chaitanya Malaviya, Ben Bogin, Ofir Press, and Jonathan Berant. "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?." arXiv preprint arXiv:2407.15711 (2024). https://arxiv.org/abs/2407.15711 |
127 changes: 127 additions & 0 deletions
127
python/packages/agbench/benchmarks/AssistantBench/Scripts/assistantbench_evaluator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
# This Script is slightly modified from the creators of the AssistantBench dataset https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/evaluation/evaluator.py | ||
import json | ||
from evaluate_utils.evaluate_factory import get_evaluator | ||
import numpy as np | ||
|
||
|
||
def find_isnan(samp):
    """Return True when samp is a scalar NaN, False otherwise (including non-numeric input)."""
    try:
        # np.isnan raises TypeError for non-numeric input, and truth-testing a
        # multi-element array raises ValueError; both mean "not a scalar NaN".
        # Narrowed from a bare `except:` so real errors (e.g. KeyboardInterrupt)
        # are not silently swallowed.
        return bool(np.isnan(samp))
    except (TypeError, ValueError):
        return False
|
||
|
||
def fix_ans(answer):
    """Rewrite single-quoted pseudo-JSON into double-quoted JSON.

    Applies a fixed, ordered sequence of literal substitutions; any failure
    (e.g. a non-string input) returns the input unchanged.
    """
    # Order matters: longer, more specific patterns are replaced first.
    substitutions = (
        ("{'", '{"'),
        ("', '", '", "'),
        ("': '", '": "'),
        ("'}", '"}'),
        ("': ", '": '),
    )
    try:
        for old, new in substitutions:
            answer = answer.replace(old, new)
        return answer
    except:
        return answer
|
||
|
||
def parse_answer(answer):
    """Classify a gold-answer list and return (parsed_value, evaluator_name).

    Single-element answers are tried as a number, then as JSON, then fall back
    to a raw string. Multi-element answers become a JSON list when every
    element parses, otherwise a plain string list.
    """
    if len(answer) != 1:
        # Multi-element: all-or-nothing JSON parse.
        try:
            return [json.loads(fix_ans(ex)) for ex in answer], "json"
        except:
            return answer, "string list"

    single = answer[0]
    value, is_num = fix_number(single)
    if is_num:
        return value, "number"
    try:
        return [json.loads(fix_ans(single))], "json"
    except:
        value, is_num = fix_number(single)
        if is_num:
            return value, "number"
        return single, "string"
|
||
|
||
def fix_number(number):
    """Coerce a possibly string-formatted number to float.

    Returns (value, True) on success, (original, False) for an unparseable
    string. ints are promoted to float; every other type passes through
    unchanged with True.
    """
    if type(number) is str:
        # Strip currency/percent/area markers by splitting on them and
        # re-joining with spaces, then normalize the decimal separator.
        cleaned = " ".join(" ".join(" ".join(number.split("$")).split("%")).split("sqft")).strip()
        cleaned = cleaned.strip()
        cleaned = cleaned.replace(",", ".").replace(" square kilometers", "")
        try:
            return float(cleaned), True
        except:
            return number, False
    if type(number) is int:
        return float(number), True
    return number, True
|
||
|
||
def fix_prediction(prediction, gold_answer, evaluator):
    # Normalize the prediction for the given evaluator; returns (prediction, run_eval)
    # where run_eval=False means the prediction cannot be meaningfully scored.

    # Single-element list that looks numeric: coerce it.
    # NOTE(review): fix_number returns a (value, bool) tuple, so this assigns the
    # tuple itself to `prediction` without unpacking — mirrors the upstream
    # AssistantBench evaluator; confirm upstream before "fixing".
    if (
        type(prediction) == list
        and len(prediction) == 1
        and (type(prediction[0]) == int or ((type(prediction[0]) == str) and prediction[0].isnumeric()))
    ):
        prediction = fix_number(prediction[0])

    if type(prediction) != list:
        prediction, is_num = fix_number(prediction)
        if evaluator == "json":
            # JSON evaluator expects a list; try one JSON object per line,
            # else wrap the scalar in a single-element list.
            try:
                prediction = [json.loads(pred) for pred in prediction.split("\n")]
            except:
                prediction = [prediction]

    # Empty predictions cannot be evaluated.
    if (hasattr(type(prediction), "__len__")) and (len(prediction) == 0):
        return prediction, False

    # A multi-valued prediction against a single numeric gold answer is wrong by construction.
    if (type(prediction) == list and len(prediction) > 1) and type(gold_answer) == float:
        return prediction, False

    return prediction, True
|
||
|
||
def question_scorer(prediction, gold_answer):
    """
    prediction: str or list of str
    gold_answer: str or list of str
    returns a float between 0 and 1
    """
    try:
        # Predictions may arrive JSON-encoded; fall back to the raw value on failure.
        try:
            prediction = json.loads(prediction)
        except:
            prediction = prediction

        # Gold answers may be a newline-separated string or already a list.
        answer_list = (
            [x for x in gold_answer.split("\n") if len(x.strip()) > 0] if type(gold_answer) != list else gold_answer
        )
        gold_answer, evaluator = parse_answer(answer_list)
        prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator)

        # NOTE(review): has_ans is computed but never used below — mirrors the
        # upstream AssistantBench evaluator; confirm whether it should gate scoring.
        has_ans = 1.0
        if (type(prediction) != float and len(prediction) == 0) or find_isnan(prediction):
            has_ans = 0.0

        if not run_eval:
            return 0.0

        # Dispatch to the type-specific metric (number / json / string / string list).
        metric_eval = get_evaluator(evaluator)
        accuracy = metric_eval(prediction, gold_answer)
        # double check if the accuracy is a number between 0 and 1
        if 0 <= accuracy <= 1:
            return accuracy
        else:
            # throw exception
            raise ValueError(f"Accuracy should be a float between 0 and 1, but got {accuracy}")
    except Exception as e:
        # Best-effort scoring: any failure is logged and scored as 0.
        print(
            f"Something went wrong while evaluating prediction {prediction} vs gold answer {gold_answer} with error {e}"
        )
        return 0.0
Oops, something went wrong.