Commit

Merge branch 'main' into rysweet-dotnet-ai-ext-abstractions

rysweet authored Oct 18, 2024
2 parents 135018c + e11d84b commit 548cb1d
Showing 100 changed files with 4,663 additions and 140 deletions.
85 changes: 85 additions & 0 deletions dotnet/src/Microsoft.AutoGen/Agents/Agents/AIAgent/AiAgent.cs
@@ -0,0 +1,85 @@
// Copyright (c) Microsoft. All rights reserved.

using System.Globalization;
using System.Text;
using Microsoft.SemanticKernel;
using Microsoft.SemanticKernel.Connectors.OpenAI;
using Microsoft.SemanticKernel.Memory;

namespace Microsoft.AutoGen.Agents;
public abstract class AiAgent<T> : AgentBase where T : class, new()
{
    protected AgentState<T> _state;
    protected Kernel _kernel;
    private readonly ISemanticTextMemory _memory;

    public AiAgent(IAgentContext context, ISemanticTextMemory memory, Kernel kernel, EventTypes typeRegistry) : base(context, typeRegistry)
    {
        _state = new();
        _memory = memory;
        _kernel = kernel;
    }

    public void AddToHistory(string message, ChatUserType userType) => _state.History.Add(new ChatHistoryItem
    {
        Message = message,
        Order = _state.History.Count + 1,
        UserType = userType
    });

    public string AppendChatHistory(string ask)
    {
        AddToHistory(ask, ChatUserType.User);
        return string.Join("\n", _state.History.Select(message => $"{message.UserType}: {message.Message}"));
    }

    public virtual async Task<string> CallFunction(string template, KernelArguments arguments, OpenAIPromptExecutionSettings? settings = null)
    {
        // TODO: extract this to be configurable
        var promptSettings = settings ?? new OpenAIPromptExecutionSettings { MaxTokens = 4096, Temperature = 0.8, TopP = 1 };
        var function = _kernel.CreateFunctionFromPrompt(template, promptSettings);
        var result = (await _kernel.InvokeAsync(function, arguments).ConfigureAwait(true)).ToString();
        AddToHistory(result, ChatUserType.Agent);
        return result;
    }

    /// <summary>
    /// Adds knowledge retrieved from the memory index to the prompt arguments.
    /// </summary>
    /// <param name="instruction">The instruction string that uses !index! as a placeholder to inject the retrieved data. Example: "Consider the following architectural guidelines: !waf!"</param>
    /// <param name="index">Knowledge index to search.</param>
    /// <param name="arguments">The Semantic Kernel arguments; "input" is used as the search query.</param>
    /// <returns>The arguments, with the injected knowledge stored under the index key.</returns>
    public async Task<KernelArguments> AddKnowledge(string instruction, string index, KernelArguments arguments)
    {
        var documents = _memory.SearchAsync(index, arguments["input"]?.ToString()!, 5);
        var kbStringBuilder = new StringBuilder();
        await foreach (var doc in documents)
        {
            kbStringBuilder.AppendLine(CultureInfo.InvariantCulture, $"{doc.Metadata.Text}");
        }
        arguments[index] = instruction.Replace($"!{index}!", $"{kbStringBuilder}");
        return arguments;
    }
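
    // Usage sketch (hypothetical derived agent; "waf", the ask variable, and the
    // prompt strings are made-up examples, not part of this commit): the helpers
    // above can be chained before invoking the model:
    //
    //   var args = new KernelArguments { ["input"] = AppendChatHistory(ask) };
    //   args = await AddKnowledge("Consider the following guidelines: !waf!", "waf", args);
    //   var reply = await CallFunction("{{$waf}}\n{{$input}}", args);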
}

// TODO Remove history when we introduce memory banks
public class AgentState<T> where T : class, new()
{
    public List<ChatHistoryItem> History { get; set; } = [];
    public T Data { get; set; } = new();
}

public class ChatHistoryItem
{
    public required string Message { get; set; }
    public ChatUserType UserType { get; set; }
    public int Order { get; set; }
}

public enum ChatUserType
{
    System,
    User,
    Agent
}
3 changes: 2 additions & 1 deletion dotnet/src/Microsoft.AutoGen/Agents/App.cs
@@ -2,6 +2,8 @@
using Google.Protobuf;
using Microsoft.AspNetCore.Builder;

Check failure on line 3 in dotnet/src/Microsoft.AutoGen/Agents/App.cs (GitHub Actions / Dotnet Build, ubuntu-latest and macos-latest, 3.11): Using directive is unnecessary.

using Microsoft.AutoGen.Runtime;
using Google.Protobuf;

Check failure on line 5 in dotnet/src/Microsoft.AutoGen/Agents/App.cs (GitHub Actions / Dotnet Build, ubuntu-latest and macos-latest, 3.11): Using directive is unnecessary.

using Microsoft.AspNetCore.Builder;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
@@ -39,7 +41,6 @@ public static async ValueTask<WebApplication> StartAsync(AgentTypes? agentTypes
        await app.StartAsync().ConfigureAwait(false);
        return Host;
    }

    public static async ValueTask<WebApplication> PublishMessageAsync(
        string topic,
        IMessage message,
4 changes: 4 additions & 0 deletions python/packages/agbench/benchmarks/.gitignore
@@ -0,0 +1,4 @@
*/Results/
*/Tasks/
*/Downloads/
*/ENV.json
@@ -0,0 +1 @@
ENV.json
@@ -0,0 +1,5 @@
{
  "BING_API_KEY": "YOUR_BING_API_KEY",
  "HOMEPAGE": "https://www.bing.com/",
  "WEB_SURFER_DEBUG_DIR": "/autogen/debug"
}
78 changes: 78 additions & 0 deletions python/packages/agbench/benchmarks/AssistantBench/README.md
@@ -0,0 +1,78 @@
# AssistantBench Benchmark

This scenario implements the [AssistantBench](https://assistantbench.github.io/) agent benchmark. Before you begin, make sure you have followed the instructions in `../README.md` to prepare your environment. We modify the evaluation code from AssistantBench in [Scripts](Scripts) and retain its license at [LICENSE](Scripts/evaluate_utils/LICENSE). The original AssistantBench evaluation code is available at [https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation](https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation).

### Setup Environment Variables for AgBench

Navigate to the AssistantBench directory:

```bash
cd benchmarks/AssistantBench
```

Create a file called ENV.json with the following (required) contents (if you're using MagenticOne):

```json
{
  "BING_API_KEY": "REPLACE_WITH_YOUR_BING_API_KEY",
  "HOMEPAGE": "https://www.bing.com/",
  "WEB_SURFER_DEBUG_DIR": "/autogen/debug",
  "CHAT_COMPLETION_KWARGS_JSON": "{\"api_version\": \"2024-02-15-preview\", \"azure_endpoint\": \"YOUR_ENDPOINT/\", \"model_capabilities\": {\"function_calling\": true, \"json_output\": true, \"vision\": true}, \"azure_ad_token_provider\": \"DEFAULT\", \"model\": \"gpt-4o-2024-05-13\"}",
  "CHAT_COMPLETION_PROVIDER": "azure"
}
```

You can also use the OpenAI client by replacing the last two entries in the ENV file with:

- `CHAT_COMPLETION_PROVIDER='openai'`
- `CHAT_COMPLETION_KWARGS_JSON` with the following JSON structure:

```json
{
  "api_key": "REPLACE_WITH_YOUR_API_KEY",
  "model": "gpt-4o-2024-05-13"
}
```

Now initialize the tasks.

```bash
python Scripts/init_tasks.py
```

Note: this will attempt to download AssistantBench from Hugging Face, which requires authentication.
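
If you have not yet authenticated, one option (assuming the `huggingface_hub` package, which provides the `huggingface-cli` entry point, is installed) is to log in first:

```bash
huggingface-cli login
```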

After running the script, you should see the following new folders and files:

```
.
./Downloads
./Downloads/AssistantBench
./Downloads/AssistantBench/assistant_bench_v1.0_dev.jsonl
./Tasks
./Tasks/assistant_bench_v1.0_dev.jsonl
```

Then run `Scripts/init_tasks.py` again.

Once the script completes, you should now see a folder in your current directory called `Tasks` that contains one JSONL file per template in `Templates`.

### Running AssistantBench

Now, to run a specific subset of AssistantBench, use:

```bash
agbench run Tasks/assistant_bench_v1.0_dev__MagenticOne.jsonl
```

You should see the command line print the raw logs showing the agents in action. To see a summary of the results (e.g., task completion rates), run the following in a new terminal:

```bash
agbench tabulate Results/assistant_bench_v1.0_dev__MagenticOne
```

## References

Yoran, Ori, Samuel Joseph Amouyal, Chaitanya Malaviya, Ben Bogin, Ofir Press, and Jonathan Berant. "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?." arXiv preprint arXiv:2407.15711 (2024). https://arxiv.org/abs/2407.15711
@@ -0,0 +1,127 @@
# This script is slightly modified from the AssistantBench evaluator published by the dataset creators: https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/evaluation/evaluator.py
import json
from evaluate_utils.evaluate_factory import get_evaluator
import numpy as np


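# Returns True if samp is a NaN float; inputs that np.isnan cannot handle count as not-NaN.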
def find_isnan(samp):
    try:
        if np.isnan(samp):
            return True
        else:
            return False
    except:
        return False


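# Best-effort rewrite of a single-quoted Python dict literal into valid JSON.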
def fix_ans(answer):
    try:
        answer = answer.replace("{'", '{"').replace("', '", '", "').replace("': '", '": "').replace("'}", '"}')
        answer = answer.replace("': ", '": ')
        return answer
    except:
        return answer


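# Classifies the gold answer as a number, JSON object(s), a plain string, or a string list.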
def parse_answer(answer):
    if len(answer) == 1:
        ans, is_num = fix_number(answer[0])
        if is_num:
            return ans, "number"
        try:
            ans = json.loads(fix_ans(answer[0]))
            return [ans], "json"
        except:
            ans, is_num = fix_number(answer[0])
            if is_num:
                return ans, "number"
            else:
                return answer[0], "string"
    else:
        try:
            ans = [json.loads(fix_ans(ex)) for ex in answer]
            return ans, "json"
        except:
            return answer, "string list"


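# Strips currency/unit tokens ($, %, sqft, "square kilometers") and tries to coerce the value to float; returns (value, is_number).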
def fix_number(number):
    if type(number) == str:
        copy_ans = number
        copy_ans = " ".join(" ".join(" ".join(copy_ans.split("$")).split("%")).split("sqft")).strip()
        copy_ans = copy_ans.strip()
        copy_ans = copy_ans.replace(",", ".").replace(" square kilometers", "")
        try:
            return float(copy_ans), True
        except:
            return number, False
    elif type(number) == int:
        return float(number), True
    else:
        return number, True


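# Coerces the prediction into the shape the chosen evaluator expects; returns (prediction, run_eval), where run_eval=False means the prediction cannot be scored.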
def fix_prediction(prediction, gold_answer, evaluator):
    if (
        type(prediction) == list
        and len(prediction) == 1
        and (type(prediction[0]) == int or ((type(prediction[0]) == str) and prediction[0].isnumeric()))
    ):
        prediction = fix_number(prediction[0])

    if type(prediction) != list:
        prediction, is_num = fix_number(prediction)
        if evaluator == "json":
            try:
                prediction = [json.loads(pred) for pred in prediction.split("\n")]
            except:
                prediction = [prediction]

    if (hasattr(type(prediction), "__len__")) and (len(prediction) == 0):
        return prediction, False

    if (type(prediction) == list and len(prediction) > 1) and type(gold_answer) == float:
        return prediction, False

    return prediction, True


def question_scorer(prediction, gold_answer):
    """
    prediction: str or list of str
    gold_answer: str or list of str
    returns a float between 0 and 1
    """
    try:
        try:
            prediction = json.loads(prediction)
        except:
            prediction = prediction

        answer_list = (
            [x for x in gold_answer.split("\n") if len(x.strip()) > 0] if type(gold_answer) != list else gold_answer
        )
        gold_answer, evaluator = parse_answer(answer_list)
        prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator)

        has_ans = 1.0
        if (type(prediction) != float and len(prediction) == 0) or find_isnan(prediction):
            has_ans = 0.0

        if not run_eval:
            return 0.0

        metric_eval = get_evaluator(evaluator)
        accuracy = metric_eval(prediction, gold_answer)
        # double check if the accuracy is a number between 0 and 1
        if 0 <= accuracy <= 1:
            return accuracy
        else:
            # throw exception
            raise ValueError(f"Accuracy should be a float between 0 and 1, but got {accuracy}")
    except Exception as e:
        print(
            f"Something went wrong while evaluating prediction {prediction} vs gold answer {gold_answer} with error {e}"
        )
        return 0.0
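
# Usage sketch (illustrative, not part of the original file). question_scorer
# accepts a raw or JSON-encoded prediction plus a gold answer and returns a
# float in [0, 1]. Assuming the "number" evaluator from evaluate_factory gives
# full credit to exact numeric matches:
#
#   question_scorer("42", "42")     # -> 1.0; both sides parse to the number 42.0
#   question_scorer("$1,5", "1.5")  # fix_number strips "$" and maps "," to "."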