-
Notifications
You must be signed in to change notification settings - Fork 5.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'main' into rysweet-dotnet-ai-ext-abstractions
- Loading branch information
Showing
100 changed files
with
4,663 additions
and
140 deletions.
There are no files selected for viewing
85 changes: 85 additions & 0 deletions
85
dotnet/src/Microsoft.AutoGen/Agents/Agents/AIAgent/AiAgent.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.Globalization; | ||
using System.Text; | ||
using Microsoft.SemanticKernel; | ||
using Microsoft.SemanticKernel.Connectors.OpenAI; | ||
using Microsoft.SemanticKernel.Memory; | ||
|
||
namespace Microsoft.AutoGen.Agents; | ||
/// <summary>
/// Base class for AI agents that keep a typed state, a chat history, and access
/// to a Semantic Kernel instance plus a semantic text memory for retrieval.
/// </summary>
/// <typeparam name="T">The agent-specific state payload type.</typeparam>
public abstract class AiAgent<T> : AgentBase where T : class, new()
{
    protected AgentState<T> _state;
    protected Kernel _kernel;
    private readonly ISemanticTextMemory _memory;

    public AiAgent(IAgentContext context, ISemanticTextMemory memory, Kernel kernel, EventTypes typeRegistry) : base(context, typeRegistry)
    {
        _state = new();
        _memory = memory;
        _kernel = kernel;
    }

    /// <summary>
    /// Appends a message to the chat history, stamped with a 1-based order.
    /// </summary>
    /// <param name="message">The message text to record.</param>
    /// <param name="userType">Which party produced the message.</param>
    public void AddToHistory(string message, ChatUserType userType) => _state.History.Add(new ChatHistoryItem
    {
        Message = message,
        Order = _state.History.Count + 1,
        UserType = userType
    });

    /// <summary>
    /// Records <paramref name="ask"/> as a user turn and returns the entire history
    /// rendered as newline-joined "UserType: Message" lines.
    /// </summary>
    public string AppendChatHistory(string ask)
    {
        AddToHistory(ask, ChatUserType.User);
        return string.Join("\n", _state.History.Select(message => $"{message.UserType}: {message.Message}"));
    }

    /// <summary>
    /// Builds a kernel function from the prompt template, invokes it with the given
    /// arguments, records the completion as an agent turn, and returns it.
    /// </summary>
    /// <param name="template">The prompt template to render and execute.</param>
    /// <param name="arguments">Arguments substituted into the template.</param>
    /// <param name="settings">Optional execution settings; a default (MaxTokens 4096, Temperature 0.8, TopP 1) is used when null.</param>
    public virtual async Task<string> CallFunction(string template, KernelArguments arguments, OpenAIPromptExecutionSettings? settings = null)
    {
        // TODO: extract this to be configurable
        var promptSettings = settings ?? new OpenAIPromptExecutionSettings { MaxTokens = 4096, Temperature = 0.8, TopP = 1 };
        var function = _kernel.CreateFunctionFromPrompt(template, promptSettings);
        // Library code: do not capture the caller's synchronization context.
        // (Was ConfigureAwait(true), which risks deadlocks when callers block.)
        var result = (await _kernel.InvokeAsync(function, arguments).ConfigureAwait(false)).ToString();
        AddToHistory(result, ChatUserType.Agent);
        return result;
    }

    /// <summary>
    /// Searches semantic memory for documents under <paramref name="index"/> and injects
    /// the concatenated results into <paramref name="arguments"/> by expanding the
    /// <c>!index!</c> placeholder inside <paramref name="instruction"/>.
    /// </summary>
    /// <param name="instruction">The instruction string that uses <c>!index!</c> as a placeholder to inject the data. Example: "Consider the following architectural guidelines: !waf!"</param>
    /// <param name="index">Knowledge index (memory collection name); also used as the argument key for the expanded text.</param>
    /// <param name="arguments">The SK arguments; "input" is used as the search query.</param>
    /// <returns>The same <see cref="KernelArguments"/> instance with the knowledge entry added.</returns>
    public async Task<KernelArguments> AddKnowledge(string instruction, string index, KernelArguments arguments)
    {
        // NOTE(review): arguments["input"] may be null; the null-forgiving "!" would then
        // pass a null query to SearchAsync — confirm callers always supply "input".
        var documents = _memory.SearchAsync(index, arguments["input"]?.ToString()!, 5);
        var kbStringBuilder = new StringBuilder();
        await foreach (var doc in documents)
        {
            kbStringBuilder.AppendLine(CultureInfo.InvariantCulture, $"{doc.Metadata.Text}");
        }
        arguments[index] = instruction.Replace($"!{index}!", $"{kbStringBuilder}");
        return arguments;
    }
}
|
||
// TODO Remove history when we introduce memory banks
/// <summary>
/// Serializable container for an agent's conversational history and typed payload.
/// </summary>
/// <typeparam name="T">The agent-specific data type; must be default-constructible.</typeparam>
public class AgentState<T> where T : class, new()
{
    /// <summary>Chat turns accumulated so far, in insertion order.</summary>
    public List<ChatHistoryItem> History { get; set; } = new();

    /// <summary>Agent-specific state payload; initialized to a fresh instance.</summary>
    public T Data { get; set; } = new();
}
|
||
/// <summary>
/// A single turn in an agent's chat history.
/// </summary>
public class ChatHistoryItem
{
    /// <summary>The message text; must be set at construction.</summary>
    public required string Message { get; set; }

    /// <summary>Which party (system, user, or agent) produced this message.</summary>
    public ChatUserType UserType { get; set; }

    /// <summary>1-based position of this message within the history.</summary>
    public int Order { get; set; }
}
|
||
/// <summary>
/// Identifies the author of a chat history entry.
/// </summary>
public enum ChatUserType
{
    /// <summary>System-originated message (e.g. instructions).</summary>
    System,

    /// <summary>Message supplied by the end user.</summary>
    User,

    /// <summary>Message produced by the agent.</summary>
    Agent
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
*/Results/ | ||
*/Tasks/ | ||
*/Downloads/ | ||
*/ENV.json |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
ENV.json |
5 changes: 5 additions & 0 deletions
5
python/packages/agbench/benchmarks/AssistantBench/ENV.json.sample
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
{ | ||
"BING_API_KEY": "YOUR_KEY_KEY", | ||
"HOMEPAGE": "https://www.bing.com/", | ||
"WEB_SURFER_DEBUG_DIR": "/autogen/debug" | ||
} |
78 changes: 78 additions & 0 deletions
78
python/packages/agbench/benchmarks/AssistantBench/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
# AssistantBench Benchmark | ||
|
||
This scenario implements the [AssistantBench](https://assistantbench.github.io/) agent benchmark. Before you begin, make sure you have followed the instructions in `../README.md` to prepare your environment. We modify the evaluation code from AssistantBench in [Scripts](Scripts) and retain the license including it here [LICENSE](Scripts/evaluate_utils/LICENSE). Please find the original AssistantBench evaluation code here [https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation](https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation). | ||
|
||
### Setup Environment Variables for AgBench | ||
|
||
Navigate to AssistantBench | ||
|
||
```bash | ||
cd benchmarks/AssistantBench | ||
``` | ||
|
||
Create a file called ENV.json with the following (required) contents (If you're using MagenticOne) | ||
|
||
```json | ||
{ | ||
"BING_API_KEY": "REPLACE_WITH_YOUR_BING_API_KEY", | ||
"HOMEPAGE": "https://www.bing.com/", | ||
"WEB_SURFER_DEBUG_DIR": "/autogen/debug", | ||
"CHAT_COMPLETION_KWARGS_JSON": "{\"api_version\": \"2024-02-15-preview\", \"azure_endpoint\": \"YOUR_ENDPOINT/\", \"model_capabilities\": {\"function_calling\": true, \"json_output\": true, \"vision\": true}, \"azure_ad_token_provider\": \"DEFAULT\", \"model\": \"gpt-4o-2024-05-13\"}", | ||
"CHAT_COMPLETION_PROVIDER": "azure" | ||
} | ||
``` | ||
|
||
You can also use the openai client by replacing the last two entries in the ENV file by: | ||
|
||
- `CHAT_COMPLETION_PROVIDER='openai'` | ||
- `CHAT_COMPLETION_KWARGS_JSON` with the following JSON structure: | ||
|
||
```json | ||
{ | ||
"api_key": "REPLACE_WITH_YOUR_API", | ||
"model": "gpt-4o-2024-05-13" | ||
} | ||
``` | ||
|
||
Now initialize the tasks. | ||
|
||
```bash | ||
python Scripts/init_tasks.py | ||
``` | ||
|
||
Note: This will attempt to download AssistantBench from Huggingface, but this requires authentication. | ||
|
||
After running the script, you should see the following new folders and files: | ||
|
||
``` | ||
. | ||
./Downloads | ||
./Downloads/AssistantBench | ||
./Downloads/AssistantBench/assistant_bench_v1.0_dev.jsonl | ||
./Downloads/AssistantBench/assistant_bench_v1.0_test.jsonl | ||
./Tasks | ||
./Tasks/assistant_bench_v1.0_dev.jsonl | ||
./Tasks/assistant_bench_v1.0_test.jsonl | ||
``` | ||
|
||
Then run `Scripts/init_tasks.py` again. | ||
|
||
Once the script completes, you should now see a folder in your current directory called `Tasks` that contains one JSONL file per template in `Templates`. | ||
|
||
### Running AssistantBench | ||
|
||
Now to run a specific subset of AssistantBench use: | ||
|
||
```bash | ||
agbench run Tasks/assistant_bench_v1.0_dev__MagenticOne.jsonl | ||
``` | ||
|
||
You should see the command line print the raw logs that show the agents in action. To see a summary of the results (e.g., task completion rates), in a new terminal run the following: | ||
|
||
```bash | ||
agbench tabulate Results/assistant_bench_v1.0_dev__MagenticOne | ||
``` | ||
|
||
## References | ||
|
||
Yoran, Ori, Samuel Joseph Amouyal, Chaitanya Malaviya, Ben Bogin, Ofir Press, and Jonathan Berant. "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?." arXiv preprint arXiv:2407.15711 (2024). https://arxiv.org/abs/2407.15711 |
127 changes: 127 additions & 0 deletions
127
python/packages/agbench/benchmarks/AssistantBench/Scripts/assistantbench_evaluator.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
# This Script is slightly modified from the creators of the AssistantBench dataset https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/evaluation/evaluator.py | ||
import json | ||
from evaluate_utils.evaluate_factory import get_evaluator | ||
import numpy as np | ||
|
||
|
||
def find_isnan(samp):
    """Return True when samp is a scalar NaN, False otherwise (including non-numeric input)."""
    try:
        # np.isnan raises TypeError for non-numeric input, and truth-testing a
        # multi-element array raises ValueError; both mean "not a scalar NaN".
        # Narrowed from a bare `except:` so real errors (e.g. KeyboardInterrupt)
        # are not silently swallowed.
        return bool(np.isnan(samp))
    except (TypeError, ValueError):
        return False
|
||
|
||
def fix_ans(answer):
    """Rewrite single-quoted pseudo-JSON into double-quoted JSON.

    Applies a fixed, ordered sequence of literal substitutions; any failure
    (e.g. a non-string input) returns the input unchanged.
    """
    # Order matters: longer, more specific patterns are replaced first.
    substitutions = (
        ("{'", '{"'),
        ("', '", '", "'),
        ("': '", '": "'),
        ("'}", '"}'),
        ("': ", '": '),
    )
    try:
        for old, new in substitutions:
            answer = answer.replace(old, new)
        return answer
    except:
        return answer
|
||
|
||
def parse_answer(answer):
    """Classify a gold-answer list and return (parsed_value, evaluator_name).

    Single-element answers are tried as a number, then as JSON, then fall back
    to a raw string. Multi-element answers become a JSON list when every
    element parses, otherwise a plain string list.
    """
    if len(answer) != 1:
        # Multi-element: all-or-nothing JSON parse.
        try:
            return [json.loads(fix_ans(ex)) for ex in answer], "json"
        except:
            return answer, "string list"

    single = answer[0]
    value, is_num = fix_number(single)
    if is_num:
        return value, "number"
    try:
        return [json.loads(fix_ans(single))], "json"
    except:
        value, is_num = fix_number(single)
        if is_num:
            return value, "number"
        return single, "string"
|
||
|
||
def fix_number(number):
    """Coerce a possibly string-formatted number to float.

    Returns (value, True) on success, (original, False) for an unparseable
    string. ints are promoted to float; every other type passes through
    unchanged with True.
    """
    if type(number) is str:
        # Strip currency/percent/area markers by splitting on them and
        # re-joining with spaces, then normalize the decimal separator.
        cleaned = " ".join(" ".join(" ".join(number.split("$")).split("%")).split("sqft")).strip()
        cleaned = cleaned.strip()
        cleaned = cleaned.replace(",", ".").replace(" square kilometers", "")
        try:
            return float(cleaned), True
        except:
            return number, False
    if type(number) is int:
        return float(number), True
    return number, True
|
||
|
||
def fix_prediction(prediction, gold_answer, evaluator):
    # Normalize the prediction for the given evaluator; returns (prediction, run_eval)
    # where run_eval=False means the prediction cannot be meaningfully scored.

    # Single-element list that looks numeric: coerce it.
    # NOTE(review): fix_number returns a (value, bool) tuple, so this assigns the
    # tuple itself to `prediction` without unpacking — mirrors the upstream
    # AssistantBench evaluator; confirm upstream before "fixing".
    if (
        type(prediction) == list
        and len(prediction) == 1
        and (type(prediction[0]) == int or ((type(prediction[0]) == str) and prediction[0].isnumeric()))
    ):
        prediction = fix_number(prediction[0])

    if type(prediction) != list:
        prediction, is_num = fix_number(prediction)
        if evaluator == "json":
            # JSON evaluator expects a list; try one JSON object per line,
            # else wrap the scalar in a single-element list.
            try:
                prediction = [json.loads(pred) for pred in prediction.split("\n")]
            except:
                prediction = [prediction]

    # Empty predictions cannot be evaluated.
    if (hasattr(type(prediction), "__len__")) and (len(prediction) == 0):
        return prediction, False

    # A multi-valued prediction against a single numeric gold answer is wrong by construction.
    if (type(prediction) == list and len(prediction) > 1) and type(gold_answer) == float:
        return prediction, False

    return prediction, True
|
||
|
||
def question_scorer(prediction, gold_answer):
    """
    prediction: str or list of str
    gold_answer: str or list of str
    returns a float between 0 and 1
    """
    try:
        # Predictions may arrive JSON-encoded; fall back to the raw value on failure.
        try:
            prediction = json.loads(prediction)
        except:
            prediction = prediction

        # Gold answers may be a newline-separated string or already a list.
        answer_list = (
            [x for x in gold_answer.split("\n") if len(x.strip()) > 0] if type(gold_answer) != list else gold_answer
        )
        gold_answer, evaluator = parse_answer(answer_list)
        prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator)

        # NOTE(review): has_ans is computed but never used below — mirrors the
        # upstream AssistantBench evaluator; confirm whether it should gate scoring.
        has_ans = 1.0
        if (type(prediction) != float and len(prediction) == 0) or find_isnan(prediction):
            has_ans = 0.0

        if not run_eval:
            return 0.0

        # Dispatch to the type-specific metric (number / json / string / string list).
        metric_eval = get_evaluator(evaluator)
        accuracy = metric_eval(prediction, gold_answer)
        # double check if the accuracy is a number between 0 and 1
        if 0 <= accuracy <= 1:
            return accuracy
        else:
            # throw exception
            raise ValueError(f"Accuracy should be a float between 0 and 1, but got {accuracy}")
    except Exception as e:
        # Best-effort scoring: any failure is logged and scored as 0.
        print(
            f"Something went wrong while evaluating prediction {prediction} vs gold answer {gold_answer} with error {e}"
        )
        return 0.0
Oops, something went wrong.