diff --git a/dotnet/src/Microsoft.AutoGen/Agents/Agents/AIAgent/AiAgent.cs b/dotnet/src/Microsoft.AutoGen/Agents/Agents/AIAgent/AiAgent.cs new file mode 100644 index 000000000000..d5b4675e8945 --- /dev/null +++ b/dotnet/src/Microsoft.AutoGen/Agents/Agents/AIAgent/AiAgent.cs @@ -0,0 +1,85 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.Globalization; +using System.Text; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.Connectors.OpenAI; +using Microsoft.SemanticKernel.Memory; + +namespace Microsoft.AutoGen.Agents; +public abstract class AiAgent : AgentBase where T : class, new() +{ + protected AgentState _state; + protected Kernel _kernel; + private readonly ISemanticTextMemory _memory; + + public AiAgent(IAgentContext context, ISemanticTextMemory memory, Kernel kernel, EventTypes typeRegistry) : base(context, typeRegistry) + { + _state = new(); + _memory = memory; + _kernel = kernel; + } + + public void AddToHistory(string message, ChatUserType userType) => _state.History.Add(new ChatHistoryItem + { + Message = message, + Order = _state.History.Count + 1, + UserType = userType + }); + + public string AppendChatHistory(string ask) + { + AddToHistory(ask, ChatUserType.User); + return string.Join("\n", _state.History.Select(message => $"{message.UserType}: {message.Message}")); + } + + public virtual async Task CallFunction(string template, KernelArguments arguments, OpenAIPromptExecutionSettings? settings = null) + { + // TODO: extract this to be configurable + var promptSettings = settings ?? new OpenAIPromptExecutionSettings { MaxTokens = 4096, Temperature = 0.8, TopP = 1 }; + var function = _kernel.CreateFunctionFromPrompt(template, promptSettings); + var result = (await _kernel.InvokeAsync(function, arguments).ConfigureAwait(true)).ToString(); + AddToHistory(result, ChatUserType.Agent); + return result; + } + + /// + /// Adds knowledge to the + /// + /// The instruction string that uses the value of !index! as a placeholder to inject the data. 
Example:"Consider the following architectural guidelines: {waf}" + /// Knowledge index + /// The sk arguments, "input" is the argument + /// + public async Task AddKnowledge(string instruction, string index, KernelArguments arguments) + { + var documents = _memory.SearchAsync(index, arguments["input"]?.ToString()!, 5); + var kbStringBuilder = new StringBuilder(); + await foreach (var doc in documents) + { + kbStringBuilder.AppendLine(CultureInfo.InvariantCulture, $"{doc.Metadata.Text}"); + } + arguments[index] = instruction.Replace($"!{index}!", $"{kbStringBuilder}"); + return arguments; + } +} + +// TODO Remove history when we introduce memory banks +public class AgentState where T : class, new() +{ + public List History { get; set; } = []; + public T Data { get; set; } = new(); +} + +public class ChatHistoryItem +{ + public required string Message { get; set; } + public ChatUserType UserType { get; set; } + public int Order { get; set; } +} + +public enum ChatUserType +{ + System, + User, + Agent +} diff --git a/dotnet/src/Microsoft.AutoGen/Agents/App.cs b/dotnet/src/Microsoft.AutoGen/Agents/App.cs index ebacd0876e5f..e2504f739a7a 100644 --- a/dotnet/src/Microsoft.AutoGen/Agents/App.cs +++ b/dotnet/src/Microsoft.AutoGen/Agents/App.cs @@ -2,6 +2,8 @@ using Google.Protobuf; using Microsoft.AspNetCore.Builder; using Microsoft.AutoGen.Runtime; +using Google.Protobuf; +using Microsoft.AspNetCore.Builder; using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Hosting; @@ -39,7 +41,6 @@ public static async ValueTask StartAsync(AgentTypes? agentTypes await app.StartAsync().ConfigureAwait(false); return Host; } - public static async ValueTask PublishMessageAsync( string topic, IMessage message, diff --git a/python/packages/agbench/benchmarks/.gitignore b/python/packages/agbench/benchmarks/.gitignore new file mode 100644 index 000000000000..4fe755350dcb --- /dev/null +++ b/python/packages/agbench/benchmarks/.gitignore @@ -0,0 +1,4 @@ +*/Results/ +*/Tasks/ +*/Downloads/ +*/ENV.json \ No newline at end of file diff --git a/python/packages/agbench/benchmarks/AssistantBench/.gitignore b/python/packages/agbench/benchmarks/AssistantBench/.gitignore new file mode 100644 index 000000000000..f6c9d117b084 --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/.gitignore @@ -0,0 +1 @@ +ENV.json \ No newline at end of file diff --git a/python/packages/agbench/benchmarks/AssistantBench/ENV.json.sample b/python/packages/agbench/benchmarks/AssistantBench/ENV.json.sample new file mode 100644 index 000000000000..1f2c4915e3c7 --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/ENV.json.sample @@ -0,0 +1,5 @@ +{ + "BING_API_KEY": "YOUR_KEY_KEY", + "HOMEPAGE": "https://www.bing.com/", + "WEB_SURFER_DEBUG_DIR": "/autogen/debug" +} diff --git a/python/packages/agbench/benchmarks/AssistantBench/README.md b/python/packages/agbench/benchmarks/AssistantBench/README.md new file mode 100644 index 000000000000..30bcf881fb00 --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/README.md @@ -0,0 +1,78 @@ +# AssistantBench Benchmark + +This scenario implements the [AssistantBench](https://assistantbench.github.io/) agent benchmark. Before you begin, make sure you have followed the instructions in `../README.md` to prepare your environment. We modify the evaluation code from AssistantBench in [Scripts](Scripts) and retain the license including it here [LICENSE](Scripts/evaluate_utils/LICENSE). 
Please find the original AssistantBench evaluation code here [https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation](https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation). + +### Setup Environment Variables for AgBench + +Navigate to AssistantBench + +```bash +cd benchmarks/AssistantBench +``` + +Create a file called ENV.json with the following (required) contents (If you're using MagenticOne) + +```json +{ + "BING_API_KEY": "REPLACE_WITH_YOUR_BING_API_KEY", + "HOMEPAGE": "https://www.bing.com/", + "WEB_SURFER_DEBUG_DIR": "/autogen/debug", + "CHAT_COMPLETION_KWARGS_JSON": "{\"api_version\": \"2024-02-15-preview\", \"azure_endpoint\": \"YOUR_ENDPOINT/\", \"model_capabilities\": {\"function_calling\": true, \"json_output\": true, \"vision\": true}, \"azure_ad_token_provider\": \"DEFAULT\", \"model\": \"gpt-4o-2024-05-13\"}", + "CHAT_COMPLETION_PROVIDER": "azure" +} +``` + +You can also use the openai client by replacing the last two entries in the ENV file by: + +- `CHAT_COMPLETION_PROVIDER='openai'` +- `CHAT_COMPLETION_KWARGS_JSON` with the following JSON structure: + +```json +{ + "api_key": "REPLACE_WITH_YOUR_API", + "model": "gpt-4o-2024-05-13" +} +``` + +Now initialize the tasks. + +```bash +python Scripts/init_tasks.py +``` + +Note: This will attempt to download AssistantBench from Huggingface, but this requires authentication. + +After running the script, you should see the new following folders and files: + +``` +. +./Downloads +./Downloads/AssistantBench +./Downloads/AssistantBench/assistant_bench_v1.0_dev.jsonl +./Downloads/AssistantBench/assistant_bench_v1.0_dev.jsonl +./Tasks +./Tasks/assistant_bench_v1.0_dev.jsonl +./Tasks/assistant_bench_v1.0_dev.jsonl +``` + +Then run `Scripts/init_tasks.py` again. + +Once the script completes, you should now see a folder in your current directory called `Tasks` that contains one JSONL file per template in `Templates`. + +### Running AssistantBench + +Now to run a specific subset of AssistantBench use: + +```bash +agbench run Tasks/assistant_bench_v1.0_dev__MagenticOne.jsonl +``` + +You should see the command line print the raw logs that shows the agents in action To see a summary of the results (e.g., task completion rates), in a new terminal run the following: + +```bash +agbench tabulate Results/assistant_bench_v1.0_dev__MagenticOne +``` + +## References + +Yoran, Ori, Samuel Joseph Amouyal, Chaitanya Malaviya, Ben Bogin, Ofir Press, and Jonathan Berant. "AssistantBench: Can Web Agents Solve Realistic and Time-Consuming Tasks?." arXiv preprint arXiv:2407.15711 (2024). 
https://arxiv.org/abs/2407.15711 diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/assistantbench_evaluator.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/assistantbench_evaluator.py new file mode 100644 index 000000000000..56d1a04faa67 --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Scripts/assistantbench_evaluator.py @@ -0,0 +1,127 @@ +# This Script is slightly modified from the creators of the AssistantBench dataset https://huggingface.co/spaces/AssistantBench/leaderboard/blob/main/evaluation/evaluator.py +import json +from evaluate_utils.evaluate_factory import get_evaluator +import numpy as np + + +def find_isnan(samp): + try: + if np.isnan(samp): + return True + else: + return False + except: + return False + + +def fix_ans(answer): + try: + answer = answer.replace("{'", '{"').replace("', '", '", "').replace("': '", '": "').replace("'}", '"}') + answer = answer.replace("': ", '": ') + return answer + except: + return answer + + +def parse_answer(answer): + if len(answer) == 1: + ans, is_num = fix_number(answer[0]) + if is_num: + return ans, "number" + try: + ans = json.loads(fix_ans(answer[0])) + return [ans], "json" + except: + ans, is_num = fix_number(answer[0]) + if is_num: + return ans, "number" + else: + return answer[0], "string" + else: + try: + ans = [json.loads(fix_ans(ex)) for ex in answer] + return ans, "json" + except: + return answer, "string list" + + +def fix_number(number): + if type(number) == str: + copy_ans = number + copy_ans = " ".join(" ".join(" ".join(copy_ans.split("$")).split("%")).split("sqft")).strip() + copy_ans = copy_ans.strip() + copy_ans = copy_ans.replace(",", ".").replace(" square kilometers", "") + try: + return float(copy_ans), True + except: + return number, False + elif type(number) == int: + return float(number), True + else: + return number, True + + +def fix_prediction(prediction, gold_answer, evaluator): + if ( + type(prediction) == list + and len(prediction) == 1 + and (type(prediction[0]) == int or ((type(prediction[0]) == str) and prediction[0].isnumeric())) + ): + prediction = fix_number(prediction[0]) + + if type(prediction) != list: + prediction, is_num = fix_number(prediction) + if evaluator == "json": + try: + prediction = [json.loads(pred) for pred in prediction.split("\n")] + except: + prediction = [prediction] + + if (hasattr(type(prediction), "__len__")) and (len(prediction) == 0): + return prediction, False + + if (type(prediction) == list and len(prediction) > 1) and type(gold_answer) == float: + return prediction, False + + return prediction, True + + +def question_scorer(prediction, gold_answer): + """ + prediction: str or list of str + gold_answer: str or list of str + + returns a float between 0 and 1 + """ + try: + try: + prediction = json.loads(prediction) + except: + prediction = prediction + + answer_list = ( + [x for x in gold_answer.split("\n") if len(x.strip()) > 0] if type(gold_answer) != list else gold_answer + ) + gold_answer, evaluator = parse_answer(answer_list) + prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator) + + has_ans = 1.0 + if (type(prediction) != float and len(prediction) == 0) or find_isnan(prediction): + has_ans = 0.0 + + if not run_eval: + return 0.0 + + metric_eval = get_evaluator(evaluator) + accuracy = metric_eval(prediction, gold_answer) + # double check if the accuracy is a number between 0 and 1 + if 0 <= accuracy <= 1: + return accuracy + else: + # throw exception + raise ValueError(f"Accuracy should be a float 
between 0 and 1, but got {accuracy}") + except Exception as e: + print( + f"Something went wrong while evaluating prediction {prediction} vs gold answer {gold_answer} with error {e}" + ) + return 0.0 diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/custom_tabulate.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/custom_tabulate.py new file mode 100644 index 000000000000..61c40acc72f8 --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Scripts/custom_tabulate.py @@ -0,0 +1,232 @@ +import os +import sys +import re +from agbench.tabulate_cmd import default_tabulate +import json +import pandas as pd +import sqlite3 +import glob +import numpy as np +sys.path.append(os.path.dirname(__file__)) + +from assistantbench_evaluator import question_scorer + +EXCLUDE_DIR_NAMES = ["__pycache__"] + + +def normalize_answer(a): + # Lower case + # Trim (left and right) + # standardize comma separated values + # Replace multiple spaces with one space + # Remove trailing punctuation + norm_answer = ", ".join(a.strip().lower().split(",")) + norm_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", norm_answer)) + return norm_answer + + +def scorer(instance_dir): + # Read the expected answer + expected_answer_file = os.path.join(instance_dir, "expected_answer.txt") + if not os.path.isfile(expected_answer_file): + return None + + expected_answer = None + with open(expected_answer_file, "rt") as fh: + expected_answer = fh.read().strip() + + # Read the console + console_log_file = os.path.join(instance_dir, "console_log.txt") + if not os.path.isfile(console_log_file): + return None + + console_log = "" + with open(console_log_file, "rt") as fh: + console_log = fh.read() + + final_answer = None + m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL) + if m: + final_answer = m.group(1).strip() + + # Missing the final answer line + if final_answer is None: + return None + # get accuracy from assistantbench util, no normalization done for accuracy + accuracy = question_scorer(final_answer, expected_answer) + n_ex = normalize_answer(expected_answer) + n_final = normalize_answer(final_answer) + return (accuracy, n_ex, n_final) + + +def get_number_of_chat_messages(chat_messages_dir): + result = 0 + for file in glob.glob(f"{chat_messages_dir}/*_messages.json"): + with open(file, "r") as f: + content = json.load(f) + for agent, messages in content.items(): + result += len(messages) + return result + + +def main(args): + parsed_args, all_results = default_tabulate(args, scorer=scorer) + excel_path = parsed_args.excel + + if excel_path: + excel_dir = os.path.dirname(excel_path) or "." 
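+ # Make sure the report's parent directory exists before writing; the workbook built below gets one sheet per trial, with one row per task summarizing status, cost, latency, and token usage drawn from that run's telemetry.db.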
+ if not os.path.exists(excel_dir): + os.makedirs(excel_dir, exist_ok=True) + + if not excel_path.endswith((".xlsx", ".xls")): + excel_path += ".xlsx" + + runlogs = ( + parsed_args.runlogs + if parsed_args.runlogs.endswith("/") + else parsed_args.runlogs + "/" + ) + + if os.path.isdir(runlogs): + task_ids = sorted( + [ + task_id + for task_id in os.listdir(runlogs) + if task_id not in EXCLUDE_DIR_NAMES + ], + key=lambda s: os.path.getmtime(os.path.join(parsed_args.runlogs, s)), + ) + else: + raise ValueError("please input a valid directory to tabulate result") + + trials = ( + sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x)) + if len(task_ids) > 0 + else [] + ) + dbnames = [ + [f"{runlogs}{task_id}/{trial}/telemetry.db" for task_id in task_ids] + for trial in trials + ] + + query = """ + SELECT cost, session_id, response, start_time, end_time + FROM ( + SELECT invocation_id, cost, session_id, response, start_time, end_time, + ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) as rn + FROM chat_completions + ) + WHERE rn = 1; + """ + + with pd.ExcelWriter(excel_path, engine="openpyxl") as writer: + for trial_index, each_trial in enumerate(dbnames): + result_df = pd.DataFrame( + columns=[ + "id", + "status", + "expected_answer", + "final_answer", + "cost", + "latency", + "num_of_llm_requests", + "num_of_chat_messages", + "prompt_tokens", + "completion_tokens", + "total_tokens", + "model", + ] + ) + + result_df_type_mapping = { + "id": str, + "status": bool, + "expected_answer": str, + "final_answer": str, + "cost": float, + "latency": float, + "num_of_llm_requests": int, + "num_of_chat_messages": int, + "prompt_tokens": int, + "completion_tokens": int, + "total_tokens": int, + } + + for dbname, scorer_results in zip(each_trial, all_results): + task_id = scorer_results[0] + scorer_result = scorer_results[trial_index + 1] + + status, expected_answer, final_answer = ( + scorer_result if scorer_result else (False, "", "") + ) + + con = sqlite3.connect(dbname) + + # TODO: if large amount of data, add chunksize + telemetry_df = pd.read_sql_query(query, con) + + earliest_starttime = pd.to_datetime( + telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f" + ).min() + latest_endtime = pd.to_datetime( + telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f" + ).max() + + num_of_chat_messages = get_number_of_chat_messages( + chat_messages_dir=os.path.dirname(dbname) + ) + result = { + "id": task_id, + "status": status, + "expected_answer": expected_answer, + "final_answer": final_answer, + "cost": telemetry_df["cost"].sum(), + "latency": ( + latest_endtime - earliest_starttime + ).total_seconds(), + "num_of_llm_requests": len(telemetry_df), + "num_of_chat_messages": num_of_chat_messages, + "prompt_tokens": telemetry_df["response"] + .apply( + lambda x: json.loads(x)["usage"]["prompt_tokens"] + if "usage" in json.loads(x) + and "prompt_tokens" in json.loads(x)["usage"] + else 0 + ) + .sum(), + "completion_tokens": telemetry_df["response"] + .apply( + lambda x: json.loads(x)["usage"]["completion_tokens"] + if "usage" in json.loads(x) + and "completion_tokens" in json.loads(x)["usage"] + else 0 + ) + .sum(), + "total_tokens": telemetry_df["response"] + .apply( + lambda x: json.loads(x)["usage"]["total_tokens"] + if "usage" in json.loads(x) + and "total_tokens" in json.loads(x)["usage"] + else 0 + ) + .sum(), + "model": telemetry_df["response"] + .apply( + lambda x: json.loads(x)["model"] + if "model" in json.loads(x) + else "" + ) + .unique(), + } + + result_df = 
result_df.astype(result_df_type_mapping) + result_df = pd.concat( + [result_df, pd.DataFrame([result])], ignore_index=True + ) + result_df.to_excel( + writer, sheet_name=f"trial_{trial_index}", index=False + ) + + +if __name__ == "__main__" and __package__ is None: + main(sys.argv) diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/LICENSE b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/LICENSE new file mode 100644 index 000000000000..f49a4e16e68b --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/__init__.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_dicts.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_dicts.py new file mode 100644 index 000000000000..9ce61c8cea99 --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_dicts.py @@ -0,0 +1,71 @@ +# From AssistantBench modified slightly. 
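+# Scores dictionary answers: recall is computed over the gold keys (precision by swapping the arguments), the two are combined into an F1 per dict pair, and lists of dicts are aligned one-to-one via _align_bags before averaging.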
+from typing import Dict, List +import numpy as np + +from .utils import _align_bags + + +def calculate_f1_score(precision, recall): + if precision + recall == 0: + return 0 # Handle the case to avoid division by zero + return 2 * (precision * recall) / (precision + recall) + + +def calc_recall(pred: Dict, gold: Dict, use_gold_for_eval: bool): + from .evaluate_factory import get_evaluator_from_gold_answer + + recall = [] + for gold_key, gold_value in gold.items(): + pred_value = pred.get(gold_key) + gold_value = fix_number(gold_value) + pred_value = fix_number(pred_value) + if gold_key not in pred: + recall.append(0) + else: + evaluator = ( + get_evaluator_from_gold_answer(type(gold_value)) + if use_gold_for_eval + else get_evaluator_from_gold_answer(type(pred_value)) + ) + if type(pred_value) != type(gold_value): + recall.append(0) + continue + recall.append(evaluator(pred_value, gold_value)) + avg_recall = np.average(recall) + return avg_recall + + +def fix_number(number): + if type(number) == str: + copy_ans = number + copy_ans = " ".join( + " ".join(" ".join(copy_ans.split("$")).split("%")).split("sqft") + ).strip() + copy_ans = copy_ans.strip() + copy_ans = copy_ans.replace(",", ".") + try: + return float(copy_ans) + except: + return number + elif type(number) == int: + return float(number) + else: + return number + + +def evaluate_pair_of_dicts(pred: Dict, gold: Dict): + recall = calc_recall(pred, gold, True) + precision = calc_recall(gold, pred, False) + f1 = calculate_f1_score(precision, recall) + return f1 + + +def evaluate_dicts(pred: List[Dict], gold: List[Dict]): + if not ( + type(pred) == dict + or len(pred) == 0 + or (type(pred) == list and type(pred[0]) == dict) + ): + return 0 + max_alignment_scores = _align_bags(pred, gold, evaluate_pair_of_dicts) + return np.average(max_alignment_scores) diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_factory.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_factory.py new file mode 100644 index 000000000000..6a63c0a26eeb --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_factory.py @@ -0,0 +1,30 @@ +#From AssistantBench modified slightly. + +from typing import Union, Dict + +from .evaluate_dicts import evaluate_dicts +from .evaluate_numbers import evaluate_numbers +from .evaluate_strings import evaluate_strings + +EvaluatorFactory = { + "string": evaluate_strings, + "number": evaluate_numbers, + "json": evaluate_dicts, + "string list": evaluate_strings, +} + +EvaluatorFactoryFromType = { + str: evaluate_strings, + int: evaluate_numbers, + float: evaluate_numbers, + bool: evaluate_strings, + list: evaluate_strings, +} + + +def get_evaluator(evaluator: str): + return EvaluatorFactory[evaluator] + + +def get_evaluator_from_gold_answer(gold_answer: Union[str, int, float]): + return EvaluatorFactoryFromType[gold_answer] diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_numbers.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_numbers.py new file mode 100644 index 000000000000..74a51b512653 --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_numbers.py @@ -0,0 +1,35 @@ +#From AssistantBench modified slightly. 
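+# Scores numeric answers with a symmetric log-ratio distance: an exact match scores 1, the score falls toward 0 as the ratio between prediction and gold grows, and non-numeric inputs score 0.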
+ +from typing import Union +import numpy as np + + +# Renamed calc_z function to distance_function_log +def distance_function_log(pred: float, gold: float): + if pred == gold == 0: + return 1 + if pred == 0: + pred = 1e-4 + if gold == 0: + gold = 1e-4 + if pred > gold: + return max(0, 1 - np.log(pred / gold)) + else: + return max(0, 1 - np.log(gold / pred)) + + +def evaluate_numbers(pred: Union[float, str], gold: float): + res = None + if type(pred) != float and type(pred) != int: + try: + pred = float(pred) + except ValueError: + res = 0 + if type(gold) != float and type(gold) != int: + try: + gold = float(gold) + except ValueError: + res = 0 + if res is None: + res = distance_function_log(pred, gold) + return res diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_strings.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_strings.py new file mode 100644 index 000000000000..301eff3b7764 --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/evaluate_strings.py @@ -0,0 +1,180 @@ +""" +From AssistantBench modified slightly. +Evaluation for two strings or list of strings. + +Code taken from the DROP benchmark - https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py +""" + +from collections import defaultdict +from typing import List, Set, Tuple, Union +import string +import re +import numpy as np +from scipy.optimize import linear_sum_assignment + + +# From here through _normalize_answer was originally copied from: +# https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/ +# Then cleaned up and modified a bit. +def _remove_articles(text: str) -> str: + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + return re.sub(regex, " ", text) + + +def _white_space_fix(text: str) -> str: + return " ".join(text.split()) + + +EXCLUDE = set(string.punctuation) + + +def _remove_punc(text: str) -> str: + if not _is_number(text): + return "".join(ch for ch in text if ch not in EXCLUDE) + else: + return text + + +def _lower(text: str) -> str: + return text.lower() + + +def _tokenize(text: str) -> List[str]: + return re.split(" |-", text) + + +def _normalize_answer(text: str) -> str: + """Lower text and remove punctuation, articles and extra whitespace.""" + + parts = [ + _white_space_fix( + _remove_articles(_normalize_number(_remove_punc(_lower(token)))) + ) + for token in _tokenize(text) + ] + parts = [part for part in parts if part.strip()] + normalized = " ".join(parts).strip() + return normalized + + +def _is_number(text: str) -> bool: + try: + float(text) + return True + except ValueError: + return False + + +def _normalize_number(text: str) -> str: + if _is_number(text): + return str(float(text)) + else: + return text + + +def _answer_to_bags( + answer: Union[str, List[str], Tuple[str, ...]], +) -> Tuple[List[str], List[Set[str]]]: + if isinstance(answer, (list, tuple)): + raw_spans = answer + else: + raw_spans = [answer] + normalized_spans: List[str] = [] + token_bags = [] + for raw_span in raw_spans: + normalized_span = _normalize_answer(raw_span) + normalized_spans.append(normalized_span) + token_bags.append(set(normalized_span.split())) + return normalized_spans, token_bags + + +def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]: + """ + Takes gold and predicted answer sets and first finds the optimal 1-1 alignment + between them and gets maximum metric values over all 
the answers. + """ + scores = np.zeros([len(gold), len(predicted)]) + for gold_index, gold_item in enumerate(gold): + for pred_index, pred_item in enumerate(predicted): + if _match_numbers_if_present(gold_item, pred_item): + scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item) + row_ind, col_ind = linear_sum_assignment(-scores) + + max_scores = np.zeros([max(len(gold), len(predicted))]) + for row, column in zip(row_ind, col_ind): + max_scores[row] = max(max_scores[row], scores[row, column]) + return max_scores + + +def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float: + intersection = len(gold_bag.intersection(predicted_bag)) + if not predicted_bag: + precision = 1.0 + else: + precision = intersection / float(len(predicted_bag)) + if not gold_bag: + recall = 1.0 + else: + recall = intersection / float(len(gold_bag)) + f1 = ( + (2 * precision * recall) / (precision + recall) + if not (precision == 0.0 and recall == 0.0) + else 0.0 + ) + return f1 + + +def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool: + gold_numbers = set() + predicted_numbers = set() + for word in gold_bag: + if _is_number(word): + gold_numbers.add(word) + for word in predicted_bag: + if _is_number(word): + predicted_numbers.add(word) + if (not gold_numbers) or gold_numbers.intersection(predicted_numbers): + return True + return False + + +def get_metrics( + predicted: Union[str, List[str], Tuple[str, ...]], + gold: Union[str, List[str], Tuple[str, ...]], +) -> Tuple[float, float]: + """ + Takes a predicted answer and a gold answer (that are both either a string or a list of + strings), and returns exact match and the DROP F1 metric for the prediction. If you are + writing a script for evaluating objects in memory (say, the output of predictions during + validation, or while training), this is the function you want to call, after using + :func:`answer_json_to_strings` when reading the gold answer from the released data file. + """ + predicted_bags = _answer_to_bags(predicted) + gold_bags = _answer_to_bags(gold) + + if set(predicted_bags[0]) == set(gold_bags[0]) and len(predicted_bags[0]) == len( + gold_bags[0] + ): + exact_match = 1.0 + else: + exact_match = 0.0 + + f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1]) + f1 = np.mean(f1_per_bag) + f1 = round(f1, 2) + return exact_match, f1 + + +def evaluate_strings(prediction, gold): + if type(prediction) != list and type(prediction) != str: + prediction = str(prediction) + if type(gold) != list and type(gold) != str: + gold = str(gold) + try: + predicted_bags = _answer_to_bags(prediction) + gold_bags = _answer_to_bags(gold) + f1_per_bag = _align_bags(predicted_bags[1], gold_bags[1]) + f1 = np.mean(f1_per_bag) + except Exception: + f1 = 0.0 + return f1 diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/readme.md b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/readme.md new file mode 100644 index 000000000000..733706ff4eeb --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/readme.md @@ -0,0 +1 @@ +These files were obtained from the creators of the AssistantBench benchmark and modified slightly. 
You can find the latest version at [https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation](https://huggingface.co/spaces/AssistantBench/leaderboard/tree/main/evaluation) diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/utils.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/utils.py new file mode 100644 index 000000000000..ea55f392a55a --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Scripts/evaluate_utils/utils.py @@ -0,0 +1,24 @@ +from typing import List, Set, Tuple, Union, Callable +import numpy as np +from scipy.optimize import linear_sum_assignment + + +def _align_bags( + predicted: List[Set[str]], + gold: List[Set[str]], + method: Callable[[object, object], float], +) -> List[float]: + """ + Takes gold and predicted answer sets and first finds the optimal 1-1 alignment + between them and gets maximum metric values over all the answers. + """ + scores = np.zeros([len(gold), len(predicted)]) + for gold_index, gold_item in enumerate(gold): + for pred_index, pred_item in enumerate(predicted): + scores[gold_index, pred_index] = method(pred_item, gold_item) + row_ind, col_ind = linear_sum_assignment(-scores) + + max_scores = np.zeros([max(len(gold), len(predicted))]) + for row, column in zip(row_ind, col_ind): + max_scores[row] = max(max_scores[row], scores[row, column]) + return max_scores diff --git a/python/packages/agbench/benchmarks/AssistantBench/Scripts/init_tasks.py b/python/packages/agbench/benchmarks/AssistantBench/Scripts/init_tasks.py new file mode 100644 index 000000000000..752739cb6d9f --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Scripts/init_tasks.py @@ -0,0 +1,93 @@ +import json +import os +import re +import sys + +from huggingface_hub import snapshot_download + +SCRIPT_PATH = os.path.realpath(__file__) +SCRIPT_NAME = os.path.basename(SCRIPT_PATH) +SCRIPT_DIR = os.path.dirname(SCRIPT_PATH) + +SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir)) +TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates") +TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks") +DOWNLOADS_DIR = os.path.join(SCENARIO_DIR, "Downloads") +REPO_DIR = os.path.join(DOWNLOADS_DIR, "AssistantBench") + + +def download_assistantbench(): + """Download the AssistantBench benchmark from Hugging Face.""" + + if not os.path.isdir(DOWNLOADS_DIR): + os.mkdir(DOWNLOADS_DIR) + + """Download the AssistantBench dataset from Hugging Face Hub""" + snapshot_download( + repo_id="AssistantBench/AssistantBench", + repo_type="dataset", + local_dir=REPO_DIR, + local_dir_use_symlinks=True, + ) + + +def create_jsonl(data_file_path, file_name, template): + """Creates a JSONL scenario file with a given name, and template path.""" + tasks = [] + with open(data_file_path) as fh: + for line in fh: + data = json.loads(line) + tasks.append(data) + file_name = os.path.basename(file_name) + if not os.path.isdir(TASKS_DIR): + os.mkdir(TASKS_DIR) + + with open(os.path.join(TASKS_DIR, file_name), "wt") as fh: + for task in tasks: + if "answer" not in task or task["answer"] is None: + task["answer"] = "" + print(f"Converting: [{file_name}] {task['id']}") + template_cp_list = [template] + record = { + "id": task["id"], + "template": template_cp_list, + "substitutions": { + "scenario.py": { + "__FILE_NAME__": "", + }, + "expected_answer.txt": {"__EXPECTED_ANSWER__": task["answer"]}, + "prompt.txt": {"__PROMPT__": task["task"]}, + }, + "difficulty": task["difficulty"], + "explanation": 
task["explanation"], + "metadata": task["metadata"], + "gold_url": task["gold_url"], + "set": task["set"], + } + fh.write(json.dumps(record).strip() + "\n") + + +############################################################################### +def main(): + ab_validation_files = os.path.join(REPO_DIR, "assistant_bench_v1.0_dev.jsonl") + ab_test_files = os.path.join(REPO_DIR, "assistant_bench_v1.0_test.jsonl") + + if not os.path.isfile(ab_validation_files) or not os.path.isfile(ab_test_files): + download_assistantbench() + + if not os.path.isfile(ab_validation_files) or not os.path.isfile(ab_test_files): + sys.exit(f"Error: '{REPO_DIR}' does not appear to be a copy of the AssistantBench repository.") + + templates = {} + for entry in os.scandir(TEMPLATES_DIR): + if entry.is_dir(): + templates[re.sub(r"\s", "", entry.name)] = entry.path + print(templates) + # make a copy of the data in the Tasks directory + for t in templates.items(): + create_jsonl(ab_validation_files, f"assistant_bench_v1.0_dev__{t[0]}.jsonl", t[1]) + create_jsonl(ab_test_files, f"assistant_bench_v1.0_test__{t[0]}.jsonl", t[1]) + + +if __name__ == "__main__" and __package__ is None: + main() diff --git a/python/packages/agbench/benchmarks/AssistantBench/Templates/MagenticOne/expected_answer.txt b/python/packages/agbench/benchmarks/AssistantBench/Templates/MagenticOne/expected_answer.txt new file mode 100644 index 000000000000..8153c2bf8242 --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Templates/MagenticOne/expected_answer.txt @@ -0,0 +1 @@ +__EXPECTED_ANSWER__ diff --git a/python/packages/agbench/benchmarks/AssistantBench/Templates/MagenticOne/prompt.txt b/python/packages/agbench/benchmarks/AssistantBench/Templates/MagenticOne/prompt.txt new file mode 100644 index 000000000000..482f50dca311 --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Templates/MagenticOne/prompt.txt @@ -0,0 +1 @@ +__PROMPT__ diff --git a/python/packages/agbench/benchmarks/AssistantBench/Templates/MagenticOne/requirements.txt b/python/packages/agbench/benchmarks/AssistantBench/Templates/MagenticOne/requirements.txt new file mode 100644 index 000000000000..6004841b72dc --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Templates/MagenticOne/requirements.txt @@ -0,0 +1,4 @@ +/autogen_python/packages/autogen-core +/autogen_python/packages/autogen-magentic-one +azure-identity +tiktoken diff --git a/python/packages/agbench/benchmarks/AssistantBench/Templates/MagenticOne/scenario.py b/python/packages/agbench/benchmarks/AssistantBench/Templates/MagenticOne/scenario.py new file mode 100644 index 000000000000..3d79a16614a4 --- /dev/null +++ b/python/packages/agbench/benchmarks/AssistantBench/Templates/MagenticOne/scenario.py @@ -0,0 +1,219 @@ +import asyncio +import logging +import os +import re +import tiktoken + +from openai import AzureOpenAI + +from typing import List + +from autogen_core.base import AgentId, AgentProxy, TopicId +from autogen_core.application import SingleThreadedAgentRuntime +from autogen_core.application.logging import EVENT_LOGGER_NAME +from autogen_core.components.models import ( + AzureOpenAIChatCompletionClient, + ChatCompletionClient, + ModelCapabilities, + UserMessage, + LLMMessage, +) +from autogen_core.components import DefaultSubscription, DefaultTopicId +from autogen_core.components.code_executor import LocalCommandLineCodeExecutor +from autogen_core.components.models import AssistantMessage + +from autogen_magentic_one.markdown_browser import MarkdownConverter, 
UnsupportedFormatException +from autogen_magentic_one.agents.coder import Coder, Executor +from autogen_magentic_one.agents.orchestrator import LedgerOrchestrator +from autogen_magentic_one.messages import BroadcastMessage +from autogen_magentic_one.agents.multimodal_web_surfer import MultimodalWebSurfer +from autogen_magentic_one.agents.file_surfer import FileSurfer +from autogen_magentic_one.utils import LogHandler, message_content_to_str, create_completion_client_from_env + +encoding = None +def count_token(value: str) -> int: + # TODO:: Migrate to model_client.count_tokens + global encoding + if encoding is None: + encoding = tiktoken.encoding_for_model("gpt-4o-2024-05-13") + return len(encoding.encode(value)) + +async def response_preparer(task: str, source: str, client: ChatCompletionClient, transcript: List[LLMMessage]) -> str: + messages: List[LLMMessage] = [] + + # copy them to this context + for message in transcript: + messages.append( + UserMessage( + content = message_content_to_str(message.content), + # TODO fix this -> remove type ignore + source=message.source, # type: ignore + ) + ) + + # Remove messages until we are within 2k of the context window limit + while len(messages) and client.remaining_tokens( messages ) < 2000: + messages.pop(0) + + # Add the preamble + messages.insert(0, + UserMessage( + content=f"Earlier you were asked the following:\n\n{task}\n\nYour team then worked diligently to address that request. Here is a transcript of that conversation:", + source=source, + ) + ) + + # ask for the final answer + messages.append( + UserMessage( + content= f""" +Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience: + +{task} + +To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER] +Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. +ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.) +If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise. +If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'. +If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings. +If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine' +""", + source=source, + ) + ) + + + response = await client.create(messages) + assert isinstance(response.content, str) + + # No answer + if "unable to determine" in response.content.lower(): + messages.append( AssistantMessage(content=response.content, source="self" ) ) + messages.append( + UserMessage( + content= f""" +I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation. + +To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS] +Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc. 
+ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.) +If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise. +If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'. +If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings. +""".strip(), + source=source, + ) + ) + + response = await client.create(messages) + assert isinstance(response.content, str) + return re.sub(r"EDUCATED GUESS:", "FINAL ANSWER:", response.content) + + else: + return response.content + + +async def main() -> None: + # Read the prompt + prompt = "" + with open("prompt.txt", "rt") as fh: + prompt = fh.read().strip() + filename = "__FILE_NAME__".strip() + + # Create the runtime. + runtime = SingleThreadedAgentRuntime() + + # Create the AzureOpenAI client from the environment file + client = create_completion_client_from_env() + + + mlm_client = create_completion_client_from_env() + + + # Register agents. + await runtime.register( + "Assistant", + lambda: Coder(model_client=client), + subscriptions=lambda: [DefaultSubscription()], + ) + coder = AgentProxy(AgentId("Assistant", "default"), runtime) + + await runtime.register( + "ComputerTerminal", + lambda: Executor(executor=LocalCommandLineCodeExecutor(), confirm_execution="ACCEPT_ALL"), + subscriptions=lambda: [DefaultSubscription()], + ) + executor = AgentProxy(AgentId("ComputerTerminal", "default"), runtime) + + await runtime.register( + "FileSurfer", + lambda: FileSurfer(model_client=client), + subscriptions=lambda: [DefaultSubscription()], + ) + file_surfer = AgentProxy(AgentId("FileSurfer", "default"), runtime) + + await runtime.register( + "WebSurfer", + lambda: MultimodalWebSurfer(), # Configuration is set later by init() + subscriptions=lambda: [DefaultSubscription()], + ) + web_surfer = AgentProxy(AgentId("WebSurfer", "default"), runtime) + + await runtime.register("Orchestrator", lambda: LedgerOrchestrator( + agents=[coder, executor, file_surfer, web_surfer], + model_client=client, + max_rounds=30, + max_time=25*60, + ), + subscriptions=lambda: [DefaultSubscription()], + ) + orchestrator = AgentProxy(AgentId("Orchestrator", "default"), runtime) + + runtime.start() + + actual_surfer = await runtime.try_get_underlying_agent_instance(web_surfer.id, type=MultimodalWebSurfer) + await actual_surfer.init(model_client=client, downloads_folder=os.getcwd(), browser_channel="chromium") + + filename_prompt = "" + if len(filename) > 0: + #relpath = os.path.join("coding", filename) + #file_uri = pathlib.Path(os.path.abspath(os.path.expanduser(relpath))).as_uri() + + filename_prompt = f"The question is about a file, document or image, which can be accessed by the filename '{filename}' in the current working directory." + + mlm_prompt = f"""Write a detailed caption for this image. 
Pay special attention to any details that might be useful for someone answering the following: + +{prompt} +""".strip() + + try: + mdconverter = MarkdownConverter(mlm_client=mlm_client, mlm_model="gpt-4o-2024-05-13") + res = mdconverter.convert(filename, mlm_prompt=mlm_prompt) + if res.text_content: + if count_token(res.text_content) < 8000: # Don't put overly-large documents into the prompt + filename_prompt += "\n\nHere are the file's contents:\n\n" + res.text_content + except UnsupportedFormatException: + pass + + task = f"{prompt}\n\n{filename_prompt}" + + await runtime.publish_message( + BroadcastMessage(content=UserMessage(content=task.strip(), source="human")), + topic_id=DefaultTopicId(), + ) + + await runtime.stop_when_idle() + + # Output the final answer + actual_orchestrator = await runtime.try_get_underlying_agent_instance(orchestrator.id, type=LedgerOrchestrator) + transcript: List[LLMMessage] = actual_orchestrator._chat_history # type: ignore + print(await response_preparer(task=task, source=(await orchestrator.metadata)["type"], client=client, transcript=transcript)) + + +if __name__ == "__main__": + logger = logging.getLogger(EVENT_LOGGER_NAME) + logger.setLevel(logging.INFO) + log_handler = LogHandler() + logger.handlers = [log_handler] + asyncio.run(main()) diff --git a/python/packages/agbench/benchmarks/GAIA/.gitignore b/python/packages/agbench/benchmarks/GAIA/.gitignore new file mode 100644 index 000000000000..f4a377d8b83c --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/.gitignore @@ -0,0 +1,5 @@ +data +gaia_validation_TeamOne +*_results.csv +results.csv +ENV.json \ No newline at end of file diff --git a/python/packages/agbench/benchmarks/GAIA/ENV.json.sample b/python/packages/agbench/benchmarks/GAIA/ENV.json.sample new file mode 100644 index 000000000000..1f2c4915e3c7 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/ENV.json.sample @@ -0,0 +1,5 @@ +{ + "BING_API_KEY": "YOUR_KEY_KEY", + "HOMEPAGE": "https://www.bing.com/", + "WEB_SURFER_DEBUG_DIR": "/autogen/debug" +} diff --git a/python/packages/agbench/benchmarks/GAIA/README.md b/python/packages/agbench/benchmarks/GAIA/README.md new file mode 100644 index 000000000000..753d8e4ed51a --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/README.md @@ -0,0 +1,83 @@ +# GAIA Benchmark + +This scenario implements the [GAIA](https://arxiv.org/abs/2311.12983) agent benchmark. Before you begin, make sure you have followed instruction in `../README.md` to prepare your environment. 
+ +### Setup Environment Variables for AgBench + +Navigate to GAIA + +```bash +cd benchmarks/GAIA +``` + +Create a file called ENV.json with the following (required) contents (If you're using MagenticOne) + +```json +{ + "BING_API_KEY": "REPLACE_WITH_YOUR_BING_API_KEY", + "HOMEPAGE": "https://www.bing.com/", + "WEB_SURFER_DEBUG_DIR": "/autogen/debug", + "CHAT_COMPLETION_KWARGS_JSON": "{\"api_version\": \"2024-02-15-preview\", \"azure_endpoint\": \"YOUR_ENDPOINT/\", \"model_capabilities\": {\"function_calling\": true, \"json_output\": true, \"vision\": true}, \"azure_ad_token_provider\": \"DEFAULT\", \"model\": \"gpt-4o-2024-05-13\"}", + "CHAT_COMPLETION_PROVIDER": "azure" +} +``` + +You can also use the openai client by replacing the last two entries in the ENV file by: + +- `CHAT_COMPLETION_PROVIDER='openai'` +- `CHAT_COMPLETION_KWARGS_JSON` with the following JSON structure: + +```json +{ + "api_key": "REPLACE_WITH_YOUR_API", + "model": "gpt-4o-2024-05-13" +} +``` + +You might need to add additional packages to the requirements.txt file inside the Templates/MagenticOne folder. + +Now initialize the tasks. + +```bash +python Scripts/init_tasks.py +``` + +Note: This will attempt to download GAIA from Hugginface, but this requires authentication. + +The resulting folder structure should look like this: + +``` +. +./Downloads +./Downloads/GAIA +./Downloads/GAIA/2023 +./Downloads/GAIA/2023/test +./Downloads/GAIA/2023/validation +./Scripts +./Templates +./Templates/TeamOne +``` + +Then run `Scripts/init_tasks.py` again. + +Once the script completes, you should now see a folder in your current directory called `Tasks` that contains one JSONL file per template in `Templates`. + +### Running GAIA + +Now to run a specific subset of GAIA use: + +```bash +agbench run Tasks/gaia_validation_level_1__MagenticOne.jsonl +``` + +You should see the command line print the raw logs that shows the agents in action To see a summary of the results (e.g., task completion rates), in a new terminal run the following: + +```bash +agbench tabulate Results/gaia_validation_level_1__MagenticOne/ +``` + +## References + +**GAIA: a benchmark for General AI Assistants** `
` +Grégoire Mialon, Clémentine Fourrier, Craig Swift, Thomas Wolf, Yann LeCun, Thomas Scialom `
` +[https://arxiv.org/abs/2311.12983](https://arxiv.org/abs/2311.12983) diff --git a/python/packages/agbench/benchmarks/GAIA/Scripts/custom_tabulate.py b/python/packages/agbench/benchmarks/GAIA/Scripts/custom_tabulate.py new file mode 100644 index 000000000000..ec51863e9c7b --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Scripts/custom_tabulate.py @@ -0,0 +1,197 @@ +import os +import sys +import re +from agbench.tabulate_cmd import default_tabulate +import json +import pandas as pd +import sqlite3 +import glob +import numpy as np + +EXCLUDE_DIR_NAMES = ["__pycache__"] + + +def normalize_answer(a): + # Lower case + # Trim (left and right) + # standardize comma separated values + # Replace multiple spaces with one space + # Remove trailing punctuation + norm_answer = ", ".join(a.strip().lower().split(",")) + norm_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", norm_answer)) + return norm_answer + + +def scorer(instance_dir): + # Read the expected answer + expected_answer_file = os.path.join(instance_dir, "expected_answer.txt") + if not os.path.isfile(expected_answer_file): + return None + + expected_answer = None + with open(expected_answer_file, "rt") as fh: + expected_answer = fh.read().strip() + + # Read the console + console_log_file = os.path.join(instance_dir, "console_log.txt") + if not os.path.isfile(console_log_file): + return None + + console_log = "" + with open(console_log_file, "rt") as fh: + console_log = fh.read() + + final_answer = None + m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL) + if m: + final_answer = m.group(1).strip() + + # Missing the final answer line + if final_answer is None: + return None + + # Return true if they are equal after normalization + n_ex = normalize_answer(expected_answer) + n_final = normalize_answer(final_answer) + return ( + (n_ex != "" and n_ex == n_final), + n_ex, + n_final + ) + + +def get_number_of_chat_messages(chat_messages_dir): + result = 0 + for file in glob.glob(f"{chat_messages_dir}/*_messages.json"): + with open(file, "r") as f: + content = json.load(f) + for agent, messages in content.items(): + result += len(messages) + return result + + +def main(args): + parsed_args, all_results = default_tabulate(args, scorer=scorer) + excel_path = parsed_args.excel + + if excel_path: + excel_dir = os.path.dirname(excel_path) or "." 
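+        # If the Excel path has no directory component, fall back to the current directory;
+        # the target directory is created below if it does not already exist.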
+ if not os.path.exists(excel_dir): + os.makedirs(excel_dir, exist_ok=True) + + if not excel_path.endswith((".xlsx", ".xls")): + excel_path += ".xlsx" + + runlogs = parsed_args.runlogs if parsed_args.runlogs.endswith("/") else parsed_args.runlogs + "/" + + if os.path.isdir(runlogs): + task_ids = sorted( + [task_id for task_id in os.listdir(runlogs) if task_id not in EXCLUDE_DIR_NAMES], + key=lambda s: os.path.getmtime(os.path.join(parsed_args.runlogs, s)), + ) + else: + raise ValueError("please input a valid directory to tabulate result") + + trials = sorted(os.listdir(f"{runlogs}{task_ids[0]}"), key=lambda x: int(x)) if len(task_ids) > 0 else [] + dbnames = [[f"{runlogs}{task_id}/{trial}/telemetry.db" for task_id in task_ids] for trial in trials] + + query = """ + SELECT cost, session_id, response, start_time, end_time + FROM ( + SELECT invocation_id, cost, session_id, response, start_time, end_time, + ROW_NUMBER() OVER (PARTITION BY invocation_id ORDER BY start_time) as rn + FROM chat_completions + ) + WHERE rn = 1; + """ + + with pd.ExcelWriter(excel_path, engine="openpyxl") as writer: + for trial_index, each_trial in enumerate(dbnames): + result_df = pd.DataFrame( + columns=[ + "id", + "status", + "expected_answer", + "final_answer", + "cost", + "latency", + "num_of_llm_requests", + "num_of_chat_messages", + "prompt_tokens", + "completion_tokens", + "total_tokens", + "model", + ] + ) + + result_df_type_mapping = { + "id": str, + "status": bool, + "expected_answer": str, + "final_answer": str, + "cost": float, + "latency": float, + "num_of_llm_requests": int, + "num_of_chat_messages": int, + "prompt_tokens": int, + "completion_tokens": int, + "total_tokens": int, + } + + for dbname, scorer_results in zip(each_trial, all_results): + task_id = scorer_results[0] + scorer_result = scorer_results[trial_index + 1] + + status, expected_answer, final_answer = scorer_result if scorer_result else (False,"","") + + con = sqlite3.connect(dbname) + + # TODO: if large amount of data, add chunksize + telemetry_df = pd.read_sql_query(query, con) + + earliest_starttime = pd.to_datetime(telemetry_df["start_time"], format="%Y-%m-%d %H:%M:%S.%f").min() + latest_endtime = pd.to_datetime(telemetry_df["end_time"], format="%Y-%m-%d %H:%M:%S.%f").max() + + num_of_chat_messages = get_number_of_chat_messages(chat_messages_dir=os.path.dirname(dbname)) + result = { + "id": task_id, + "status": status, + "expected_answer": expected_answer, + "final_answer": final_answer, + "cost": telemetry_df["cost"].sum(), + "latency": (latest_endtime - earliest_starttime).total_seconds(), + "num_of_llm_requests": len(telemetry_df), + "num_of_chat_messages": num_of_chat_messages, + "prompt_tokens": telemetry_df["response"] + .apply( + lambda x: json.loads(x)["usage"]["prompt_tokens"] + if "usage" in json.loads(x) and "prompt_tokens" in json.loads(x)["usage"] + else 0 + ) + .sum(), + "completion_tokens": telemetry_df["response"] + .apply( + lambda x: json.loads(x)["usage"]["completion_tokens"] + if "usage" in json.loads(x) and "completion_tokens" in json.loads(x)["usage"] + else 0 + ) + .sum(), + "total_tokens": telemetry_df["response"] + .apply( + lambda x: json.loads(x)["usage"]["total_tokens"] + if "usage" in json.loads(x) and "total_tokens" in json.loads(x)["usage"] + else 0 + ) + .sum(), + "model": telemetry_df["response"] + .apply(lambda x: json.loads(x)["model"] if "model" in json.loads(x) else "") + .unique(), + } + + result_df = result_df.astype(result_df_type_mapping) + result_df = pd.concat([result_df, 
pd.DataFrame([result])], ignore_index=True) + result_df.to_excel(writer, sheet_name=f"trial_{trial_index}", index=False) + + +if __name__ == "__main__" and __package__ is None: + main(sys.argv) diff --git a/python/packages/agbench/benchmarks/GAIA/Scripts/init_tasks.py b/python/packages/agbench/benchmarks/GAIA/Scripts/init_tasks.py new file mode 100644 index 000000000000..7b572fa5edd1 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Scripts/init_tasks.py @@ -0,0 +1,158 @@ +# +# Run this file to download the human_eval dataset, and create a corresponding testbed scenario: +# (default: ../scenarios/human_eval_two_agents_gpt4.jsonl and ./scenarios/human_eval_two_agents_gpt35.jsonl) +# + +import json +import os +import re +import sys + +from huggingface_hub import snapshot_download + +SCRIPT_PATH = os.path.realpath(__file__) +SCRIPT_NAME = os.path.basename(SCRIPT_PATH) +SCRIPT_DIR = os.path.dirname(SCRIPT_PATH) + +SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir)) +TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates") +TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks") +DOWNLOADS_DIR = os.path.join(SCENARIO_DIR, "Downloads") +REPO_DIR = os.path.join(DOWNLOADS_DIR, "GAIA") + + +def download_gaia(): + """Download the GAIA benchmark from Hugging Face.""" + + if not os.path.isdir(DOWNLOADS_DIR): + os.mkdir(DOWNLOADS_DIR) + + """Download the GAIA dataset from Hugging Face Hub""" + snapshot_download( + repo_id="gaia-benchmark/GAIA", + repo_type="dataset", + local_dir=REPO_DIR, + local_dir_use_symlinks=True, + ) + + +def create_jsonl(name, tasks, files_dir, template): + """Creates a JSONL scenario file with a given name, and template path.""" + + if not os.path.isdir(TASKS_DIR): + os.mkdir(TASKS_DIR) + + with open(os.path.join(TASKS_DIR, name + ".jsonl"), "wt") as fh: + for task in tasks: + print(f"Converting: [{name}] {task['task_id']}") + + # Figure out what files we need to copy + template_cp_list = [template] + if len(task["file_name"].strip()) > 0: + template_cp_list.append( + [ + os.path.join(files_dir, task["file_name"].strip()), + task["file_name"].strip(), + #os.path.join("coding", task["file_name"].strip()), + ] + ) + + record = { + "id": task["task_id"], + "template": template_cp_list, + "substitutions": { + "scenario.py": { + "__FILE_NAME__": task["file_name"], + }, + "expected_answer.txt": {"__EXPECTED_ANSWER__": task["Final answer"]}, + "prompt.txt": {"__PROMPT__": task["Question"]}, + }, + } + + fh.write(json.dumps(record).strip() + "\n") + + +############################################################################### +def main(): + gaia_validation_files = os.path.join(REPO_DIR, "2023", "validation") + gaia_test_files = os.path.join(REPO_DIR, "2023", "test") + + if not os.path.isdir(gaia_validation_files) or not os.path.isdir(gaia_test_files): + download_gaia() + + if not os.path.isdir(gaia_validation_files) or not os.path.isdir(gaia_test_files): + sys.exit(f"Error: '{REPO_DIR}' does not appear to be a copy of the GAIA repository.") + + # Load the GAIA data + gaia_validation_tasks = [[], [], []] + with open(os.path.join(gaia_validation_files, "metadata.jsonl")) as fh: + for line in fh: + data = json.loads(line) + gaia_validation_tasks[data["Level"] - 1].append(data) + + gaia_test_tasks = [[], [], []] + with open(os.path.join(gaia_test_files, "metadata.jsonl")) as fh: + for line in fh: + data = json.loads(line) + + # A welcome message -- not a real task + if data["task_id"] == "0-0-0-0-0": + continue + + gaia_test_tasks[data["Level"] - 1].append(data) 
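+
+    # gaia_validation_tasks and gaia_test_tasks now each hold three lists of task records,
+    # one per GAIA difficulty level (levels 1-3), indexed by Level - 1.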
+ + # list all directories in the Templates directory + # and populate a dictionary with the name and path + templates = {} + for entry in os.scandir(TEMPLATES_DIR): + if entry.is_dir(): + templates[re.sub(r"\s", "", entry.name)] = entry.path + + # Add coding directories if needed (these are usually empty and left out of the repo) + #for template in templates.values(): + # code_dir_path = os.path.join(template, "coding") + # if not os.path.isdir(code_dir_path): + # os.mkdir(code_dir_path) + + # Create the various combinations of [models] x [templates] + for t in templates.items(): + create_jsonl( + f"gaia_validation_level_1__{t[0]}", + gaia_validation_tasks[0], + gaia_validation_files, + t[1], + ) + create_jsonl( + f"gaia_validation_level_2__{t[0]}", + gaia_validation_tasks[1], + gaia_validation_files, + t[1], + ) + create_jsonl( + f"gaia_validation_level_3__{t[0]}", + gaia_validation_tasks[2], + gaia_validation_files, + t[1], + ) + create_jsonl( + f"gaia_test_level_1__{t[0]}", + gaia_test_tasks[0], + gaia_test_files, + t[1], + ) + create_jsonl( + f"gaia_test_level_2__{t[0]}", + gaia_test_tasks[1], + gaia_test_files, + t[1], + ) + create_jsonl( + f"gaia_test_level_3__{t[0]}", + gaia_test_tasks[2], + gaia_test_files, + t[1], + ) + + +if __name__ == "__main__" and __package__ is None: + main() diff --git a/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/expected_answer.txt b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/expected_answer.txt new file mode 100644 index 000000000000..8153c2bf8242 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/expected_answer.txt @@ -0,0 +1 @@ +__EXPECTED_ANSWER__ diff --git a/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/prompt.txt b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/prompt.txt new file mode 100644 index 000000000000..482f50dca311 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/prompt.txt @@ -0,0 +1 @@ +__PROMPT__ diff --git a/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/requirements.txt b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/requirements.txt new file mode 100644 index 000000000000..2070aca75263 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/requirements.txt @@ -0,0 +1,4 @@ +/autogen_python/packages/autogen-core +/autogen_python/packages/autogen-magentic-one +azure-identity +tiktoken \ No newline at end of file diff --git a/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/scenario.py b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/scenario.py new file mode 100644 index 000000000000..4058d75f3971 --- /dev/null +++ b/python/packages/agbench/benchmarks/GAIA/Templates/MagenticOne/scenario.py @@ -0,0 +1,218 @@ +import asyncio +import logging +import os +import re +import tiktoken + +from openai import AzureOpenAI + +from typing import List + +from autogen_core.base import AgentId, AgentProxy, TopicId +from autogen_core.application import SingleThreadedAgentRuntime +from autogen_core.application.logging import EVENT_LOGGER_NAME +from autogen_core.components.models import ( + AzureOpenAIChatCompletionClient, + ChatCompletionClient, + ModelCapabilities, + UserMessage, + LLMMessage, +) +from autogen_core.components import DefaultSubscription, DefaultTopicId +from autogen_core.components.code_executor import LocalCommandLineCodeExecutor +from autogen_core.components.models import AssistantMessage + +from 
autogen_magentic_one.markdown_browser import MarkdownConverter, UnsupportedFormatException +from autogen_magentic_one.agents.coder import Coder, Executor +from autogen_magentic_one.agents.orchestrator import LedgerOrchestrator +from autogen_magentic_one.messages import BroadcastMessage +from autogen_magentic_one.agents.multimodal_web_surfer import MultimodalWebSurfer +from autogen_magentic_one.agents.file_surfer import FileSurfer +from autogen_magentic_one.utils import LogHandler, message_content_to_str, create_completion_client_from_env + +encoding = None +def count_token(value: str) -> int: + # TODO:: Migrate to model_client.count_tokens + global encoding + if encoding is None: + encoding = tiktoken.encoding_for_model("gpt-4o-2024-05-13") + return len(encoding.encode(value)) + +async def response_preparer(task: str, source: str, client: ChatCompletionClient, transcript: List[LLMMessage]) -> str: + messages: List[LLMMessage] = [] + + # copy them to this context + for message in transcript: + messages.append( + UserMessage( + content = message_content_to_str(message.content), + # TODO fix this -> remove type ignore + source=message.source, # type: ignore + ) + ) + + # Remove messages until we are within 2k of the context window limit + while len(messages) and client.remaining_tokens( messages ) < 2000: + messages.pop(0) + + # Add the preamble + messages.insert(0, + UserMessage( + content=f"Earlier you were asked the following:\n\n{task}\n\nYour team then worked diligently to address that request. Here is a transcript of that conversation:", + source=source, + ) + ) + + # ask for the final answer + messages.append( + UserMessage( + content= f""" +Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience: + +{task} + +To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER] +Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. +ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.) +If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise. +If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'. +If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings. +If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine' +""", + source=source, + ) + ) + + + response = await client.create(messages) + assert isinstance(response.content, str) + + # No answer + if "unable to determine" in response.content.lower(): + messages.append( AssistantMessage(content=response.content, source="self" ) ) + messages.append( + UserMessage( + content= f""" +I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation. + +To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS] +Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc. 
+ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.) +If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise. +If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'. +If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings. +""".strip(), + source=source, + ) + ) + + response = await client.create(messages) + assert isinstance(response.content, str) + return re.sub(r"EDUCATED GUESS:", "FINAL ANSWER:", response.content) + + else: + return response.content + + +async def main() -> None: + # Read the prompt + prompt = "" + with open("prompt.txt", "rt") as fh: + prompt = fh.read().strip() + filename = "__FILE_NAME__".strip() + + # Create the runtime. + runtime = SingleThreadedAgentRuntime() + + # Create the AzureOpenAI client, with AAD auth, from environment + client = create_completion_client_from_env() + + + mlm_client = create_completion_client_from_env() + + # Register agents. + await runtime.register( + "Assistant", + lambda: Coder(model_client=client), + subscriptions=lambda: [DefaultSubscription()], + ) + coder = AgentProxy(AgentId("Assistant", "default"), runtime) + + await runtime.register( + "ComputerTerminal", + lambda: Executor(executor=LocalCommandLineCodeExecutor(), confirm_execution="ACCEPT_ALL"), + subscriptions=lambda: [DefaultSubscription()], + ) + executor = AgentProxy(AgentId("ComputerTerminal", "default"), runtime) + + await runtime.register( + "FileSurfer", + lambda: FileSurfer(model_client=client), + subscriptions=lambda: [DefaultSubscription()], + ) + file_surfer = AgentProxy(AgentId("FileSurfer", "default"), runtime) + + await runtime.register( + "WebSurfer", + lambda: MultimodalWebSurfer(), # Configuration is set later by init() + subscriptions=lambda: [DefaultSubscription()], + ) + web_surfer = AgentProxy(AgentId("WebSurfer", "default"), runtime) + + await runtime.register("Orchestrator", lambda: LedgerOrchestrator( + agents=[coder, executor, file_surfer, web_surfer], + model_client=client, + max_rounds=30, + max_time=25*60, + ), + subscriptions=lambda: [DefaultSubscription()], + ) + orchestrator = AgentProxy(AgentId("Orchestrator", "default"), runtime) + + runtime.start() + + actual_surfer = await runtime.try_get_underlying_agent_instance(web_surfer.id, type=MultimodalWebSurfer) + await actual_surfer.init(model_client=client, downloads_folder=os.getcwd(), browser_channel="chromium") + + filename_prompt = "" + if len(filename) > 0: + #relpath = os.path.join("coding", filename) + #file_uri = pathlib.Path(os.path.abspath(os.path.expanduser(relpath))).as_uri() + + filename_prompt = f"The question is about a file, document or image, which can be accessed by the filename '{filename}' in the current working directory." + + mlm_prompt = f"""Write a detailed caption for this image. 
Pay special attention to any details that might be useful for someone answering the following: + +{prompt} +""".strip() + + try: + mdconverter = MarkdownConverter(mlm_client=mlm_client, mlm_model="gpt-4o-2024-05-13") + res = mdconverter.convert(filename, mlm_prompt=mlm_prompt) + if res.text_content: + if count_token(res.text_content) < 8000: # Don't put overly-large documents into the prompt + filename_prompt += "\n\nHere are the file's contents:\n\n" + res.text_content + except UnsupportedFormatException: + pass + + task = f"{prompt}\n\n{filename_prompt}" + + await runtime.publish_message( + BroadcastMessage(content=UserMessage(content=task.strip(), source="human")), + topic_id=DefaultTopicId(), + ) + + await runtime.stop_when_idle() + + # Output the final answer + actual_orchestrator = await runtime.try_get_underlying_agent_instance(orchestrator.id, type=LedgerOrchestrator) + transcript: List[LLMMessage] = actual_orchestrator._chat_history # type: ignore + print(await response_preparer(task=task, source=(await orchestrator.metadata)["type"], client=client, transcript=transcript)) + + +if __name__ == "__main__": + logger = logging.getLogger(EVENT_LOGGER_NAME) + logger.setLevel(logging.INFO) + log_handler = LogHandler() + logger.handlers = [log_handler] + asyncio.run(main()) diff --git a/python/packages/agbench/benchmarks/HumanEval/README.md b/python/packages/agbench/benchmarks/HumanEval/README.md new file mode 100644 index 000000000000..25acc2630523 --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/README.md @@ -0,0 +1,68 @@ +# HumanEval Benchmark + +This scenario implements a modified version of the [HumanEval](https://arxiv.org/abs/2107.03374) benchmark. +Compared to the original benchmark, there are **two key differences** here: + +- A chat model rather than a completion model is used. +- The agents get pass/fail feedback about their implementations, and can keep trying until they succeed or run out of tokens or turns. + +## Running the tasks + + +Navigate to HumanEval + +```bash +cd benchmarks/HumanEval +``` + +Create a file called ENV.json with the following (required) contents (If you're using MagenticOne) + +```json +{ + "CHAT_COMPLETION_KWARGS_JSON": "{\"api_version\": \"2024-02-15-preview\", \"azure_endpoint\": \"YOUR_ENDPOINT/\", \"model_capabilities\": {\"function_calling\": true, \"json_output\": true, \"vision\": true}, \"azure_ad_token_provider\": \"DEFAULT\", \"model\": \"gpt-4o-2024-05-13\"}", + "CHAT_COMPLETION_PROVIDER": "azure" +} +``` + +You can also use the openai client by replacing the last two entries in the ENV file by: + +- `CHAT_COMPLETION_PROVIDER='openai'` +- `CHAT_COMPLETION_KWARGS_JSON` with the following JSON structure: + +```json +{ + "api_key": "REPLACE_WITH_YOUR_API", + "model": "gpt-4o-2024-05-13" +} +``` + +Now initialize the tasks. + +```bash +python Scripts/init_tasks.py +``` + +Note: This will attempt to download HumanEval + +Then run `Scripts/init_tasks.py` again. + +Once the script completes, you should now see a folder in your current directory called `Tasks` that contains one JSONL file per template in `Templates`. 
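+
+Each line in these JSONL files is a single task record that `agbench run` executes. As a rough sketch (the field values below are illustrative; the real values are filled in from the HumanEval data by `Scripts/init_tasks.py`), a record looks like this:
+
+```json
+{
+    "id": "HumanEval_0",
+    "template": "Templates/MagenticOne",
+    "substitutions": {
+        "scenario.py": { "__ENTRY_POINT__": "has_close_elements" },
+        "prompt.txt": { "__PROMPT__": "..." },
+        "unit_tests.py": { "__TEST__": "..." }
+    }
+}
+```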
+
+Now, to run a specific subset of HumanEval, use:
+
+```bash
+agbench run Tasks/human_eval_MagenticOne.jsonl
+```
+
+You should see the command line print the raw logs showing the agents in action. To see a summary of the results (e.g., task completion rates), run the following in a new terminal:
+
+```bash
+agbench tabulate Results/human_eval_MagenticOne
+```
+
+
+## References
+
+**Evaluating Large Language Models Trained on Code**`
` +Mark Chen, Jerry Tworek, Heewoo Jun, Qiming Yuan, Henrique Ponde de Oliveira Pinto, Jared Kaplan, Harri Edwards, Yuri Burda, Nicholas Joseph, Greg Brockman, Alex Ray, Raul Puri, Gretchen Krueger, Michael Petrov, Heidy Khlaaf, Girish Sastry, Pamela Mishkin, Brooke Chan, Scott Gray, Nick Ryder, Mikhail Pavlov, Alethea Power, Lukasz Kaiser, Mohammad Bavarian, Clemens Winter, Philippe Tillet, Felipe Petroski Such, Dave Cummings, Matthias Plappert, Fotios Chantzis, Elizabeth Barnes, Ariel Herbert-Voss, William Hebgen Guss, Alex Nichol, Alex Paino, Nikolas Tezak, Jie Tang, Igor Babuschkin, Suchir Balaji, Shantanu Jain, William Saunders, Christopher Hesse, Andrew N. Carr, Jan Leike, Josh Achiam, Vedant Misra, Evan Morikawa, Alec Radford, Matthew Knight, Miles Brundage, Mira Murati, Katie Mayer, Peter Welinder, Bob McGrew, Dario Amodei, Sam McCandlish, Ilya Sutskever, Wojciech Zaremba`
` +[https://arxiv.org/abs/2107.03374](https://arxiv.org/abs/2107.03374) diff --git a/python/packages/agbench/benchmarks/HumanEval/Scripts/custom_tabulate.py b/python/packages/agbench/benchmarks/HumanEval/Scripts/custom_tabulate.py new file mode 100644 index 000000000000..8d689a9f1667 --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/Scripts/custom_tabulate.py @@ -0,0 +1,12 @@ +import os +import sys + +from agbench.tabulate_cmd import default_tabulate + + +def main(args): + default_tabulate(args) + + +if __name__ == "__main__" and __package__ is None: + main(sys.argv) diff --git a/python/packages/agbench/benchmarks/HumanEval/Scripts/init_tasks.py b/python/packages/agbench/benchmarks/HumanEval/Scripts/init_tasks.py new file mode 100644 index 000000000000..df4e6b194841 --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/Scripts/init_tasks.py @@ -0,0 +1,124 @@ +# +# Run this file to download the human_eval dataset, and create a corresponding testbed scenario: +# (default: ../scenarios/human_eval_two_agents_gpt4.jsonl and ./scenarios/human_eval_two_agents_gpt35.jsonl) +# + +import base64 +import gzip +import io +import json +import os +import re + +import requests + +URL = "https://github.com/openai/human-eval/raw/master/data/HumanEval.jsonl.gz" + +SCRIPT_PATH = os.path.realpath(__file__) +SCRIPT_NAME = os.path.basename(SCRIPT_PATH) +SCRIPT_DIR = os.path.dirname(SCRIPT_PATH) + +SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir)) +TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates") +TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks") + +# A selected subset of HumanEval problems to work with during development + +# Deprecated 2/5/2024 -- Use subsample instead +REDUCED_SET = [ + "HumanEval/2", + "HumanEval/26", + "HumanEval/32", + "HumanEval/33", + "HumanEval/36", + "HumanEval/38", + "HumanEval/41", + "HumanEval/50", + "HumanEval/56", + "HumanEval/65", + "HumanEval/67", + "HumanEval/84", + "HumanEval/85", + "HumanEval/86", + "HumanEval/89", + "HumanEval/99", + "HumanEval/104", + "HumanEval/113", + "HumanEval/115", + "HumanEval/120", + "HumanEval/124", + "HumanEval/126", + "HumanEval/132", + "HumanEval/135", + "HumanEval/140", + "HumanEval/146", +] + + +def download_human_eval(): + """Download the HumanEval dataset, un-gzips it, and returns a list of its parsed JSON objects.""" + + # Send a HTTP request to the URL of the file + response = requests.get(URL) + + # Ensure we raise an error if the download failed + response.raise_for_status() + + # Create a BytesIO object from the response content + buffer = io.BytesIO(response.content) + + # Read the file, line by line, populating a list of parsed JSON objects + results = [] + with gzip.GzipFile(fileobj=buffer) as f_in: + for line in f_in: + # Parse each line as JSON + results.append(json.loads(line)) + + return results + + +def create_jsonl(name, tasks, template): + """Creates a JSONL scenario file with a given name, list of HumanEval tasks, and template path.""" + + # Create a task directory if it doesn't exist + if not os.path.isdir(TASKS_DIR): + os.mkdir(TASKS_DIR) + + # Create the jsonl file + with open(os.path.join(TASKS_DIR, name + ".jsonl"), "wt") as fh: + for task in tasks: + print(f"Converting: [{name}] {task['task_id']}") + + record = { + "id": task["task_id"].replace("/", "_"), + "template": template, + "substitutions": { + "scenario.py": {"__ENTRY_POINT__": task["entry_point"]}, + "prompt.txt": {"__PROMPT__": task["prompt"]}, + "unit_tests.py": {"__TEST__": task["test"]}, + }, + } + + 
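+            # Write one JSON object per line (JSONL); the task id is made filesystem-safe
+            # above by replacing "/" with "_".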
fh.write(json.dumps(record).strip() + "\n") + + +############################################################################### +def main(): + human_eval = download_human_eval() + # Deprecated: reduced_human_eval = [t for t in human_eval if t["task_id"] in REDUCED_SET] + + # list all directories in the Templates directory + # and populate a dictionary with the name and path + templates = {} + for entry in os.scandir(TEMPLATES_DIR): + if entry.is_dir(): + templates[re.sub(r"\s", "", entry.name)] = entry.path + + # Create the various combinations of [models] x [templates] + for t in templates.items(): + create_jsonl(f"human_eval_{t[0]}", human_eval, t[1]) + # Deprecated: create_jsonl(f"r_human_eval_{t[0]}", reduced_human_eval, t[1]) + + +if __name__ == "__main__" and __package__ is None: + main() diff --git a/python/packages/agbench/benchmarks/HumanEval/Templates/MagenticOne/prompt.txt b/python/packages/agbench/benchmarks/HumanEval/Templates/MagenticOne/prompt.txt new file mode 100644 index 000000000000..482f50dca311 --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/Templates/MagenticOne/prompt.txt @@ -0,0 +1 @@ +__PROMPT__ diff --git a/python/packages/agbench/benchmarks/HumanEval/Templates/MagenticOne/requirements.txt b/python/packages/agbench/benchmarks/HumanEval/Templates/MagenticOne/requirements.txt new file mode 100644 index 000000000000..2070aca75263 --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/Templates/MagenticOne/requirements.txt @@ -0,0 +1,4 @@ +/autogen_python/packages/autogen-core +/autogen_python/packages/autogen-magentic-one +azure-identity +tiktoken \ No newline at end of file diff --git a/python/packages/agbench/benchmarks/HumanEval/Templates/MagenticOne/scenario.py b/python/packages/agbench/benchmarks/HumanEval/Templates/MagenticOne/scenario.py new file mode 100644 index 000000000000..3229eba12589 --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/Templates/MagenticOne/scenario.py @@ -0,0 +1,107 @@ +import asyncio +import logging + +from autogen_core.base import AgentId, AgentProxy, TopicId +from autogen_core.application import SingleThreadedAgentRuntime +from autogen_core.application.logging import EVENT_LOGGER_NAME +from autogen_core.components import DefaultSubscription, DefaultTopicId +from autogen_core.components.code_executor import LocalCommandLineCodeExecutor +from autogen_core.components.models import ( + AzureOpenAIChatCompletionClient, + ModelCapabilities, + UserMessage, +) + +from autogen_magentic_one.agents.coder import Coder, Executor +from autogen_magentic_one.agents.orchestrator import RoundRobinOrchestrator +from autogen_magentic_one.messages import BroadcastMessage, OrchestrationEvent +from autogen_magentic_one.utils import create_completion_client_from_env + + +async def main() -> None: + # Create the runtime. + runtime = SingleThreadedAgentRuntime() + + # Create the AzureOpenAI client + client = create_completion_client_from_env() + + # Register agents. 
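+    # The team consists of a Coder that proposes an implementation, an Executor that runs the
+    # proposed code with a local command-line executor, and a RoundRobinOrchestrator that
+    # alternates between them until the run goes idle.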
+ await runtime.register( + "Coder", + lambda: Coder(model_client=client), + subscriptions=lambda: [DefaultSubscription()], + ) + coder = AgentProxy(AgentId("Coder", "default"), runtime) + + await runtime.register( + "Executor", + lambda: Executor( + "A agent for executing code", executor=LocalCommandLineCodeExecutor(), confirm_execution="ACCEPT_ALL" + ), + subscriptions=lambda: [DefaultSubscription()], + ) + executor = AgentProxy(AgentId("Executor", "default"), runtime) + + await runtime.register( + "Orchestrator", + lambda: RoundRobinOrchestrator([coder, executor]), + subscriptions=lambda: [DefaultSubscription()], + ) + + prompt = "" + with open("prompt.txt", "rt") as fh: + prompt = fh.read() + + entry_point = "__ENTRY_POINT__" + + task = f""" +The following python code imports the `run_tests` function from unit_tests.py, and runs +it on the function `{entry_point}`. This will run a set of automated unit tests to verify the +correct implementation of `{entry_point}`. However, `{entry_point}` is only partially +implemented in the code below. Complete the implementation of `{entry_point}` and then execute +a new stand-alone code block that contains everything needed to run the tests, including: importing +`unit_tests`, calling `run_tests({entry_point})`, as well as {entry_point}'s complete definition, +such that this code block can be run directly in Python. + +```python +from unit_tests import run_tests + +{prompt} + +# Run the unit tests +run_tests({entry_point}) +``` +""".strip() + + runtime.start() + + await runtime.publish_message( + BroadcastMessage(content=UserMessage(content=task, source="human")), + topic_id=DefaultTopicId(), + ) + + await runtime.stop_when_idle() + + +class MyHandler(logging.Handler): + def __init__(self) -> None: + super().__init__() + + def emit(self, record: logging.LogRecord) -> None: + try: + if isinstance(record.msg, OrchestrationEvent): + print(f"""--------------------------------------------------------------------------- +\033[91m{record.msg.source}:\033[0m + +{record.msg.message}""", flush=True) + except Exception: + self.handleError(record) + + +if __name__ == "__main__": + + logger = logging.getLogger(EVENT_LOGGER_NAME) + logger.setLevel(logging.INFO) + my_handler = MyHandler() + logger.handlers = [my_handler] + asyncio.run(main()) diff --git a/python/packages/agbench/benchmarks/HumanEval/Templates/MagenticOne/unit_tests.py b/python/packages/agbench/benchmarks/HumanEval/Templates/MagenticOne/unit_tests.py new file mode 100644 index 000000000000..55be54d4196f --- /dev/null +++ b/python/packages/agbench/benchmarks/HumanEval/Templates/MagenticOne/unit_tests.py @@ -0,0 +1,15 @@ +# Disable ruff linter for template files +# ruff: noqa: F821 E722 +import sys + + +__TEST__ + + +def run_tests(candidate): + try: + check(candidate) + # We can search for this string in the output + print("ALL TESTS PASSED !#!#") + except AssertionError: + sys.exit("SOME TESTS FAILED - TRY AGAIN !#!#") diff --git a/python/packages/agbench/benchmarks/README.md b/python/packages/agbench/benchmarks/README.md new file mode 100644 index 000000000000..0e26093d19f5 --- /dev/null +++ b/python/packages/agbench/benchmarks/README.md @@ -0,0 +1,18 @@ +# Benchmarking Agents + +This directory provides ability to benchmarks agents (e.g., built using Autogen) using AgBench. Use the instructions below to prepare your environment for benchmarking. Once done, proceed to relevant benchmarks directory (e.g., `benchmarks/GAIA`) for further scenario-specific instructions. + +## Setup on WSL + +1. 
Install Docker Desktop. After installation, a restart is required; then open Docker Desktop and, under Settings > Resources > WSL Integration, enable integration with additional distros (e.g., Ubuntu).
+2. Clone autogen and export `AUTOGEN_REPO_BASE`. This environment variable enables the Docker containers to use the correct version of the agents.
+   ```bash
+   git clone git@github.com:microsoft/autogen.git
+   export AUTOGEN_REPO_BASE=
+   ```
+3. Install `agbench`. AgBench is currently a tool in the Autogen repo.
+
+   ```bash
+   cd autogen/python/packages/agbench
+   pip install -e .
+   ```
\ No newline at end of file
diff --git a/python/packages/agbench/benchmarks/WebArena/ENV.sample b/python/packages/agbench/benchmarks/WebArena/ENV.sample
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/python/packages/agbench/benchmarks/WebArena/README.md b/python/packages/agbench/benchmarks/WebArena/README.md
new file mode 100644
index 000000000000..74e17f892dad
--- /dev/null
+++ b/python/packages/agbench/benchmarks/WebArena/README.md
@@ -0,0 +1,8 @@
+# WebArena Benchmark
+
+This scenario implements the [WebArena](https://github.com/web-arena-x/webarena/tree/main) benchmark. The evaluation code has been modified from WebArena's [evaluation_harness](Templates/Common/evaluation_harness); we retain the WebArena license and include it here: [LICENSE](Templates/Common/evaluation_harness/LICENSE).
+
+
+## References
+
+Zhou, Shuyan, Frank F. Xu, Hao Zhu, Xuhui Zhou, Robert Lo, Abishek Sridhar, Xianyi Cheng et al. "Webarena: A realistic web environment for building autonomous agents." arXiv preprint arXiv:2307.13854 (2023).
\ No newline at end of file
diff --git a/python/packages/agbench/benchmarks/WebArena/Scripts/custom_tabulate.py b/python/packages/agbench/benchmarks/WebArena/Scripts/custom_tabulate.py
new file mode 100644
index 000000000000..6697a08749a5
--- /dev/null
+++ b/python/packages/agbench/benchmarks/WebArena/Scripts/custom_tabulate.py
@@ -0,0 +1,35 @@
+import os
+import sys
+import re
+from agbench.tabulate_cmd import default_tabulate
+
+
+def scorer(instance_dir):
+
+    # Read the console log
+    console_log_file = os.path.join(instance_dir, "console_log.txt")
+    if not os.path.isfile(console_log_file):
+        return None
+
+    console_log = ""
+    with open(console_log_file, "rt") as fh:
+        console_log = fh.read()
+
+    final_score = None
+    m = re.search(r"FINAL SCORE:(.*?)\n", console_log, re.DOTALL)
+    if m:
+        final_score = m.group(1).strip()
+
+    # Missing the final score line
+    if final_score is None:
+        return None
+    else:
+        return float(final_score) > 0
+
+
+def main(args):
+    default_tabulate(args, scorer=scorer)
+
+
+if __name__ == "__main__" and __package__ is None:
+    main(sys.argv)
diff --git a/python/packages/agbench/benchmarks/WebArena/Scripts/init_tasks.py b/python/packages/agbench/benchmarks/WebArena/Scripts/init_tasks.py
new file mode 100644
index 000000000000..5ba3fd4d08f4
--- /dev/null
+++ b/python/packages/agbench/benchmarks/WebArena/Scripts/init_tasks.py
@@ -0,0 +1,122 @@
+#
+# Run this file to download the WebArena task configurations, and create the corresponding
+# agbench task files in the Tasks directory (one JSONL file per template and site grouping).
+#
+
+import requests
+import tarfile
+import hashlib
+import io
+import json
+import os
+import re
+import sys
+
+URL = "https://raw.githubusercontent.com/web-arena-x/webarena/main/config_files/test.raw.json"
+
+SCRIPT_PATH = os.path.realpath(__file__)
+SCRIPT_NAME = os.path.basename(SCRIPT_PATH)
+SCRIPT_DIR = 
os.path.dirname(SCRIPT_PATH) + +SCENARIO_DIR = os.path.realpath(os.path.join(SCRIPT_DIR, os.path.pardir)) +TEMPLATES_DIR = os.path.join(SCENARIO_DIR, "Templates") +TASKS_DIR = os.path.join(SCENARIO_DIR, "Tasks") +DOWNLOADS_DIR = os.path.join(SCENARIO_DIR, "Downloads") + + +def download(): + """Download the WebArena dataset (if not already downloaded). + Return a JSON list of problem instances.""" + + if not os.path.isdir(DOWNLOADS_DIR): + os.mkdir(DOWNLOADS_DIR) + + json_file = os.path.join(DOWNLOADS_DIR, "test.raw.json") + + if not os.path.isfile(json_file): + # Send a HTTP request to the URL + response = requests.get(URL, stream=True) + response.raise_for_status() + + # If the HTTP request returns a status code 200, proceed + with open(json_file, "wb") as fh: + for chunk in response.iter_content(chunk_size=512): + fh.write(chunk) + + # Load the problems + problems = None + with open(json_file, "rb") as fh: + problems = json.load(fh) + return problems + + +def create_jsonl(name, tasks, template): + """Creates a JSONL scenario file with a given name, dictionary of MATH problems, and template path.""" + + # Create a task directory if it doesn't exist + if not os.path.isdir(TASKS_DIR): + os.mkdir(TASKS_DIR) + + # Create the jsonl file + prompt_fields = ["task_id", "intent_template_id", "sites", "require_login", "start_url", "geolocation", "intent"] + with open(os.path.join(TASKS_DIR, name + ".jsonl"), "wt") as fh: + for task in tasks: + print(f"Converting: {name}, {task['task_id']}") + + task_prompt = {} + for field in prompt_fields: + task_prompt[field] = task[field] + + record = { + "id": str(task["task_id"]), + "template": [os.path.join(TEMPLATES_DIR, "Common"), template], + "substitutions": { + "task_prompt.json.txt": {"__TASK_PROMPT__": json.dumps(task_prompt, indent=4)}, + "full_task.json.txt": {"__FULL_TASK__": json.dumps(task, indent=4)}, + }, + } + + fh.write(json.dumps(record).strip() + "\n") + + +############################################################################### +def main(): + tasks = download() + + # list all directories in the Templates directory + # and populate a dictionary with the name and path + templates = {} + for entry in os.scandir(TEMPLATES_DIR): + if entry.is_dir(): + if entry.name == "Common": # Skip the common template, which will be included in all + continue + templates[re.sub(r"\s", "", entry.name)] = entry.path + + # Divide the tasks by their websites and if they are validation or test + page_groups = dict() + for task in tasks: + + # We don't know how the intent ids are distributed, so hash them to get a uniform distribution + template_hash = hashlib.md5(str(task["intent_template_id"]).encode("utf-8")).hexdigest() + + # The full hash will consist of 32 hexadecimal digits. 
We can get a 50/50 split by checking if the first digit is in the range (0-7) vs (8-F) + task_set = "validation" if template_hash[0] in "01234567" else "test" + + key = task["sites"][0] + if len(task["sites"]) > 1: + key = "several_sites" + key = task_set + "_" + key + + # key = "__".join(sorted([s for s in task["sites"]])) + if key not in page_groups: + page_groups[key] = list() + page_groups[key].append(task) + + # Create the json files + for t in templates.items(): + for pg in page_groups: + create_jsonl(f"webarena__{pg}_{t[0]}", page_groups[pg], t[1]) + + +if __name__ == "__main__" and __package__ is None: + main() diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/ATTRIBUTION b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/ATTRIBUTION new file mode 100644 index 000000000000..0713904fb45b --- /dev/null +++ b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/ATTRIBUTION @@ -0,0 +1,28 @@ +The contents of this `evaluation_harness` folder are adapted from: + + https://github.com/web-arena-x/webarena + +under the following license: + +========================================================================================================= + +Copyright (c) 2024 Jing Yu Koh, Robert Lo, Lawrence Jang, Vikram Duvvur, Ming Chong Lim, and Po-Yu Huang + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/LICENSE b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/LICENSE new file mode 100644 index 000000000000..f49a4e16e68b --- /dev/null +++ b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. 
This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/__init__.py b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/__init__.py new file mode 100644 index 000000000000..e942c1066769 --- /dev/null +++ b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/__init__.py @@ -0,0 +1,6 @@ +from .evaluators import * +from .helper_functions import ( + shopping_get_latest_order_url, + shopping_get_sku_latest_review_author, + shopping_get_sku_latest_review_rating, +) diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/env_config.py b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/env_config.py new file mode 100644 index 000000000000..ed84ae4735ef --- /dev/null +++ b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/env_config.py @@ -0,0 +1,90 @@ +# websites domain +import os + +REDDIT = os.environ.get("REDDIT", "") +SHOPPING = os.environ.get("SHOPPING", "") +SHOPPING_ADMIN = os.environ.get("SHOPPING_ADMIN", "") +GITLAB = os.environ.get("GITLAB", "") +WIKIPEDIA = os.environ.get("WIKIPEDIA", "") +MAP = os.environ.get("MAP", "") +HOMEPAGE = os.environ.get("HOMEPAGE", "") + +REDDIT_USERNAME = os.environ.get("REDDIT_USERNAME", "") +REDDIT_PASSWORD = os.environ.get("REDDIT_PASSWORD", "") + +GITLAB_USERNAME = os.environ.get("GITLAB_USERNAME", "") +GITLAB_PASSWORD = os.environ.get("GITLAB_PASSWORD", "") + +SHOPPING_USERNAME = os.environ.get("SHOPPING_USERNAME", "") +SHOPPING_PASSWORD = os.environ.get("SHOPPING_PASSWORD", "") + +SHOPPING_ADMIN_USERNAME = os.environ.get("SHOPPING_ADMIN_USERNAME", "") +SHOPPING_ADMIN_PASSWORD = os.environ.get("SHOPPING_ADMIN_PASSWORD", "") + +assert REDDIT and SHOPPING and SHOPPING_ADMIN and GITLAB and WIKIPEDIA and MAP and HOMEPAGE, ( + "Please setup the URLs to each site. Current: \n" + + f"Reddit: {REDDIT}\n" + + f"Shopping: {SHOPPING}\n" + + f"Shopping Admin: {SHOPPING_ADMIN}\n" + + f"Gitlab: {GITLAB}\n" + + f"Wikipedia: {WIKIPEDIA}\n" + + f"Map: {MAP}\n" + + f"Homepage: {HOMEPAGE}\n" +) + +ACCOUNTS = { + "reddit": {"username": REDDIT_USERNAME, "password": REDDIT_PASSWORD}, + "gitlab": {"username": GITLAB_USERNAME, "password": GITLAB_PASSWORD}, + "shopping": {"username": SHOPPING_USERNAME, "password": SHOPPING_PASSWORD}, + "shopping_admin": {"username": SHOPPING_ADMIN_USERNAME, "password": SHOPPING_ADMIN_PASSWORD}, + "shopping_site_admin": {"username": SHOPPING_ADMIN_USERNAME, "password": SHOPPING_ADMIN_PASSWORD}, +} + +URL_MAPPINGS = { + REDDIT: "http://reddit.com", + SHOPPING: "http://onestopmarket.com", + SHOPPING_ADMIN: "http://luma.com/admin", + GITLAB: "http://gitlab.com", + WIKIPEDIA: "http://wikipedia.org", + MAP: "http://openstreetmap.org", + HOMEPAGE: "http://homepage.com", +} + +# ADDED BY MSR Frontiers +######################### +SITE_URLS = { + "reddit": REDDIT, + "gitlab": GITLAB, + "shopping": SHOPPING, + "shopping_admin": SHOPPING_ADMIN, + "shopping_site_admin": SHOPPING_ADMIN, + "map": MAP, + "wikipedia": WIKIPEDIA, +} + +LOGIN_PROMPTS = { + "reddit": f"Type '{REDDIT}' into the address bar to navigate to the site. Click 'Log in', type the username '{ACCOUNTS['reddit']['username']}', and password is '{ACCOUNTS['reddit']['password']}'. Finally click the login button.", + "gitlab": f"Type '{GITLAB}' into the address bar to navigate to the site. 
At the log in prompt, type the username '{ACCOUNTS['gitlab']['username']}', and the password '{ACCOUNTS['gitlab']['password']}'. Finally click the 'Sign in' button.", + "shopping": f"Type '{SHOPPING}' into the address bar to navigate to the site. Click 'Sign In' at the top of the page. Enter the Email '{ACCOUNTS['shopping']['username']}', and password '{ACCOUNTS['shopping']['password']}'. Finally click the 'Sign In' button.", + "shopping_admin": f"Type '{SHOPPING_ADMIN}' into the address bar to navigate to the site. At the log in prompt, enter the username '{ACCOUNTS['shopping_admin']['username']}', and the password '{ACCOUNTS['shopping_admin']['password']}'. Finally click the 'Sign In' button.", +} + +SITE_DESCRIPTIONS = { + "reddit": "a Postmill forum populated with a large sample of data crawled from Reddit. Postmill is similar to Reddit, but the UI is distinct, and 'subreddits' begin with /f/ rather than /r/", + "gitlab": "a Gitlab site populated with various programming projects. Gitlab is similar to GitHub, though the UIs are slightly different", + "shopping": "an online store built with the Magento open source eCommerce platform", + "shopping_admin": "the content management admin portal for an online store running the Magento open source eCommerce software", +} + + +def url_to_sitename(url): + if url.startswith(REDDIT): + return "reddit" + elif url.startswith(GITLAB): + return "gitlab" + elif url.startswith(SHOPPING): + return "shopping" + elif url.startswith(SHOPPING_ADMIN): + return "shopping_admin" + else: + return None diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/evaluators.py b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/evaluators.py new file mode 100644 index 000000000000..05c9a3bc15a1 --- /dev/null +++ b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/evaluators.py @@ -0,0 +1,387 @@ +"""From WebArena. 
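+Defines the string-match, URL-match, and HTML-content evaluators used to score tasks, plus the Evaluator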
base class for evaluation""" + +# answer string match +import collections +import html +import importlib +import json +import time +import urllib +import inspect +from pathlib import Path +from typing import Any, Tuple, Union, TypedDict, Dict + +from beartype import beartype +from nltk.tokenize import word_tokenize # type: ignore +from playwright.async_api import CDPSession, Page + +import numpy as np +import numpy.typing as npt + +from .helper_functions import ( + PseudoPage, + gitlab_get_project_memeber_role, + llm_fuzzy_match, + llm_ua_match, + reddit_get_post_url, + shopping_get_latest_order_url, + shopping_get_sku_latest_review_author, + shopping_get_sku_latest_review_rating, +) + + +# Subset used for evaluation (added by: adamfo) +##################################################################### +class Action(TypedDict): + answer: str + + +Observation = str | npt.NDArray[np.uint8] + + +class StateInfo(TypedDict): + observation: dict[str, Observation] + info: Dict[str, Any] + + +Trajectory = list[Union[Action, StateInfo]] + + +def make_answer_trajecotry(answer: str) -> Trajectory: + ans = Action() + ans["answer"] = answer + return [ans] + + +##################################################################### +class Evaluator(object): + def __init__(self, eval_tag: str = "") -> None: + self.eval_tag = eval_tag + + @beartype + async def __call__( + self, + trajectory: Trajectory, + config_file: Path | str, + page: Page | PseudoPage, + client: CDPSession, + azure_config: dict[str, Any] | None = None, + ) -> float: + raise NotImplementedError + + @staticmethod + def get_last_action(trajectory: Trajectory) -> Action: + try: + # is_bearable(trajectory[-1], Action) + last_action = trajectory[-1] + except Exception: + raise ValueError("The last element of trajectory should be an action, add a fake stop action if needed") + + return last_action # type: ignore[return-value] + + @staticmethod + def get_last_state(trajectory: Trajectory) -> StateInfo: + try: + # is_bearable(trajectory[-2], StateInfo) + last_state = trajectory[-2] + except Exception: + raise ValueError( + "The second last element of trajectory should be a state, add a fake stop action if needed" + ) + + return last_state # type: ignore[return-value] + + +class StringEvaluator(Evaluator): + """Check whether the answer is correct with: + exact match: the answer is exactly the same as the reference answer + must include: each phrase in the reference answer must be included in the answer + fuzzy match: the answer is similar to the reference answer, using LLM judge + """ + + @staticmethod + @beartype + def clean_answer(answer: str) -> str: + answer = answer.strip() + if answer.startswith("'") and answer.endswith("'"): + answer = answer[1:-1] + elif answer.startswith('"') and answer.endswith('"'): + answer = answer[1:-1] + return answer.lower() + + @staticmethod + @beartype + def exact_match(ref: str, pred: str) -> float: + return float(StringEvaluator.clean_answer(pred) == StringEvaluator.clean_answer(ref)) + + @staticmethod + @beartype + def must_include(ref: str, pred: str, tokenize: bool = False) -> float: + clean_ref = StringEvaluator.clean_answer(ref) + clean_pred = StringEvaluator.clean_answer(pred) + # tokenize the answer if the ref is a single word + # prevent false positive (e.g, 0) + if tokenize and len(clean_ref) == 1 and len(word_tokenize(clean_ref)) == 1: + tok_pred = word_tokenize(clean_pred) + return float(clean_ref in tok_pred) + else: + return float(clean_ref in clean_pred) + + @staticmethod + @beartype + def 
fuzzy_match(ref: str, pred: str, intent: str, azure_config: dict[str, Any] | None) -> float: + return llm_fuzzy_match(pred, ref, intent, azure_config) + + @staticmethod + @beartype + def ua_match(ref: str, pred: str, intent: str, azure_config: dict[str, Any] | None) -> float: + return llm_ua_match(pred, ref, intent, azure_config) + + async def __call__( + self, + trajectory: Trajectory, + config_file: Path | str, + page: Page | PseudoPage | None = None, + client: CDPSession | None = None, + azure_config: dict[str, Any] | None = None, + ) -> float: + with open(config_file, "r") as f: + configs = json.load(f) + + last_action = self.get_last_action(trajectory) + pred = self.clean_answer(last_action["answer"]) + + score = 1.0 + for approach, value in configs["eval"]["reference_answers"].items(): + match approach: + case "exact_match": + score *= self.exact_match(ref=value, pred=pred) + + case "must_include": + assert isinstance(value, list) + for must_value in value: + score *= self.must_include( + ref=must_value, + pred=pred, + tokenize=(len(value) == 1), + ) + case "fuzzy_match": + intent = configs["intent"] + if value == "N/A": + # if the instruction only asks the model to generate N/A when encountering an unachievable task + # without more concrete reasons + score *= self.exact_match(ref=value, pred=pred) + # if the instruction also asks the model to generate the reason why the task is unachievable + # this should be the default as it will prevent false positive N/A` + if score != 1: + score = 1.0 * self.ua_match( + intent=configs["intent"], + ref=configs["eval"]["string_note"], + pred=pred, + azure_config=azure_config, + ) + else: + assert isinstance(value, list) + for reference in value: + score *= self.fuzzy_match( + ref=reference, pred=pred, intent=intent, azure_config=azure_config + ) + return score + + +class URLEvaluator(Evaluator): + """Check URL matching""" + + @beartype + async def __call__( + self, + trajectory: Trajectory, + config_file: Path | str, + page: Page | PseudoPage, + client: CDPSession | None = None, + azure_config: dict[str, Any] | None = None, + ) -> float: + with open(config_file, "r") as f: + configs = json.load(f) + + def clean_url(url: str) -> str: + url = str(url) + url = url.rstrip("/") + return url + + def parse_url(url: str) -> tuple[str, dict[str, list[str]]]: + """Parse a URL into its base, path, and query components.""" + parsed_url = urllib.parse.urlparse(url) + base_path = parsed_url.netloc + parsed_url.path + query = urllib.parse.parse_qs(parsed_url.query) + return base_path, query + + def parse_urls( + urls: list[str], + ) -> tuple[list[str], dict[str, set[str]]]: + """Parse a list of URLs.""" + base_paths = [] + queries = collections.defaultdict(set) + for url in urls: + base_path, query = parse_url(url) + base_paths.append(base_path) + for k, v in query.items(): + queries[k].update(v) + return base_paths, queries + + pred = clean_url(page.url) + ref_urls = configs["eval"]["reference_url"].split(" |OR| ") + ref_urls = [clean_url(url) for url in ref_urls] + matching_rule = configs["eval"].get("url_note", "GOLD in PRED") + if matching_rule == "GOLD in PRED": + print(f"Pred: {pred}") + print(f"Ref: {ref_urls}") + ref_base_paths, ref_queries = parse_urls(ref_urls) + pred_base_paths, pred_query = parse_url(pred) + + base_score = float(any([ref_base_path in pred_base_paths for ref_base_path in ref_base_paths])) + query_score = 1.0 + for k, possible_values in ref_queries.items(): + query_score *= float( + any(possible_ref_value in pred_query.get(k, []) for 
possible_ref_value in possible_values) + ) + score = base_score * query_score + + else: + raise ValueError(f"Unknown matching rule: {matching_rule}") + + return score + + +class HTMLContentEvaluator(Evaluator): + """Check whether the contents appear in the page""" + + @beartype + async def __call__( + self, + trajectory: Trajectory, + config_file: Path | str, + page: Page | PseudoPage, + client: CDPSession | None = None, + azure_config: dict[str, Any] | None = None, + ) -> float: + with open(config_file, "r") as f: + configs = json.load(f) + + targets = configs["eval"]["program_html"] + + score = 1.0 + for target in targets: + target_url: str = target["url"] # which url to check + if target_url.startswith("func"): + func = target_url.split("func:")[1] + func = func.replace("__last_url__", page.url) + target_url = eval(func) + if inspect.isawaitable(target_url): + target_url = await target_url + + locator: str = target["locator"] # js element locator + + # navigate to that url + if target_url != "last": + await page.goto(target_url) + time.sleep(3) # TODO [shuyanzh]: fix this hard-coded sleep + + # empty, use the full page + if not locator.strip(): + selected_element = await page.content() + # use JS to select the element + elif locator.startswith("document.") or locator.startswith("[...document."): + if "prep_actions" in target: + try: + for prep_action in target["prep_actions"]: + await page.evaluate(f"() => {prep_action}") + except Exception: + pass + try: + selected_element = await page.evaluate(f"() => {locator}") + selected_element = str(selected_element) + if not selected_element: + selected_element = "" + except Exception: + # the page is wrong, return empty + selected_element = "" + # run program to call API + elif locator.startswith("func:"): # a helper function + func = locator.split("func:")[1] + func = func.replace("__page__", "page") + selected_element = eval(func) + if inspect.isawaitable(selected_element): + selected_element = await selected_element + else: + raise ValueError(f"Unknown locator: {locator}") + + selected_element = html.unescape(selected_element) + + if "exact_match" in target["required_contents"]: + required_contents = target["required_contents"]["exact_match"] + cur_score = StringEvaluator.exact_match(ref=required_contents, pred=selected_element) + score *= float(cur_score) + # print(f"[exact match] {cur_score}, selected element: {selected_element}, required contents: {required_contents}") + elif "must_include" in target["required_contents"]: + required_contents = target["required_contents"]["must_include"] + assert isinstance(required_contents, list) + for content in required_contents: + content_or = content.split(" |OR| ") + cur_score = any( + [ + StringEvaluator.must_include( + ref=content, + pred=selected_element, + tokenize=False, + ) + for content in content_or + ] + ) + score *= float(cur_score) + # print(f"[must include] {cur_score}, selected element: {selected_element}, required contents: {content_or}") + else: + raise ValueError(f"Unknown required_contents: {target['required_contents'].keys()}") + return score + + +class EvaluatorComb: + def __init__(self, evaluators: list[Evaluator]) -> None: + self.evaluators = evaluators + + @beartype + async def __call__( + self, + trajectory: Trajectory, + config_file: Path | str, + page: Page | PseudoPage, + client: CDPSession, + azure_config: dict[str, Any] | None = None, + ) -> float: + score = 1.0 + for evaluator in self.evaluators: + cur_score = await evaluator(trajectory, config_file, page, client, 
azure_config) + score *= cur_score + return score + + +@beartype +def evaluator_router(config_file: Path | str) -> EvaluatorComb: + """Router to get the evaluator class""" + with open(config_file, "r") as f: + configs = json.load(f) + + eval_types = configs["eval"]["eval_types"] + evaluators: list[Evaluator] = [] + for eval_type in eval_types: + match eval_type: + case "string_match": + evaluators.append(StringEvaluator()) + case "url_match": + evaluators.append(URLEvaluator()) + case "program_html": + evaluators.append(HTMLContentEvaluator()) + case _: + raise ValueError(f"eval_type {eval_type} is not supported") + + return EvaluatorComb(evaluators) diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/helper_functions.py b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/helper_functions.py new file mode 100644 index 000000000000..eff8520b5ab4 --- /dev/null +++ b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/helper_functions.py @@ -0,0 +1,233 @@ +"""From WebArena with minor modifications. Implements helper functions to assist evaluation cases where other evaluators are not suitable.""" + +import json +from typing import Any +from urllib.parse import urlparse + +import requests +from playwright.async_api import Page + +from .env_config import ( + ACCOUNTS, + GITLAB, + MAP, + REDDIT, + SHOPPING, + SHOPPING_ADMIN, + WIKIPEDIA, +) + +from .openai_utils import ( + generate_from_openai_chat_completion, +) + +import autogen + + +def shopping_get_auth_token() -> str: + response = requests.post( + url=f"{SHOPPING}/rest/default/V1/integration/admin/token", + headers={"content-type": "application/json"}, + data=json.dumps( + { + "username": ACCOUNTS["shopping_site_admin"]["username"], + "password": ACCOUNTS["shopping_site_admin"]["password"], + } + ), + ) + token: str = response.json() + return token + + +def shopping_get_latest_order_url() -> str: + """Get the latest order url from the shopping website.""" + + header = { + "Authorization": f"Bearer {shopping_get_auth_token()}", + "Content-Type": "application/json", + } + + params = { + "searchCriteria[sortOrders][0][field]": "created_at", + "searchCriteria[sortOrders][0][direction]": "DESC", + "searchCriteria[pageSize]": "1", + } + + response = requests.get(f"{SHOPPING}/rest/V1/orders", params=params, headers=header) + assert response.status_code == 200 + response_obj = response.json()["items"][0] + order_id = int(response_obj["increment_id"]) + order_url = f"{SHOPPING}/sales/order/view/order_id/{order_id}/" + return order_url + + +def shopping_get_sku_latest_review_author(sku: str) -> str: + """Get the latest review for shopping admin.""" + header = { + "Authorization": f"Bearer {shopping_get_auth_token()}", + "Content-Type": "application/json", + } + response = requests.get(f"{SHOPPING}/rest/V1/products/{sku}/reviews", headers=header) + assert response.status_code == 200 + response_obj = response.json() + if len(response_obj) == 0: + return "" + author: str = response_obj[-1]["nickname"] + return author + + +def shopping_get_sku_latest_review_rating(sku: str) -> str: + """Get the latest review for shopping admin.""" + header = { + "Authorization": f"Bearer {shopping_get_auth_token()}", + "Content-Type": "application/json", + } + response = requests.get(f"{SHOPPING}/rest/V1/products/{sku}/reviews", headers=header) + assert response.status_code == 200 + response_obj = response.json() + if len(response_obj) == 0: + return "" + assert 
response_obj[0]["ratings"][0]["rating_name"] == "Rating" + rating: str = str(response_obj[-1]["ratings"][0]["percent"]) + return rating + + +def reddit_get_post_url(url: str) -> str: + """Get the post url""" + # Url is http://domain/f/subreddit/post_id/... + # get domain, subreddit, post_id + domain = urlparse(url).netloc + tok_url = urlparse(url).path.split("/") + # not a valid post/comment url, return the url as is + if len(tok_url) < 4: + return url + if tok_url[1] != "f": + return url + subreddit = urlparse(url).path.split("/")[2] + post_id = urlparse(url).path.split("/")[3] + scheme = urlparse(url).scheme + post_url = f"{scheme}://{domain}/f/{subreddit}/{post_id}/" + return post_url + + +async def gitlab_get_project_memeber_role(page: Page, account_name: str) -> str: + # get the account index + try: + account_idx = await page.evaluate( + f"""(() => {{ + const elements = document.querySelectorAll("td[data-label='Account'] span.gl-avatar-labeled-sublabel"); + let index = -1; // Default value if not found + + for(let i = 0; i < elements.length; i++) {{ + if(elements[i].outerText === '@{account_name}') {{ + index = i; + break; + }} + }} + + return index; + }})()""" + ) + + # get the role + role: str = await page.evaluate( + f"""(() => {{ + return document.querySelectorAll("td.col-max-role span")[{account_idx}].outerText; + }})()""" + ) + except Exception: + role = "" + + return role + + +def llm_fuzzy_match(pred: str, reference: str, question: str, azure_config: dict[str, Any] | None) -> float: + """Check whether the prediction matches the reference with GPT4-turbo""" + messages: list[dict[str, Any]] = [] + # construct the question to ask + message = "Help a teacher to grade the answer of a student given a question. Keep in mind that the student may use different phrasing or wording to answer the question. The goal is to evaluate whether the answer is semantically equivalent to the reference answer.\n" + message += f"question: {question}\n" + message += f"reference answer: {reference}\n" + message += "all the string 'N/A' that you see is a special sequence that means 'not achievable'\n" + message += f"student answer: {pred}\n" + message += "Conclude the judgement by correct/incorrect/partially correct." + messages = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": message}, + ] + + response = None + if azure_config is None: + response = generate_from_openai_chat_completion( + model="gpt-4-1106-preview", + messages=messages, + temperature=0, + max_tokens=768, + top_p=1.0, + context_length=0, + ).lower() + else: + client = autogen.OpenAIWrapper(**azure_config) + raw_response = client.create(context=None, messages=messages) + response = client.extract_text_or_completion_object(raw_response)[0].lower() + + if "partially correct" in response or "incorrect" in response: + return 0.0 + else: + assert "correct" in response + return 1.0 + + +def llm_ua_match(pred: str, reference: str, question: str, azure_config: dict[str, Any] | None) -> float: + """Check whether the prediction matches the reference with GPT-turbo""" + messages: list[dict[str, Any]] = [] + # construct the question to ask + message = "" + message += f"task: {question}\n" + message += f"actual unachievable reason: {reference}\n" + message += f"reported unachievable reason: {pred}\n" + message += ( + "The task described above is inherently unachievable due to the reason specified under 'actual unachievable reason'. 
" + "An individual previously attempted this task and was unable to complete it. They provided a reason for their failure, " + "which is listed under 'reported unachievable reason'. Your role is to review both the actual and reported reasons. " + "Determine if the reported reason aligns with the actual reason, even if implicitly. " + "If the stated reason is in line with the actual reason, respond with 'same'. Otherwise, respond with 'different'." + ) + messages = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": message}, + ] + + response = None + if azure_config is None: + response = generate_from_openai_chat_completion( + model="gpt-4-1106-preview", + messages=messages, + temperature=0, + max_tokens=768, + top_p=1.0, + context_length=0, + ).lower() + else: + client = autogen.OpenAIWrapper(**azure_config) + raw_response = client.create(context=None, messages=messages) + response = client.extract_text_or_completion_object(raw_response)[0].lower() + + if "different" in response: + return 0.0 + else: + assert "same" in response + return 1.0 + + +class PseudoPage: + def __init__(self, original_page: Page, url: str): + self.url = url + self.original_page = original_page + + def __getattr__(self, attr: str) -> Any: + # Delegate attribute access to the original page object + if attr not in ["url"]: + return getattr(self.original_page, attr) + else: + return getattr(self, attr) diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/openai_utils.py b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/openai_utils.py new file mode 100644 index 000000000000..1381f392cdf2 --- /dev/null +++ b/python/packages/agbench/benchmarks/WebArena/Templates/Common/evaluation_harness/openai_utils.py @@ -0,0 +1,275 @@ +"""Tools to generate from OpenAI prompts. 
+Adopted from https://github.com/zeno-ml/zeno-build/""" + +import asyncio +import logging +import os +import random +import time +from typing import Any + +import aiolimiter +import openai +from openai import AsyncOpenAI, OpenAI + +client = None +aclient = None +if "OPENAI_API_KEY" not in os.environ and "OAI_CONFIG_LIST" not in os.environ: + raise ValueError("Neither OPENAI_API_KEY nor OAI_CONFIG_LIST is defined in the environment.") + +if "OPENAI_API_KEY" in os.environ: + client = OpenAI(api_key=os.environ["OPENAI_API_KEY"]) + aclient = AsyncOpenAI(api_key=os.environ["OPENAI_API_KEY"]) +from tqdm.asyncio import tqdm_asyncio + + +def retry_with_exponential_backoff( # type: ignore + func, + initial_delay: float = 1, + exponential_base: float = 2, + jitter: bool = True, + max_retries: int = 3, + errors: tuple[Any] = ( + openai.RateLimitError, + openai.BadRequestError, + openai.InternalServerError, + ), +): + """Retry a function with exponential backoff.""" + + def wrapper(*args, **kwargs): # type: ignore + # Initialize variables + num_retries = 0 + delay = initial_delay + + # Loop until a successful response or max_retries is hit or an exception is raised + while True: + try: + + return func(*args, **kwargs) + + # Retry on specified errors + except errors: + # Increment retries + num_retries += 1 + + # Check if max retries has been reached + if num_retries > max_retries: + raise Exception(f"Maximum number of retries ({max_retries}) exceeded.") + + # Increment the delay + delay *= exponential_base * (1 + jitter * random.random()) + + # Sleep for the delay + time.sleep(delay) + + # Raise exceptions for any errors not specified + except Exception as e: + raise e + + return wrapper + + +async def _throttled_openai_completion_acreate( + engine: str, + prompt: str, + temperature: float, + max_tokens: int, + top_p: float, + limiter: aiolimiter.AsyncLimiter, +) -> dict[str, Any]: + async with limiter: + for _ in range(3): + try: + return await aclient.completions.create( + engine=engine, + prompt=prompt, + temperature=temperature, + max_tokens=max_tokens, + top_p=top_p, + ) + except openai.RateLimitError: + logging.warning("OpenAI API rate limit exceeded. Sleeping for 10 seconds.") + await asyncio.sleep(10) + except openai.APIError as e: + logging.warning(f"OpenAI API error: {e}") + break + return {"choices": [{"message": {"content": ""}}]} + + +async def agenerate_from_openai_completion( + prompts: list[str], + engine: str, + temperature: float, + max_tokens: int, + top_p: float, + context_length: int, + requests_per_minute: int = 300, +) -> list[str]: + """Generate from OpenAI Completion API. + + Args: + prompts: list of prompts + temperature: Temperature to use. + max_tokens: Maximum number of tokens to generate. + top_p: Top p to use. + context_length: Length of context to use. + requests_per_minute: Number of requests per minute to allow. + + Returns: + List of generated responses. 
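+
+    Illustrative sketch only (not called by the evaluation harness): the engine
+    name below is a placeholder and must be a completion-capable model available
+    to your OPENAI_API_KEY. Run inside an async function:
+
+        responses = await agenerate_from_openai_completion(
+            prompts=["Say hello."],
+            engine="gpt-3.5-turbo-instruct",
+            temperature=0.0,
+            max_tokens=32,
+            top_p=1.0,
+            context_length=0,
+        )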
+ """ + if "OPENAI_API_KEY" not in os.environ: + raise ValueError("OPENAI_API_KEY environment variable must be set when using OpenAI API.") + + limiter = aiolimiter.AsyncLimiter(requests_per_minute) + async_responses = [ + _throttled_openai_completion_acreate( + engine=engine, + prompt=prompt, + temperature=temperature, + max_tokens=max_tokens, + top_p=top_p, + limiter=limiter, + ) + for prompt in prompts + ] + responses = await tqdm_asyncio.gather(*async_responses) + return [x["choices"][0]["text"] for x in responses] + + +@retry_with_exponential_backoff +def generate_from_openai_completion( + prompt: str, + engine: str, + temperature: float, + max_tokens: int, + top_p: float, + context_length: int, + stop_token: str | None = None, +) -> str: + if "OPENAI_API_KEY" not in os.environ: + raise ValueError("OPENAI_API_KEY environment variable must be set when using OpenAI API.") + + response = client.completions.create( + prompt=prompt, + engine=engine, + temperature=temperature, + max_tokens=max_tokens, + top_p=top_p, + stop=[stop_token], + ) + answer: str = response["choices"][0]["text"] + return answer + + +async def _throttled_openai_chat_completion_acreate( + model: str, + messages: list[dict[str, str]], + temperature: float, + max_tokens: int, + top_p: float, + limiter: aiolimiter.AsyncLimiter, +) -> dict[str, Any]: + async with limiter: + for _ in range(3): + try: + return await aclient.chat.completions.create( + model=model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + top_p=top_p, + ) + except openai.RateLimitError: + logging.warning("OpenAI API rate limit exceeded. Sleeping for 10 seconds.") + await asyncio.sleep(10) + except asyncio.exceptions.TimeoutError: + logging.warning("OpenAI API timeout. Sleeping for 10 seconds.") + await asyncio.sleep(10) + except openai.APIError as e: + logging.warning(f"OpenAI API error: {e}") + break + return {"choices": [{"message": {"content": ""}}]} + + +async def agenerate_from_openai_chat_completion( + messages_list: list[list[dict[str, str]]], + engine: str, + temperature: float, + max_tokens: int, + top_p: float, + context_length: int, + requests_per_minute: int = 300, +) -> list[str]: + """Generate from OpenAI Chat Completion API. + + Args: + messages_list: list of message list + temperature: Temperature to use. + max_tokens: Maximum number of tokens to generate. + top_p: Top p to use. + context_length: Length of context to use. + requests_per_minute: Number of requests per minute to allow. + + Returns: + List of generated responses. 
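+
+    Illustrative sketch only; the model name is an example value:
+
+        answers = await agenerate_from_openai_chat_completion(
+            messages_list=[[{"role": "user", "content": "Say hello."}]],
+            engine="gpt-4-1106-preview",
+            temperature=0.0,
+            max_tokens=32,
+            top_p=1.0,
+            context_length=0,
+        )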
+ """ + if "OPENAI_API_KEY" not in os.environ: + raise ValueError("OPENAI_API_KEY environment variable must be set when using OpenAI API.") + + limiter = aiolimiter.AsyncLimiter(requests_per_minute) + async_responses = [ + _throttled_openai_chat_completion_acreate( + model=engine, + messages=message, + temperature=temperature, + max_tokens=max_tokens, + top_p=top_p, + limiter=limiter, + ) + for message in messages_list + ] + responses = await tqdm_asyncio.gather(*async_responses) + return [x["choices"][0]["message"]["content"] for x in responses] + + +@retry_with_exponential_backoff +def generate_from_openai_chat_completion( + messages: list[dict[str, str]], + model: str, + temperature: float, + max_tokens: int, + top_p: float, + context_length: int, + stop_token: str | None = None, +) -> str: + if "OPENAI_API_KEY" not in os.environ: + raise ValueError("OPENAI_API_KEY environment variable must be set when using OpenAI API.") + response = client.chat.completions.create( + model=model, + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + top_p=top_p, + ) + answer: str = response.choices[0].message.content + return answer + + +@retry_with_exponential_backoff +# debug only +def fake_generate_from_openai_chat_completion( + messages: list[dict[str, str]], + model: str, + temperature: float, + max_tokens: int, + top_p: float, + context_length: int, + stop_token: str | None = None, +) -> str: + if "OPENAI_API_KEY" not in os.environ: + raise ValueError("OPENAI_API_KEY environment variable must be set when using OpenAI API.") + + answer = "Let's think step-by-step. This page shows a list of links and buttons. There is a search box with the label 'Search query'. I will click on the search box to type the query. So the action I will perform is \"click [60]\"." 
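+    # Debug helper: the canned reply above ignores every argument, so callers can be
+    # exercised end-to-end without making real OpenAI API calls.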
+ return answer diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/MagenticOne/full_task.json.txt b/python/packages/agbench/benchmarks/WebArena/Templates/MagenticOne/full_task.json.txt new file mode 100644 index 000000000000..2b53498b848c --- /dev/null +++ b/python/packages/agbench/benchmarks/WebArena/Templates/MagenticOne/full_task.json.txt @@ -0,0 +1 @@ +__FULL_TASK__ diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/MagenticOne/requirements.txt b/python/packages/agbench/benchmarks/WebArena/Templates/MagenticOne/requirements.txt new file mode 100644 index 000000000000..2070aca75263 --- /dev/null +++ b/python/packages/agbench/benchmarks/WebArena/Templates/MagenticOne/requirements.txt @@ -0,0 +1,4 @@ +/autogen_python/packages/autogen-core +/autogen_python/packages/autogen-magentic-one +azure-identity +tiktoken \ No newline at end of file diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/MagenticOne/scenario.py b/python/packages/agbench/benchmarks/WebArena/Templates/MagenticOne/scenario.py new file mode 100644 index 000000000000..79a36862f9d2 --- /dev/null +++ b/python/packages/agbench/benchmarks/WebArena/Templates/MagenticOne/scenario.py @@ -0,0 +1,274 @@ +import asyncio +import logging +import json +import os +import re +import nltk + +from typing import Any, Dict, List, Tuple, Union + +from autogen_core.base import AgentId, AgentProxy, TopicId +from autogen_core.application import SingleThreadedAgentRuntime +from autogen_core.application.logging import EVENT_LOGGER_NAME +from autogen_core.components import DefaultSubscription, DefaultTopicId +from autogen_core.components.code_executor import LocalCommandLineCodeExecutor +from autogen_core.components.models import ( + AzureOpenAIChatCompletionClient, + ChatCompletionClient, + ModelCapabilities, + UserMessage, + SystemMessage, + LLMMessage, +) +from autogen_magentic_one.markdown_browser import MarkdownConverter, UnsupportedFormatException +from autogen_magentic_one.agents.coder import Coder, Executor +from autogen_magentic_one.agents.orchestrator import RoundRobinOrchestrator, LedgerOrchestrator +from autogen_magentic_one.messages import BroadcastMessage, OrchestrationEvent, RequestReplyMessage, ResetMessage, DeactivateMessage +from autogen_magentic_one.agents.multimodal_web_surfer import MultimodalWebSurfer +from autogen_magentic_one.agents.file_surfer import FileSurfer +from autogen_magentic_one.utils import LogHandler, message_content_to_str, create_completion_client_from_env + + +import evaluation_harness +from evaluation_harness.env_config import ( + ACCOUNTS, + GITLAB, + MAP, + REDDIT, + SHOPPING, + SHOPPING_ADMIN, + WIKIPEDIA, + HOMEPAGE, + SITE_URLS, + LOGIN_PROMPTS, + SITE_DESCRIPTIONS, + url_to_sitename, +) + +REPLACEMENTS = { + "__REDDIT__": REDDIT, + "__SHOPPING__": SHOPPING, + "__SHOPPING_ADMIN__": SHOPPING_ADMIN, + "__GITLAB__": GITLAB, + "__WIKIPEDIA__": WIKIPEDIA, + "__MAP__": MAP, + "__HOMEPAGE__": HOMEPAGE, +} + +nltk.download("punkt") + + +async def response_preparer(task: str, source: str, client: ChatCompletionClient, transcript: List[LLMMessage]) -> str: + messages: List[LLMMessage] = [ + UserMessage( + content=f"Earlier you were asked the following:\n\n{task}\n\nYour team then worked diligently to address that request. 
Here is a transcript of that conversation:", + source=source, + ) + ] + + # copy them to this context + for message in transcript: + messages.append( + UserMessage( + content = message_content_to_str(message.content), + # TODO fix this -> remove type ignore + source=message.source, # type: ignore + ) + ) + + # ask for the final answer + messages.append( + UserMessage( + content= f""" +Read the above conversation and output a FINAL ANSWER to the original request. The original request is repeated here for convenience: + +{task} + +To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER] +Your FINAL ANSWER should be as few words as possible. +If the original request was not a question, or you did not find a definitive answer, simply summarize the final state of the page or task as your FINAL ANSWER.""", + source=source, + ) + ) + + response = await client.create(messages) + assert isinstance(response.content, str) + return response.content + + +async def main() -> None: + # Expand the prompt and the full task + task_prompt = "" + TASK = None + with open("task_prompt.json.txt", "rt") as fh: + task_prompt = fh.read() + with open("task_prompt.json", "wt") as fh: + for k in REPLACEMENTS: + task_prompt = task_prompt.replace(k, REPLACEMENTS[k]) + fh.write(task_prompt) + TASK = json.loads(task_prompt) + if TASK["start_url"] == REDDIT: + TASK["start_url"] = TASK["start_url"] + "/forums/all" + + full_task = "" + with open("full_task.json.txt", "rt") as fh: + full_task = fh.read() + with open("full_task.json", "wt") as fh: + for k in REPLACEMENTS: + full_task = full_task.replace(k, REPLACEMENTS[k]) + fh.write(full_task) + + # Create the runtime. + runtime = SingleThreadedAgentRuntime() + + # Create the AzureOpenAI client, with AAD auth + client = create_completion_client_from_env() + # Login assistant + await runtime.register( + "LoginAssistant", + lambda: Coder( + model_client=client, + system_messages=[ + SystemMessage("""You are a general-purpose AI assistant and can handle many questions -- but you don't have access to a web browser. However, the user you are talking to does have a browser, and you can see the screen. Provide short direct instructions to them to take you where you need to go to answer the initial question posed to you. 
+ +Once the user has taken the final necessary action to complete the task, and you have fully addressed the initial request, reply with the word TERMINATE.""", + ) + ], + ), + subscriptions=lambda: [DefaultSubscription()], + ) + login_assistant = AgentProxy(AgentId("LoginAssistant", "default"), runtime) + + # Web surfer + await runtime.register( + "WebSurfer", + lambda: MultimodalWebSurfer(), # Configuration is set later by init() + subscriptions=lambda: [DefaultSubscription()], + ) + web_surfer = AgentProxy(AgentId("WebSurfer", "default"), runtime) + + actual_surfer = await runtime.try_get_underlying_agent_instance(web_surfer.id, type=MultimodalWebSurfer) + await actual_surfer.init(model_client=client, downloads_folder=os.getcwd(), browser_channel="chromium") + + # Round-robin orchestrator + await runtime.register( + "round_robin_orc", + lambda: RoundRobinOrchestrator(agents=[web_surfer, login_assistant],), + subscriptions=lambda: [DefaultSubscription()], + ) + round_robin_orc = AgentProxy(AgentId("round_robin_orc", "default"), runtime) + + # Login to the necessary websites + for site in TASK["sites"]: + if site in ["reddit", "gitlab", "shopping", "shopping_admin"]: + actual_surfer.start_page = SITE_URLS[site] + + runtime.start() + await runtime.publish_message( + ResetMessage(), + topic_id=DefaultTopicId(), + ) + await runtime.publish_message( + BroadcastMessage(content=UserMessage(content=LOGIN_PROMPTS[site], source="human")), + topic_id=DefaultTopicId(), + ) + await runtime.stop_when_idle() + + # Deactivate the login-related agents + runtime.start() + await runtime.send_message(DeactivateMessage(), login_assistant.id) + await runtime.send_message(DeactivateMessage(), round_robin_orc.id) + await runtime.stop_when_idle() + + # By this point, we should be logged in. Prepare for the main event + await runtime.register( + "Assistant", + lambda: Coder(model_client=client), + subscriptions=lambda: [DefaultSubscription()], + ) + coder = AgentProxy(AgentId("Assistant", "default"), runtime) + + await runtime.register( + "ComputerTerminal", + lambda: Executor(executor=LocalCommandLineCodeExecutor(), confirm_execution="ACCEPT_ALL"), + subscriptions=lambda: [DefaultSubscription()], + ) + executor = AgentProxy(AgentId("ComputerTerminal", "default"), runtime) + + await runtime.register( + "FileSurfer", + lambda: FileSurfer(model_client=client), + subscriptions=lambda: [DefaultSubscription()], + ) + file_surfer = AgentProxy(AgentId("FileSurfer", "default"), runtime) + + await runtime.register( + "orchestrator", + lambda: LedgerOrchestrator( + agents=[coder, executor, file_surfer, web_surfer], + model_client=client, + max_rounds=30, + max_time=25*60, + ), + subscriptions=lambda: [DefaultSubscription()], + ) + orchestrator = AgentProxy(AgentId("orchestrator", "default"), runtime) + + # The main event + actual_surfer.start_page = TASK["start_url"] + runtime.start() + await runtime.send_message(ResetMessage(), web_surfer.id) + + # Provide some background about the pages + site_description_prompt = "" + sitename = url_to_sitename(TASK["start_url"]) + if sitename: + site_description_prompt = ", " + SITE_DESCRIPTIONS[sitename] + task = f"Your web browser is currently open to the website {TASK['start_url']}{site_description_prompt}. 
On this website, please complete the following task:\n\n{TASK['intent']}" + + await runtime.publish_message( + BroadcastMessage(content=UserMessage(content=task.strip(), source="human")), + topic_id=DefaultTopicId(), + ) + + await runtime.stop_when_idle() + + # Output the final answer + actual_orchestrator = await runtime.try_get_underlying_agent_instance(orchestrator.id, type=LedgerOrchestrator) + transcript: List[LLMMessage] = actual_orchestrator._chat_history # type: ignore + + orc_metadata = await orchestrator.metadata + source = orc_metadata["type"] + final_answer = await response_preparer(task=TASK["intent"], source=source, client=client, transcript=transcript) + + m = re.search("FINAL ANSWER:(.*)$", final_answer, re.DOTALL) + if m: + final_answer = m.group(1).strip() + + print('page.stop("' + final_answer + '")') + print("MAIN TASK COMPLETE !#!#") + + ########## EVALUATION ########## + context = actual_surfer._context + page = actual_surfer._page + cdp_session = await context.new_cdp_session(page) + config_file = "full_task.json" + + evaluator = evaluation_harness.evaluator_router(config_file) + score = await evaluator( + trajectory=evaluation_harness.make_answer_trajecotry(final_answer), + config_file=config_file, + page=page, + client=cdp_session, + # azure_config=llm_config, + ) + + print("FINAL SCORE: " + str(score)) + + +if __name__ == "__main__": + logger = logging.getLogger(EVENT_LOGGER_NAME) + logger.setLevel(logging.INFO) + log_handler = LogHandler() + logger.handlers = [log_handler] + asyncio.run(main()) diff --git a/python/packages/agbench/benchmarks/WebArena/Templates/MagenticOne/task_prompt.json.txt b/python/packages/agbench/benchmarks/WebArena/Templates/MagenticOne/task_prompt.json.txt new file mode 100644 index 000000000000..2ec4f7ee341c --- /dev/null +++ b/python/packages/agbench/benchmarks/WebArena/Templates/MagenticOne/task_prompt.json.txt @@ -0,0 +1 @@ +__TASK_PROMPT__ diff --git a/python/packages/agbench/benchmarks/process_logs.py b/python/packages/agbench/benchmarks/process_logs.py new file mode 100644 index 000000000000..e9aa52532f82 --- /dev/null +++ b/python/packages/agbench/benchmarks/process_logs.py @@ -0,0 +1,217 @@ +""" +Credits: Hussein Mozannar +""" + +import os +import re +import json +import glob +import logging +import pandas as pd + +logging.basicConfig(level=logging.INFO) + + +def process_logs(logs_path, single_benchmark=False): + """ + logs_path: str, path to the logs directory, containing subdirectories for each benchmark subset + returns: pandas DataFrame with all the logs processed + """ + # check if logs_path exists + if not os.path.exists(logs_path): + raise FileNotFoundError( + f"Path {logs_path} does not exist, need to download logs, extract them into one common folder" + ) + if single_benchmark: + # subset should be a list with single folder which is the last part of the path + subsets = [logs_path.split("/")[-1]] + logs_path = "/".join(logs_path.split("/")[:-1]) + + else: + subsets = os.listdir(logs_path) + results = [] + for subset in subsets: + # check if folder is not empty + if not os.listdir(os.path.join(logs_path, subset)) or subset == ".DS_Store" or subset == "__MACOSX": + continue + benchmark_name = subset.split("_")[0] + instances = [ + f + for f in os.listdir(os.path.join(logs_path, subset)) + if os.path.isdir(os.path.join(logs_path, subset, f)) + and os.path.exists(os.path.join(logs_path, subset, f, "0")) + ] + logging.info(f"Processing {subset} with {len(instances)} instances") + for instance in instances: + instance_dir_path 
= os.path.join(logs_path, subset, instance, "0") + try: + correct, expected_answer, final_answer = scorer(instance_dir_path, benchmark_name) + except Exception as e: + logging.error(f"Error processing {instance_dir_path}: {e}") + continue + messages = get_message_logs(instance_dir_path) + results.append( + { + "benchmark": benchmark_name, + "subset_benchmark": subset, + "instance": instance, + "task_information": get_task_information(instance_dir_path, benchmark_name), + "expected_answer": expected_answer, + "final_answer": final_answer, + "correct": correct, + "stalled": did_agent_stall(instance_dir_path), + "num_messages": len(messages), + "messages": messages, + "progress_not_being_made": is_progress_not_being_made(instance_dir_path), + } + ) + df_logs = pd.DataFrame(results) + return df_logs + + +def normalize_answer(a): + """ + Taken from custom_tabulate.py in the WebArena benchmark, given an answer, returns the normalized answer. + Operations: lower case, trim, standardize comma separated values, replace multiple spaces with one space, remove trailing punctuation + a: str, answer + returns: str, normalized answer + """ + norm_answer = ", ".join(a.strip().lower().split(",")) + norm_answer = re.sub(r"[\.\!\?]+$", "", re.sub(r"\s+", " ", norm_answer)) + return norm_answer + + +def scorer(instance_dir, benchmark_name): + """ + Returns results based on the benchmark name and the instance directory. + + benchmark_name: str, the name of the benchmark, either "gaia" or "webarena" + instance_dir: str, path to the instance directory + returns: tuple, (bool, str, str) or None, depending on the benchmark + """ + + if benchmark_name == "gaia" or benchmark_name == "assistant": + # Read the expected answer + expected_answer_file = os.path.join(instance_dir, "expected_answer.txt") + if not os.path.isfile(expected_answer_file): + return None + + with open(expected_answer_file, "rt") as fh: + expected_answer = fh.read().strip() + + # Read the console log + console_log_file = os.path.join(instance_dir, "console_log.txt") + if not os.path.isfile(console_log_file): + return None + + with open(console_log_file, "rt") as fh: + console_log = fh.read() + final_answer = None + m = re.search(r"FINAL ANSWER:(.*?)\n", console_log, re.DOTALL) + if m: + final_answer = m.group(1).strip() + + if final_answer is None: + return None + not_normalized_final = final_answer + + n_ex = normalize_answer(expected_answer) + n_final = normalize_answer(final_answer) + return (n_ex != "" and n_ex == n_final), n_ex, not_normalized_final + + elif benchmark_name == "webarena": + # Read the console log + console_log_file = os.path.join(instance_dir, "console_log.txt") + if not os.path.isfile(console_log_file): + return None + + with open(console_log_file, "rt") as fh: + console_log = fh.read() + final_score = None + m = re.search(r"FINAL SCORE:(.*?)\n", console_log, re.DOTALL) + if m: + final_score = m.group(1).strip() + + if final_score is None: + return None + else: + return float(final_score) > 0, "", "" + + else: + raise ValueError(f"Unsupported benchmark_name: {benchmark_name}") + + +def get_number_of_chat_messages(chat_messages_dir): + # Count the number of chat messages in the chat_messages_dir + result = 0 + for file in glob.glob(f"{chat_messages_dir}/*_messages.json"): + with open(file, "r") as f: + content = json.load(f) + for agent, messages in content.items(): + result += len(messages) + return result + + +def did_agent_stall(instance_dir): + # Check if the agent stalled + log_file_path = os.path.join(instance_dir, 
"log.jsonl") + if not os.path.isfile(log_file_path): + return None + # Stalled.... Replanning... + with open(log_file_path, "r") as f: + for line in f: + if "Stalled.... Replanning..." in line: + return True + return False + + +def get_message_logs(instance_dir): + # Read the log file and return the messages + log_file_path = os.path.join(instance_dir, "log.jsonl") + if not os.path.isfile(log_file_path): + return None + messages = [] + # for each line, convert to dict, check if it has a message and source key, and append to messages + with open(log_file_path, "r") as f: + for line in f: + line_dict = json.loads(line) + if "message" in line_dict and "source" in line_dict: + messages.append(line_dict) + return messages + + +def get_task_information(instance_dir, benchmark_name): + # Read the task information from the log file + if benchmark_name == "gaia" or benchmark_name == "assistant": + prompt_file = os.path.join(instance_dir, "prompt.txt") + if not os.path.isfile(prompt_file): + return None + with open(prompt_file, "r") as f: + return f.read().strip() + elif benchmark_name == "webarena": + task_prompt_file = os.path.join(instance_dir, "task_prompt.json") + if not os.path.isfile(task_prompt_file): + return None + with open(task_prompt_file, "r") as f: + return json.load(f)["intent"] + else: + raise ValueError(f"Unsupported benchmark_name: {benchmark_name}") + + +def is_progress_not_being_made(instance_dir): + # if at any point in the log, progress is not being made, return True + pattern = r'"is_progress_being_made": \{\s+"reason": ".*?",\s+"answer": false\s+\}' + log_file_path = os.path.join(instance_dir, "log.jsonl") + if not os.path.isfile(log_file_path): + return None + with open(log_file_path, "r") as f: + for line in f: + line_dict = json.loads(line) + if ( + "source" in line_dict + and line_dict["source"] == "Orchestrator (thought)" + and "Updated Ledger:" in line_dict["message"] + and re.search(pattern, line_dict["message"]) + ): + return True + return False diff --git a/python/packages/agbench/src/agbench/res/Dockerfile b/python/packages/agbench/src/agbench/res/Dockerfile index aeea8fef92ed..a7da943f343d 100644 --- a/python/packages/agbench/src/agbench/res/Dockerfile +++ b/python/packages/agbench/src/agbench/res/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.11 +FROM mcr.microsoft.com/devcontainers/python:3.11 MAINTAINER AutoGen # Install packages diff --git a/python/packages/autogen-core/docs/src/packages/index.md b/python/packages/autogen-core/docs/src/packages/index.md index 63b30a7335b5..f04f20a297ab 100644 --- a/python/packages/autogen-core/docs/src/packages/index.md +++ b/python/packages/autogen-core/docs/src/packages/index.md @@ -33,7 +33,7 @@ pip install autogen-agentchat==0.4.0dev1 ``` -[{fas}`file-code;pst-color-primary` API Reference](/reference/python/autogen_agentchat/autogen_agentchat.rst) | [{fab}`github;pst-color-primary` Source](https://github.com/microsoft/autogen/tree/main/python/packages/autogen-agentchat) +[{fas}`circle-info;pst-color-primary` User Guide](/user-guide/agentchat-user-guide/index.md) | [{fas}`file-code;pst-color-primary` API Reference](/reference/python/autogen_agentchat/autogen_agentchat.rst) | [{fab}`python;pst-color-primary` PyPI](https://pypi.org/project/autogen-agentchat/0.4.0.dev1/) | [{fab}`github;pst-color-primary` Source](https://github.com/microsoft/autogen/tree/main/python/packages/autogen-agentchat) ::: (pkg-info-autogen-core)= @@ -48,7 +48,7 @@ Implements the core functionality of the AutoGen framework, providing basic buil pip install 
autogen-core==0.4.0dev1 ``` -[{fas}`file-code;pst-color-primary` API Reference](/reference/python/autogen_core/autogen_core.rst) | [{fab}`github;pst-color-primary` Source](https://github.com/microsoft/autogen/tree/main/python/packages/autogen-core) +[{fas}`circle-info;pst-color-primary` User Guide](/user-guide/core-user-guide/index.md) | [{fas}`file-code;pst-color-primary` API Reference](/reference/python/autogen_core/autogen_core.rst) | [{fab}`python;pst-color-primary` PyPI](https://pypi.org/project/autogen-core/0.4.0.dev1/) | [{fab}`github;pst-color-primary` Source](https://github.com/microsoft/autogen/tree/main/python/packages/autogen-core) ::: (pkg-info-autogen-ext)= @@ -63,7 +63,13 @@ Implementations of core components that interface with external services, or use pip install autogen-ext==0.4.0dev1 ``` -[{fas}`file-code;pst-color-primary` API Reference](/reference/python/autogen_ext/autogen_ext.rst) | [{fab}`github;pst-color-primary` Source](https://github.com/microsoft/autogen/tree/main/python/packages/autogen-ext) +Extras: + +- `langchain-tools` needed for {py:class}`~autogen_ext.tools.LangChainToolAdapter` +- `azure-code-executor` needed for {py:class}`~autogen_ext.code_executors.ACADynamicSessionsCodeExecutor` +- `docker-code-executor` needed for {py:class}`~autogen_ext.code_executors.DockerCommandLineCodeExecutor` + +[{fas}`circle-info;pst-color-primary` User Guide](/user-guide/extensions-user-guide/index.md) | [{fas}`file-code;pst-color-primary` API Reference](/reference/python/autogen_ext/autogen_ext.rst) | [{fab}`python;pst-color-primary` PyPI](https://pypi.org/project/autogen-ext/0.4.0.dev1/) | [{fab}`github;pst-color-primary` Source](https://github.com/microsoft/autogen/tree/main/python/packages/autogen-ext) ::: (pkg-info-autogen-magentic-one)= diff --git a/python/packages/autogen-core/docs/src/reference/index.md b/python/packages/autogen-core/docs/src/reference/index.md index 1aabce712c07..cd8513ef4050 100644 --- a/python/packages/autogen-core/docs/src/reference/index.md +++ b/python/packages/autogen-core/docs/src/reference/index.md @@ -26,8 +26,6 @@ python/autogen_core/autogen_core :caption: AutoGen Extensions python/autogen_ext/autogen_ext -python/autogen_ext/autogen_ext.tools -python/autogen_ext/autogen_ext.code_executor ``` ::::{grid} 1 2 2 3 diff --git a/python/packages/autogen-core/docs/src/user-guide/agentchat-user-guide/guides/code-execution.ipynb b/python/packages/autogen-core/docs/src/user-guide/agentchat-user-guide/guides/code-execution.ipynb index eb9841b2f685..a715728ae4c6 100644 --- a/python/packages/autogen-core/docs/src/user-guide/agentchat-user-guide/guides/code-execution.ipynb +++ b/python/packages/autogen-core/docs/src/user-guide/agentchat-user-guide/guides/code-execution.ipynb @@ -316,7 +316,7 @@ "from autogen_agentchat.agents import CodeExecutorAgent, CodingAssistantAgent\n", "from autogen_agentchat.teams import RoundRobinGroupChat, StopMessageTermination\n", "from autogen_core.components.models import OpenAIChatCompletionClient\n", - "from autogen_ext.code_executor.docker_executor import DockerCommandLineCodeExecutor\n", + "from autogen_ext.code_executors import DockerCommandLineCodeExecutor\n", "\n", "async with DockerCommandLineCodeExecutor(work_dir=\"coding\") as code_executor: # type: ignore[syntax]\n", " code_executor_agent = CodeExecutorAgent(\"code_executor\", code_executor=code_executor)\n", diff --git a/python/packages/autogen-core/docs/src/user-guide/agentchat-user-guide/stocksnippet.md 
b/python/packages/autogen-core/docs/src/user-guide/agentchat-user-guide/stocksnippet.md index 689aae889587..a4e827428302 100644 --- a/python/packages/autogen-core/docs/src/user-guide/agentchat-user-guide/stocksnippet.md +++ b/python/packages/autogen-core/docs/src/user-guide/agentchat-user-guide/stocksnippet.md @@ -8,7 +8,7 @@ from autogen_agentchat import EVENT_LOGGER_NAME from autogen_agentchat.agents import CodeExecutorAgent, CodingAssistantAgent from autogen_agentchat.logging import ConsoleLogHandler from autogen_agentchat.teams import RoundRobinGroupChat, StopMessageTermination -from autogen_ext.code_executor.docker_executor import DockerCommandLineCodeExecutor +from autogen_ext.code_executors import DockerCommandLineCodeExecutor from autogen_core.components.models import OpenAIChatCompletionClient logger = logging.getLogger(EVENT_LOGGER_NAME) diff --git a/python/packages/autogen-core/docs/src/user-guide/core-user-guide/cookbook/index.md b/python/packages/autogen-core/docs/src/user-guide/core-user-guide/cookbook/index.md index 9f7ecda1668a..fe66f2bff34b 100644 --- a/python/packages/autogen-core/docs/src/user-guide/core-user-guide/cookbook/index.md +++ b/python/packages/autogen-core/docs/src/user-guide/core-user-guide/cookbook/index.md @@ -17,6 +17,5 @@ llamaindex-agent local-llms-ollama-litellm instrumenting topic-subscription-scenarios -azure-container-code-executor structured-output-agent ``` diff --git a/python/packages/autogen-core/docs/src/user-guide/core-user-guide/cookbook/tool-use-with-intervention.ipynb b/python/packages/autogen-core/docs/src/user-guide/core-user-guide/cookbook/tool-use-with-intervention.ipynb index 84a45f84ee8f..44f9978e59ef 100644 --- a/python/packages/autogen-core/docs/src/user-guide/core-user-guide/cookbook/tool-use-with-intervention.ipynb +++ b/python/packages/autogen-core/docs/src/user-guide/core-user-guide/cookbook/tool-use-with-intervention.ipynb @@ -32,7 +32,7 @@ ")\n", "from autogen_core.components.tool_agent import ToolAgent, ToolException, tool_agent_caller_loop\n", "from autogen_core.components.tools import PythonCodeExecutionTool, ToolSchema\n", - "from autogen_ext.code_executor.docker_executor import DockerCommandLineCodeExecutor" + "from autogen_ext.code_executors import DockerCommandLineCodeExecutor" ] }, { diff --git a/python/packages/autogen-core/docs/src/user-guide/core-user-guide/framework/command-line-code-executors.ipynb b/python/packages/autogen-core/docs/src/user-guide/core-user-guide/framework/command-line-code-executors.ipynb index 8cdb529e2120..22244d97c8e9 100644 --- a/python/packages/autogen-core/docs/src/user-guide/core-user-guide/framework/command-line-code-executors.ipynb +++ b/python/packages/autogen-core/docs/src/user-guide/core-user-guide/framework/command-line-code-executors.ipynb @@ -51,7 +51,7 @@ "\n", "from autogen_core.base import CancellationToken\n", "from autogen_core.components.code_executor import CodeBlock\n", - "from autogen_ext.code_executor.docker_executor import DockerCommandLineCodeExecutor\n", + "from autogen_ext.code_executors import DockerCommandLineCodeExecutor\n", "\n", "work_dir = Path(\"coding\")\n", "work_dir.mkdir(exist_ok=True)\n", diff --git a/python/packages/autogen-core/docs/src/user-guide/core-user-guide/framework/tools.ipynb b/python/packages/autogen-core/docs/src/user-guide/core-user-guide/framework/tools.ipynb index bce62e45e7ce..c5d058189319 100644 --- a/python/packages/autogen-core/docs/src/user-guide/core-user-guide/framework/tools.ipynb +++ 
b/python/packages/autogen-core/docs/src/user-guide/core-user-guide/framework/tools.ipynb @@ -45,7 +45,7 @@ "source": [ "from autogen_core.base import CancellationToken\n", "from autogen_core.components.tools import PythonCodeExecutionTool\n", - "from autogen_ext.code_executor.docker_executor import DockerCommandLineCodeExecutor\n", + "from autogen_ext.code_executors import DockerCommandLineCodeExecutor\n", "\n", "# Create the tool.\n", "code_executor = DockerCommandLineCodeExecutor()\n", diff --git a/python/packages/autogen-core/docs/src/user-guide/core-user-guide/quickstart.ipynb b/python/packages/autogen-core/docs/src/user-guide/core-user-guide/quickstart.ipynb index 034504eb326a..c882455e3d1f 100644 --- a/python/packages/autogen-core/docs/src/user-guide/core-user-guide/quickstart.ipynb +++ b/python/packages/autogen-core/docs/src/user-guide/core-user-guide/quickstart.ipynb @@ -313,7 +313,7 @@ "\n", "from autogen_core.application import SingleThreadedAgentRuntime\n", "from autogen_core.components.models import OpenAIChatCompletionClient\n", - "from autogen_ext.code_executor.docker_executor import DockerCommandLineCodeExecutor\n", + "from autogen_ext.code_executors import DockerCommandLineCodeExecutor\n", "\n", "work_dir = tempfile.mkdtemp()\n", "\n", diff --git a/python/packages/autogen-core/docs/src/user-guide/core-user-guide/cookbook/azure-container-code-executor.ipynb b/python/packages/autogen-core/docs/src/user-guide/extensions-user-guide/azure-container-code-executor.ipynb similarity index 98% rename from python/packages/autogen-core/docs/src/user-guide/core-user-guide/cookbook/azure-container-code-executor.ipynb rename to python/packages/autogen-core/docs/src/user-guide/extensions-user-guide/azure-container-code-executor.ipynb index 166c3f745e19..98c6b848c051 100644 --- a/python/packages/autogen-core/docs/src/user-guide/core-user-guide/cookbook/azure-container-code-executor.ipynb +++ b/python/packages/autogen-core/docs/src/user-guide/extensions-user-guide/azure-container-code-executor.ipynb @@ -4,18 +4,15 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Azure Container Code Executor\n", + "# ACA Dynamic Sessions Code Executor\n", "\n", "This guide will explain the Azure Container Apps dynamic sessions in Azure Container Apps and show you how to use the Azure Container Code Executor class.\n", "\n", - "\n", - "## Azure Container Apps dynamic sessions\n", - "\n", "The [Azure Container Apps dynamic sessions](https://learn.microsoft.com/en-us/azure/container-apps/sessions) is a component in the Azure Container Apps service. The environment is hosted on remote Azure instances and will not execute any code locally. The interpreter is capable of executing python code in a jupyter environment with a pre-installed base of commonly used packages. [Custom environments](https://learn.microsoft.com/en-us/azure/container-apps/sessions-custom-container) can be created by users for their applications. Files can additionally be [uploaded to, or downloaded from](https://learn.microsoft.com/en-us/azure/container-apps/sessions-code-interpreter#upload-a-file-to-a-session) each session.\n", "\n", "The code interpreter can run multiple sessions of code, each of which are delineated by a session identifier string.\n", "\n", - "### Create a Container Apps Session Pool\n", + "## Create a Container Apps Session Pool\n", "\n", "In your Azure portal, create a new `Container App Session Pool` resource with the pool type set to `Python code interpreter` and note the `Pool management endpoint`. 
The format for the endpoint should be something like `https://{region}.dynamicsessions.io/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/sessionPools/{session_pool_name}`.\n", "\n", diff --git a/python/packages/autogen-core/docs/src/user-guide/extensions-user-guide/index.md b/python/packages/autogen-core/docs/src/user-guide/extensions-user-guide/index.md index 601b435382f5..6cf28e9b9dd7 100644 --- a/python/packages/autogen-core/docs/src/user-guide/extensions-user-guide/index.md +++ b/python/packages/autogen-core/docs/src/user-guide/extensions-user-guide/index.md @@ -7,6 +7,14 @@ myst: # Extensions +```{toctree} +:maxdepth: 3 +:hidden: + +azure-container-code-executor +``` + + Discover community projects: ::::{grid} 1 2 2 2 diff --git a/python/packages/autogen-core/pyproject.toml b/python/packages/autogen-core/pyproject.toml index 51a2cc216ad8..9cf50ef4e932 100644 --- a/python/packages/autogen-core/pyproject.toml +++ b/python/packages/autogen-core/pyproject.toml @@ -114,8 +114,7 @@ docs-clean = "rm -rf docs/build && rm -rf docs/src/reference/python/" docs-apidoc-core = "sphinx-apidoc --templatedir docs/src/_apidoc_templates --no-toc --separate --force -o docs/src/reference/python/autogen_core src/autogen_core src/autogen_core/application/protos/" docs-apidoc-agentchat = "sphinx-apidoc --templatedir docs/src/_apidoc_templates --no-toc --separate --force -o docs/src/reference/python/autogen_agentchat ../autogen-agentchat/src/autogen_agentchat" -# Includes --implicit-namespaces as it is a namespace package -docs-apidoc-ext = "sphinx-apidoc --implicit-namespaces --templatedir docs/src/_apidoc_templates --no-toc --separate --force -o docs/src/reference/python/autogen_ext ../autogen-ext/src/autogen_ext" +docs-apidoc-ext = "sphinx-apidoc --templatedir docs/src/_apidoc_templates --no-toc --separate --force -o docs/src/reference/python/autogen_ext ../autogen-ext/src/autogen_ext ../autogen-ext/src/autogen_ext/code_executor ../autogen-ext/src/autogen_ext/tools/langchain" docs-apidoc-all = [ "docs-apidoc-core", "docs-apidoc-agentchat", diff --git a/python/packages/autogen-core/samples/coding_pub_sub.py b/python/packages/autogen-core/samples/coding_pub_sub.py index 68090d556a6f..73138b790226 100644 --- a/python/packages/autogen-core/samples/coding_pub_sub.py +++ b/python/packages/autogen-core/samples/coding_pub_sub.py @@ -30,7 +30,7 @@ UserMessage, ) from autogen_core.components.tools import PythonCodeExecutionTool, Tool -from autogen_ext.code_executor.docker_executor import DockerCommandLineCodeExecutor +from autogen_ext.code_executors import DockerCommandLineCodeExecutor from common.utils import get_chat_completion_client_from_envs diff --git a/python/packages/autogen-core/src/autogen_core/components/code_executor/_impl/local_commandline_code_executor.py b/python/packages/autogen-core/src/autogen_core/components/code_executor/_impl/local_commandline_code_executor.py index 7721ef36739a..f74111ef1eaf 100644 --- a/python/packages/autogen-core/src/autogen_core/components/code_executor/_impl/local_commandline_code_executor.py +++ b/python/packages/autogen-core/src/autogen_core/components/code_executor/_impl/local_commandline_code_executor.py @@ -5,7 +5,7 @@ import logging import sys import warnings -from hashlib import md5 +from hashlib import sha256 from pathlib import Path from string import Template from typing import Any, Callable, ClassVar, List, Sequence, Union @@ -245,7 +245,7 @@ async def _execute_code_dont_check_setup( if filename is None: # create a file with an 
automatically generated name - code_hash = md5(code.encode()).hexdigest() + code_hash = sha256(code.encode()).hexdigest() filename = f"tmp_code_{code_hash}.{'py' if lang.startswith('python') else lang}" written_file = (self._work_dir / filename).resolve() diff --git a/python/packages/autogen-ext/pyproject.toml b/python/packages/autogen-ext/pyproject.toml index 9e31cee15cbb..717ad4003f57 100644 --- a/python/packages/autogen-ext/pyproject.toml +++ b/python/packages/autogen-ext/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ [project.optional-dependencies] -langchain-tools = ["langchain >= 0.3.1"] +langchain-tools = ["langchain_core~= 0.3.3"] azure-code-executor = ["azure-core"] docker-code-executor = ["docker~=7.0"] diff --git a/python/packages/autogen-ext/src/autogen_ext/__init__.py b/python/packages/autogen-ext/src/autogen_ext/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/python/packages/autogen-ext/src/autogen_ext/code_executor/aca_dynamic_sessions/__init__.py b/python/packages/autogen-ext/src/autogen_ext/code_executor/aca_dynamic_sessions/__init__.py index d61d2503426e..009997c41abc 100644 --- a/python/packages/autogen-ext/src/autogen_ext/code_executor/aca_dynamic_sessions/__init__.py +++ b/python/packages/autogen-ext/src/autogen_ext/code_executor/aca_dynamic_sessions/__init__.py @@ -1,4 +1,20 @@ -from ._azure_container_code_executor import AzureContainerCodeExecutor +import warnings +from typing import Any + +from ...code_executors import ACADynamicSessionsCodeExecutor + + +class AzureContainerCodeExecutor(ACADynamicSessionsCodeExecutor): + """AzureContainerCodeExecutor has been renamed and moved to autogen_ext.code_executors.ACADynamicSessionsCodeExecutor""" + + def __init__(self, *args: Any, **kwargs: Any) -> None: + warnings.warn( + "AzureContainerCodeExecutor has been renamed and moved to autogen_ext.code_executors.ACADynamicSessionsCodeExecutor", + DeprecationWarning, + stacklevel=2, + ) + super().__init__(*args, **kwargs) + __all__ = [ "AzureContainerCodeExecutor", diff --git a/python/packages/autogen-ext/src/autogen_ext/code_executor/docker_executor/__init__.py b/python/packages/autogen-ext/src/autogen_ext/code_executor/docker_executor/__init__.py index 23c859ffe63b..66719114300d 100644 --- a/python/packages/autogen-ext/src/autogen_ext/code_executor/docker_executor/__init__.py +++ b/python/packages/autogen-ext/src/autogen_ext/code_executor/docker_executor/__init__.py @@ -1,3 +1,11 @@ -from ._impl import DockerCommandLineCodeExecutor +import warnings + +from ...code_executors import DockerCommandLineCodeExecutor + +warnings.warn( + "DockerCommandLineCodeExecutor moved to autogen_ext.code_executors.DockerCommandLineCodeExecutor", + DeprecationWarning, + stacklevel=2, +) __all__ = ["DockerCommandLineCodeExecutor"] diff --git a/python/packages/autogen-ext/src/autogen_ext/code_executors/__init__.py b/python/packages/autogen-ext/src/autogen_ext/code_executors/__init__.py new file mode 100644 index 000000000000..ab7dc8a9b643 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/code_executors/__init__.py @@ -0,0 +1,4 @@ +from ._azure_container_code_executor import ACADynamicSessionsCodeExecutor, TokenProvider +from ._docker_code_executor import DockerCommandLineCodeExecutor + +__all__ = ["DockerCommandLineCodeExecutor", "TokenProvider", "ACADynamicSessionsCodeExecutor"] diff --git a/python/packages/autogen-ext/src/autogen_ext/code_executor/aca_dynamic_sessions/_azure_container_code_executor.py 
b/python/packages/autogen-ext/src/autogen_ext/code_executors/_azure_container_code_executor.py similarity index 97% rename from python/packages/autogen-ext/src/autogen_ext/code_executor/aca_dynamic_sessions/_azure_container_code_executor.py rename to python/packages/autogen-ext/src/autogen_ext/code_executors/_azure_container_code_executor.py index 451f2b8c88db..2852c4592a88 100644 --- a/python/packages/autogen-ext/src/autogen_ext/code_executor/aca_dynamic_sessions/_azure_container_code_executor.py +++ b/python/packages/autogen-ext/src/autogen_ext/code_executors/_azure_container_code_executor.py @@ -1,10 +1,12 @@ # Credit to original authors +from __future__ import annotations + import asyncio import os from pathlib import Path from string import Template -from typing import Any, Callable, ClassVar, List, Optional, Protocol, Sequence, Union +from typing import TYPE_CHECKING, Any, Callable, ClassVar, List, Optional, Protocol, Sequence, Union from uuid import uuid4 import aiohttp @@ -22,12 +24,14 @@ get_required_packages, to_stub, ) -from azure.core.credentials import AccessToken from typing_extensions import ParamSpec +if TYPE_CHECKING: + from azure.core.credentials import AccessToken + PYTHON_VARIANTS = ["python", "Python", "py"] -__all__ = ("AzureContainerCodeExecutor", "TokenProvider") +__all__ = ("ACADynamicSessionsCodeExecutor", "TokenProvider") A = ParamSpec("A") @@ -38,9 +42,14 @@ def get_token( ) -> AccessToken: ... -class AzureContainerCodeExecutor(CodeExecutor): +class ACADynamicSessionsCodeExecutor(CodeExecutor): """(Experimental) A code executor class that executes code through a an Azure - Container Apps instance. + Container Apps Dynamic Sessions instance. + + .. note:: + + This class requires the :code:`azure-code-executor` extra for the :code:`autogen-ext` package. + **This will execute LLM generated code on an Azure dynamic code container.** diff --git a/python/packages/autogen-ext/src/autogen_ext/code_executor/docker_executor/_impl.py b/python/packages/autogen-ext/src/autogen_ext/code_executors/_docker_code_executor.py similarity index 90% rename from python/packages/autogen-ext/src/autogen_ext/code_executor/docker_executor/_impl.py rename to python/packages/autogen-ext/src/autogen_ext/code_executors/_docker_code_executor.py index 7b3a194e6f6a..0ea8c3157943 100644 --- a/python/packages/autogen-ext/src/autogen_ext/code_executor/docker_executor/_impl.py +++ b/python/packages/autogen-ext/src/autogen_ext/code_executors/_docker_code_executor.py @@ -9,15 +9,11 @@ import sys import uuid from collections.abc import Sequence -from hashlib import md5 +from hashlib import sha256 from pathlib import Path from types import TracebackType from typing import Any, Callable, ClassVar, List, Optional, ParamSpec, Type, Union -import asyncio_atexit -import docker -import docker.models -import docker.models.containers from autogen_core.base import CancellationToken from autogen_core.components.code_executor import ( CodeBlock, @@ -30,7 +26,6 @@ lang_to_cmd, silence_pip, ) -from docker.errors import ImageNotFound, NotFound if sys.version_info >= (3, 11): from typing import Self @@ -55,6 +50,11 @@ async def _wait_for_ready(container: Any, timeout: int = 60, stop_time: float = class DockerCommandLineCodeExecutor(CodeExecutor): """Executes code through a command line environment in a Docker container. + .. note:: + + This class requires the :code:`docker-code-executor` extra for the :code:`autogen-ext` package. 
+ + The executor first saves each code block in a file in the working directory, and then executes the code file in the container. The executor executes the code blocks in the order they are received. @@ -156,7 +156,14 @@ def __init__( else: self._setup_functions_complete = True - self._container: docker.models.containers.Container | None = None + try: + from docker.models.containers import Container + except ImportError as e: + raise RuntimeError( + "Missing dependencies for DockerCommandLineCodeExecutor. Please ensure the autogen-ext package was installed with the 'docker-code-executor' extra." + ) from e + + self._container: Container | None = None self._running = False @property @@ -232,7 +239,7 @@ async def _execute_code_dont_check_setup( break if not filename: - filename = f"tmp_code_{md5(code.encode()).hexdigest()}.{lang}" + filename = f"tmp_code_{sha256(code.encode()).hexdigest()}.{lang}" code_path = self._work_dir / filename with code_path.open("w", encoding="utf-8") as fout: @@ -293,6 +300,14 @@ async def stop(self) -> None: if not self._running: return + try: + import docker + from docker.errors import NotFound + except ImportError as e: + raise RuntimeError( + "Missing dependencies for DockerCommandLineCodeExecutor. Please ensure the autogen-ext package was installed with the 'docker-code-executor' extra." + ) from e + + client = docker.from_env() try: container = await asyncio.to_thread(client.containers.get, self.container_name) @@ -303,6 +318,15 @@ async def stop(self) -> None: self._running = False async def start(self) -> None: + try: + import asyncio_atexit + import docker + from docker.errors import ImageNotFound + except ImportError as e: + raise RuntimeError( + "Missing dependencies for DockerCommandLineCodeExecutor. Please ensure the autogen-ext package was installed with the 'docker-code-executor' extra."
+ ) from e + # Start a container from the image, read to exec commands later client = docker.from_env() diff --git a/python/packages/autogen-ext/src/autogen_ext/tools/__init__.py b/python/packages/autogen-ext/src/autogen_ext/tools/__init__.py new file mode 100644 index 000000000000..03af9585bfd5 --- /dev/null +++ b/python/packages/autogen-ext/src/autogen_ext/tools/__init__.py @@ -0,0 +1,3 @@ +from ._langchain_adapter import LangChainToolAdapter + +__all__ = ["LangChainToolAdapter"] diff --git a/python/packages/autogen-ext/src/autogen_ext/tools/langchain/_langchain_adapter.py b/python/packages/autogen-ext/src/autogen_ext/tools/_langchain_adapter.py similarity index 62% rename from python/packages/autogen-ext/src/autogen_ext/tools/langchain/_langchain_adapter.py rename to python/packages/autogen-ext/src/autogen_ext/tools/_langchain_adapter.py index 1d7faf19cb4a..60bdb69b1741 100644 --- a/python/packages/autogen-ext/src/autogen_ext/tools/langchain/_langchain_adapter.py +++ b/python/packages/autogen-ext/src/autogen_ext/tools/_langchain_adapter.py @@ -1,46 +1,53 @@ +from __future__ import annotations + import asyncio import inspect -from typing import Any, Callable, Dict, Tuple, Type, cast +from typing import TYPE_CHECKING, Any, Callable, Dict, Type, cast from autogen_core.base import CancellationToken from autogen_core.components.tools import BaseTool from pydantic import BaseModel, Field, create_model -from pydantic.fields import FieldInfo - -from langchain.tools import Tool as LangChainTool -FieldDefinition = Tuple[Type[Any], FieldInfo] -FieldsDict = Dict[str, FieldDefinition] +if TYPE_CHECKING: + from langchain_core.tools import Tool as LangChainTool class LangChainToolAdapter(BaseTool[BaseModel, Any]): - langchain_tool: LangChainTool - _callable: Callable[..., Any] + """Allows you to wrap a LangChain tool and make it available to AutoGen. + + .. note:: + + This class requires the :code:`docker-code-executor` extra for the :code:`autogen-ext` package. + + + Args: + langchain_tool (LangChainTool): A LangChain tool to wrap + """ def __init__(self, langchain_tool: LangChainTool): - self.langchain_tool = langchain_tool + self._langchain_tool: LangChainTool = langchain_tool # Extract name and description - name = langchain_tool.name - description = langchain_tool.description or "" + name = self._langchain_tool.name + description = self._langchain_tool.description or "" # Determine the callable method - if hasattr(langchain_tool, "func") and callable(langchain_tool.func): - assert langchain_tool.func is not None - self._callable = langchain_tool.func - elif hasattr(langchain_tool, "_run") and callable(langchain_tool._run): # pyright: ignore - self._callable = langchain_tool._run # type: ignore + if hasattr(self._langchain_tool, "func") and callable(self._langchain_tool.func): + assert self._langchain_tool.func is not None + self._callable: Callable[..., Any] = self._langchain_tool.func + elif hasattr(self._langchain_tool, "_run") and callable(self._langchain_tool._run): # pyright: ignore + self._callable: Callable[..., Any] = self._langchain_tool._run # type: ignore else: raise AttributeError( f"The provided LangChain tool '{name}' does not have a callable 'func' or '_run' method." 
) # Determine args_type - if langchain_tool.args_schema: # pyright: ignore - args_type = langchain_tool.args_schema # pyright: ignore + if self._langchain_tool.args_schema: # pyright: ignore + args_type = self._langchain_tool.args_schema # pyright: ignore else: # Infer args_type from the callable's signature - sig = inspect.signature(cast(Callable[..., Any], self._callable)) + sig = inspect.signature(cast(Callable[..., Any], self._callable)) # type: ignore fields = { k: (v.annotation, Field(...)) for k, v in sig.parameters.items() diff --git a/python/packages/autogen-ext/src/autogen_ext/tools/langchain/__init__.py b/python/packages/autogen-ext/src/autogen_ext/tools/langchain/__init__.py index 03af9585bfd5..4d401fc7ef1f 100644 --- a/python/packages/autogen-ext/src/autogen_ext/tools/langchain/__init__.py +++ b/python/packages/autogen-ext/src/autogen_ext/tools/langchain/__init__.py @@ -1,3 +1,7 @@ -from ._langchain_adapter import LangChainToolAdapter +import warnings + +from ...tools import LangChainToolAdapter + +warnings.warn("LangChainToolAdapter moved to autogen_ext.tools.LangChainToolAdapter", DeprecationWarning, stacklevel=2) __all__ = ["LangChainToolAdapter"] diff --git a/python/packages/autogen-ext/tests/code_executors/test_aca_dynamic_sessions.py b/python/packages/autogen-ext/tests/code_executors/test_aca_dynamic_sessions.py index 9d5d477b14d6..40eb07213006 100644 --- a/python/packages/autogen-ext/tests/code_executors/test_aca_dynamic_sessions.py +++ b/python/packages/autogen-ext/tests/code_executors/test_aca_dynamic_sessions.py @@ -10,7 +10,7 @@ from anyio import open_file from autogen_core.base import CancellationToken from autogen_core.components.code_executor import CodeBlock -from autogen_ext.code_executor.aca_dynamic_sessions import AzureContainerCodeExecutor +from autogen_ext.code_executors import ACADynamicSessionsCodeExecutor from azure.identity import DefaultAzureCredential UNIX_SHELLS = ["bash", "sh", "shell"] @@ -30,7 +30,9 @@ async def test_execute_code() -> None: assert POOL_ENDPOINT is not None cancellation_token = CancellationToken() - executor = AzureContainerCodeExecutor(pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential()) + executor = ACADynamicSessionsCodeExecutor( + pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential() + ) # Test single code block. 
code_blocks = [CodeBlock(code="import sys; print('hello world!')", language="python")] @@ -67,7 +69,7 @@ async def test_execute_code() -> None: async def test_azure_container_code_executor_timeout() -> None: assert POOL_ENDPOINT is not None cancellation_token = CancellationToken() - executor = AzureContainerCodeExecutor( + executor = ACADynamicSessionsCodeExecutor( pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential(), timeout=1 ) code_blocks = [CodeBlock(code="import time; time.sleep(10); print('hello world!')", language="python")] @@ -83,7 +85,9 @@ async def test_azure_container_code_executor_timeout() -> None: async def test_azure_container_code_executor_cancellation() -> None: assert POOL_ENDPOINT is not None cancellation_token = CancellationToken() - executor = AzureContainerCodeExecutor(pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential()) + executor = ACADynamicSessionsCodeExecutor( + pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential() + ) code_blocks = [CodeBlock(code="import time; time.sleep(10); print('hello world!')", language="python")] coro = executor.execute_code_blocks(code_blocks, cancellation_token) @@ -109,7 +113,7 @@ async def test_upload_files() -> None: cancellation_token = CancellationToken() with tempfile.TemporaryDirectory() as temp_dir: - executor = AzureContainerCodeExecutor( + executor = ACADynamicSessionsCodeExecutor( pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential(), work_dir=temp_dir ) @@ -155,7 +159,7 @@ async def test_download_files() -> None: cancellation_token = CancellationToken() with tempfile.TemporaryDirectory() as temp_dir: - executor = AzureContainerCodeExecutor( + executor = ACADynamicSessionsCodeExecutor( pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential(), work_dir=temp_dir ) diff --git a/python/packages/autogen-ext/tests/code_executors/test_aca_user_defined_functions.py b/python/packages/autogen-ext/tests/code_executors/test_aca_user_defined_functions.py index 7b74109b6ea2..857a0333a32b 100644 --- a/python/packages/autogen-ext/tests/code_executors/test_aca_user_defined_functions.py +++ b/python/packages/autogen-ext/tests/code_executors/test_aca_user_defined_functions.py @@ -11,7 +11,7 @@ FunctionWithRequirements, with_requirements, ) -from autogen_ext.code_executor.aca_dynamic_sessions import AzureContainerCodeExecutor +from autogen_ext.code_executors import ACADynamicSessionsCodeExecutor from azure.identity import DefaultAzureCredential ENVIRON_KEY_AZURE_POOL_ENDPOINT = "AZURE_POOL_ENDPOINT" @@ -58,10 +58,10 @@ def function_incorrect_dep() -> "polars.DataFrame": async def test_azure_can_load_function_with_reqs() -> None: assert POOL_ENDPOINT is not None cancellation_token = CancellationToken() - azure_executor = AzureContainerCodeExecutor( + azure_executor = ACADynamicSessionsCodeExecutor( pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential(), functions=[load_data] ) - # AzureContainerCodeExecutor doesn't use the functions module import + # ACADynamicSessionsCodeExecutor doesn't use the functions module import code = """import polars # Get first row's name @@ -87,10 +87,10 @@ async def test_azure_can_load_function() -> None: assert POOL_ENDPOINT is not None cancellation_token = CancellationToken() - azure_executor = AzureContainerCodeExecutor( + azure_executor = ACADynamicSessionsCodeExecutor( pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential(), functions=[add_two_numbers] ) - # 
AzureContainerCodeExecutor doesn't use the functions module import + # ACADynamicSessionsCodeExecutor doesn't use the functions module import code = """print(add_two_numbers(1, 2))""" azure_result = await azure_executor.execute_code_blocks( @@ -111,7 +111,7 @@ async def test_azure_can_load_function() -> None: async def test_azure_fails_for_function_incorrect_import() -> None: assert POOL_ENDPOINT is not None cancellation_token = CancellationToken() - azure_executor = AzureContainerCodeExecutor( + azure_executor = ACADynamicSessionsCodeExecutor( pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential(), functions=[function_incorrect_import], @@ -135,7 +135,7 @@ async def test_azure_fails_for_function_incorrect_import() -> None: async def test_azure_fails_for_function_incorrect_dep() -> None: assert POOL_ENDPOINT is not None cancellation_token = CancellationToken() - azure_executor = AzureContainerCodeExecutor( + azure_executor = ACADynamicSessionsCodeExecutor( pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential(), functions=[function_incorrect_dep] ) code = """function_incorrect_dep()""" @@ -153,7 +153,7 @@ def test_azure_formatted_prompt() -> None: assert_str = '''def add_two_numbers(a: int, b: int) -> int: """Add two numbers together.""" ''' - azure_executor = AzureContainerCodeExecutor( + azure_executor = ACADynamicSessionsCodeExecutor( pool_management_endpoint=DUMMY_POOL_ENDPOINT, credential=DefaultAzureCredential(), functions=[add_two_numbers] ) @@ -174,7 +174,7 @@ def add_two_numbers(a: int, b: int) -> int: """Add two numbers together.""" ''' - azure_executor = AzureContainerCodeExecutor( + azure_executor = ACADynamicSessionsCodeExecutor( pool_management_endpoint=DUMMY_POOL_ENDPOINT, credential=DefaultAzureCredential(), functions=[func] ) @@ -197,7 +197,7 @@ def add_two_numbers(a: int, b: int) -> int: return a + b ''' ) - azure_executor = AzureContainerCodeExecutor( + azure_executor = ACADynamicSessionsCodeExecutor( pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential(), functions=[func] ) code = """print(add_two_numbers(1, 2))""" @@ -228,7 +228,7 @@ def add_two_numbers(a: int, b: int) -> int: ''' ) - azure_executor = AzureContainerCodeExecutor( + azure_executor = ACADynamicSessionsCodeExecutor( pool_management_endpoint=POOL_ENDPOINT, credential=DefaultAzureCredential(), functions=[func] ) code = """print(add_two_numbers(object(), False))""" diff --git a/python/packages/autogen-ext/tests/code_executors/test_docker_commandline_code_executor.py b/python/packages/autogen-ext/tests/code_executors/test_docker_commandline_code_executor.py index 641e5e703316..25c3161157cd 100644 --- a/python/packages/autogen-ext/tests/code_executors/test_docker_commandline_code_executor.py +++ b/python/packages/autogen-ext/tests/code_executors/test_docker_commandline_code_executor.py @@ -10,7 +10,7 @@ from aiofiles import open from autogen_core.base import CancellationToken from autogen_core.components.code_executor import CodeBlock -from autogen_ext.code_executor.docker_executor import DockerCommandLineCodeExecutor +from autogen_ext.code_executors import DockerCommandLineCodeExecutor def docker_tests_enabled() -> bool: diff --git a/python/packages/autogen-ext/tests/test_example.py b/python/packages/autogen-ext/tests/test_example.py deleted file mode 100644 index 60f1465c8ffb..000000000000 --- a/python/packages/autogen-ext/tests/test_example.py +++ /dev/null @@ -1,2 +0,0 @@ -async def test_example() -> None: - assert True diff --git 
a/python/packages/autogen-ext/tests/test_tools.py b/python/packages/autogen-ext/tests/test_tools.py index e824576b41e5..c79d25af3d59 100644 --- a/python/packages/autogen-ext/tests/test_tools.py +++ b/python/packages/autogen-ext/tests/test_tools.py @@ -2,10 +2,10 @@ import pytest from autogen_core.base import CancellationToken -from autogen_ext.tools.langchain import LangChainToolAdapter # type: ignore -from langchain.tools import BaseTool as LangChainTool # type: ignore -from langchain.tools import tool # pyright: ignore +from autogen_ext.tools import LangChainToolAdapter # type: ignore from langchain_core.callbacks.manager import AsyncCallbackManagerForToolRun, CallbackManagerForToolRun +from langchain_core.tools import BaseTool as LangChainTool +from langchain_core.tools import tool # pyright: ignore from pydantic import BaseModel, Field @@ -46,7 +46,7 @@ async def test_langchain_tool_adapter() -> None: langchain_tool = add # type: ignore # Create an adapter - adapter = LangChainToolAdapter(langchain_tool) # pyright: ignore + adapter = LangChainToolAdapter(langchain_tool) # type: ignore # Test schema generation schema = adapter.schema @@ -75,7 +75,7 @@ async def test_langchain_tool_adapter() -> None: # Test CustomCalculatorTool custom_langchain_tool = CustomCalculatorTool() - custom_adapter = LangChainToolAdapter(custom_langchain_tool) # pyright: ignore + custom_adapter = LangChainToolAdapter(custom_langchain_tool) # type: ignore # Test schema generation for CustomCalculatorTool custom_schema = custom_adapter.schema diff --git a/python/packages/autogen-magentic-one/examples/README.md b/python/packages/autogen-magentic-one/examples/README.md new file mode 100644 index 000000000000..21ffcd768548 --- /dev/null +++ b/python/packages/autogen-magentic-one/examples/README.md @@ -0,0 +1,19 @@ +# Examples of Magentic-One + +**Note**: The examples in this folder are run at your own risk. They involve agents navigating the web, executing code, and browsing local files. Please supervise the execution of the agents to reduce any risks. We also recommend running the examples in a Docker environment. + + +We include various examples of using Magentic-One and its agents: + +- [example.py](example.py): A human-in-the-loop example of Magentic-One trying to solve a task specified by user input. If you wish for the team to execute the task without involving the user, remove user_proxy from the Orchestrator agents list. + +- [example_coder.py](example_coder.py): An example of the Coder and Executor agents in Magentic-One -- without the Magentic-One orchestrator. In a loop, specified by using the RoundRobinOrchestrator, the coder will write code based on user input, the executor will run the code, and then the user is asked for input again. + +- [example_file_surfer.py](example_file_surfer.py): An example of the FileSurfer agent on its own. In a loop, specified by using the RoundRobinOrchestrator, the file surfer will respond to user input and then the user is asked for input again. + +- [example_userproxy.py](example_userproxy.py): An example of the Coder agent in Magentic-One. Compared to [example_coder.py](example_coder.py), this example is just meant to show how to interact with the Coder agent, which serves as a general-purpose assistant without tools. In a loop, specified by using the RoundRobinOrchestrator, the coder will respond to user input and then the user is asked for input again.
+ +- [example_websurfer.py](example_websurfer.py): An example of the MultimodalWebSurfer agent in Magentic-One -- without the orchestrator. To view the browser the agent uses, pass the argument 'headless = False' to 'actual_surfer.init'. In a loop, specified by using the RoundRobinOrchestrator, the web surfer will perform a single action on the browser in response to user input and then the user is asked for input again. + + +Running these examples is simple. First make sure you have installed 'autogen-magentic-one' either from source or from pip, then run 'python example.py'. diff --git a/python/packages/autogen-magentic-one/examples/example.py b/python/packages/autogen-magentic-one/examples/example.py new file mode 100644 index 000000000000..f6fd5a284d06 --- /dev/null +++ b/python/packages/autogen-magentic-one/examples/example.py @@ -0,0 +1,97 @@ +"""This example demonstrates MagenticOne performing a task given by the user and returning a final answer.""" + +import asyncio +import logging +import os + +from autogen_core.application import SingleThreadedAgentRuntime +from autogen_core.application.logging import EVENT_LOGGER_NAME +from autogen_core.base import AgentId, AgentProxy +from autogen_core.components.code_executor import CodeBlock +from autogen_ext.code_executors import DockerCommandLineCodeExecutor +from autogen_magentic_one.agents.coder import Coder, Executor +from autogen_magentic_one.agents.file_surfer import FileSurfer +from autogen_magentic_one.agents.multimodal_web_surfer import MultimodalWebSurfer +from autogen_magentic_one.agents.orchestrator import LedgerOrchestrator +from autogen_magentic_one.agents.user_proxy import UserProxy +from autogen_magentic_one.messages import RequestReplyMessage +from autogen_magentic_one.utils import LogHandler, create_completion_client_from_env + +# NOTE: Don't forget to 'playwright install --with-deps chromium' + + +async def confirm_code(code: CodeBlock) -> bool: + response = await asyncio.to_thread( + input, + f"Executor is about to execute code (lang: {code.language}):\n{code.code}\n\nDo you want to proceed? (yes/no): ", + ) + return response.lower() == "yes" + + +async def main() -> None: + # Create the runtime. + runtime = SingleThreadedAgentRuntime() + + # Create an appropriate client + client = create_completion_client_from_env(model="gpt-4o") + + async with DockerCommandLineCodeExecutor() as code_executor: + # Register agents. + await Coder.register(runtime, "Coder", lambda: Coder(model_client=client)) + coder = AgentProxy(AgentId("Coder", "default"), runtime) + + await Executor.register( + runtime, + "Executor", + lambda: Executor("An agent for executing code", executor=code_executor, confirm_execution=confirm_code), + ) + executor = AgentProxy(AgentId("Executor", "default"), runtime) + + # Register agents.
+ await MultimodalWebSurfer.register(runtime, "WebSurfer", MultimodalWebSurfer) + web_surfer = AgentProxy(AgentId("WebSurfer", "default"), runtime) + + await FileSurfer.register(runtime, "file_surfer", lambda: FileSurfer(model_client=client)) + file_surfer = AgentProxy(AgentId("file_surfer", "default"), runtime) + + await UserProxy.register( + runtime, + "UserProxy", + lambda: UserProxy(description="The current user interacting with you."), + ) + user_proxy = AgentProxy(AgentId("UserProxy", "default"), runtime) + + await LedgerOrchestrator.register( + runtime, + "Orchestrator", + lambda: LedgerOrchestrator( + agents=[web_surfer, user_proxy, coder, executor, file_surfer], + model_client=client, + max_rounds=30, + max_time=25 * 60, + return_final_answer=True, + ), + ) + # orchestrator = AgentProxy(AgentId("Orchestrator", "default"), runtime) + + runtime.start() + + actual_surfer = await runtime.try_get_underlying_agent_instance(web_surfer.id, type=MultimodalWebSurfer) + await actual_surfer.init( + model_client=client, + downloads_folder=os.getcwd(), + start_page="https://www.bing.com", + browser_channel="chromium", + headless=True, + ) + + await runtime.send_message(RequestReplyMessage(), user_proxy.id) + await runtime.stop_when_idle() + + +if __name__ == "__main__": + logger = logging.getLogger(EVENT_LOGGER_NAME) + logger.setLevel(logging.INFO) + log_handler = LogHandler() + logger.handlers = [log_handler] + asyncio.run(main()) diff --git a/python/packages/autogen-magentic-one/examples/example_coder.py b/python/packages/autogen-magentic-one/examples/example_coder.py index b1978a13d2eb..4824f7338fdf 100644 --- a/python/packages/autogen-magentic-one/examples/example_coder.py +++ b/python/packages/autogen-magentic-one/examples/example_coder.py @@ -11,7 +11,7 @@ from autogen_core.application.logging import EVENT_LOGGER_NAME from autogen_core.base import AgentId, AgentProxy from autogen_core.components.code_executor import CodeBlock -from autogen_ext.code_executor.docker_executor import DockerCommandLineCodeExecutor +from autogen_ext.code_executors import DockerCommandLineCodeExecutor from autogen_magentic_one.agents.coder import Coder, Executor from autogen_magentic_one.agents.orchestrator import RoundRobinOrchestrator from autogen_magentic_one.agents.user_proxy import UserProxy diff --git a/python/packages/autogen-magentic-one/readme.md b/python/packages/autogen-magentic-one/readme.md index 24301f30e4f4..3a092cd4fb7d 100644 --- a/python/packages/autogen-magentic-one/readme.md +++ b/python/packages/autogen-magentic-one/readme.md @@ -179,8 +179,10 @@ pip install -e . 2. Configure the environment variables for the chat completion client. See instructions below. 3. Now you can run the example code to see how the agents work together to accomplish a task. +**NOTE:** The example code may download files from the internet, execute code, and interact with web pages. Ensure you are in a safe environment before running the example code. 
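As a quick sanity check before launching the full Magentic-One example above, the relocated `autogen_ext.code_executors` module can be exercised on its own. The following is a minimal sketch, not part of this PR: it assumes the `docker-code-executor` extra is installed and a Docker daemon is running, and the `main` wrapper and printed snippet are purely illustrative; the classes and the `execute_code_blocks` call are the ones used elsewhere in this change.

```python
import asyncio

from autogen_core.base import CancellationToken
from autogen_core.components.code_executor import CodeBlock
from autogen_ext.code_executors import DockerCommandLineCodeExecutor


async def main() -> None:
    # The async context manager starts the container and tears it down on exit.
    async with DockerCommandLineCodeExecutor() as executor:
        result = await executor.execute_code_blocks(
            [CodeBlock(code="print('hello from the container')", language="python")],
            CancellationToken(),
        )
        # result.output holds the captured output of the executed block.
        print(result.output)


asyncio.run(main())
```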
+ + ```bash -python examples/example_websurfer.py +python examples/example.py ``` diff --git a/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/base_orchestrator.py b/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/base_orchestrator.py index 762bf3d7b5a3..75a5c2b76e1c 100644 --- a/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/base_orchestrator.py +++ b/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/base_orchestrator.py @@ -12,6 +12,8 @@ class BaseOrchestrator(MagenticOneBaseAgent): + """Base class for orchestrators that manage a group of agents.""" + def __init__( self, agents: List[AgentProxy], diff --git a/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/coder.py b/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/coder.py index ea83ae0a9155..7106932514f8 100644 --- a/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/coder.py +++ b/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/coder.py @@ -17,7 +17,7 @@ @default_subscription class Coder(BaseWorker): - """An agent that uses tools to write, execute, and debug Python code.""" + """An agent that can write code or text to solve tasks without additional tools.""" DEFAULT_DESCRIPTION = "A helpful and general-purpose AI assistant that has strong language skills, Python skills, and Linux command line skills." diff --git a/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/multimodal_web_surfer/multimodal_web_surfer.py b/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/multimodal_web_surfer/multimodal_web_surfer.py index 1e6953bb3bad..fee6b968d9c5 100644 --- a/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/multimodal_web_surfer/multimodal_web_surfer.py +++ b/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/multimodal_web_surfer/multimodal_web_surfer.py @@ -373,7 +373,7 @@ async def _execute_tool( # Handle metadata page_metadata = json.dumps(await self._get_page_metadata(), indent=4) - metadata_hash = hashlib.md5(page_metadata.encode("utf-8")).hexdigest() + metadata_hash = hashlib.sha256(page_metadata.encode("utf-8")).hexdigest() if metadata_hash != self._prior_metadata_hash: page_metadata = ( "\nThe following metadata was extracted from the webpage:\n\n" + page_metadata.strip() + "\n" @@ -413,7 +413,7 @@ async def _execute_tool( async def __generate_reply(self, cancellation_token: CancellationToken) -> Tuple[bool, UserContent]: assert self._page is not None - """Generates the actual reply.""" + """Generates the actual reply. First calls the LLM to figure out which tool to use, then executes the tool.""" # Clone the messages to give context, removing old screenshots history: List[LLMMessage] = [] @@ -449,8 +449,8 @@ async def __generate_reply(self, cancellation_token: CancellationToken) -> Tuple TOOL_SLEEP, ] - # # Can we reach Bing to search? - # if self._navigation_allow_list("https://www.bing.com/"): + # Can we reach Bing to search?
+ # if self._navigation_allow_list("https://www.bing.com/"): tools.append(TOOL_WEB_SEARCH) # We can scroll up diff --git a/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/orchestrator.py b/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/orchestrator.py index 13c3e6b3e160..986ab3a607ee 100644 --- a/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/orchestrator.py +++ b/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/orchestrator.py @@ -15,6 +15,7 @@ from .base_orchestrator import BaseOrchestrator from .orchestrator_prompts import ( ORCHESTRATOR_CLOSED_BOOK_PROMPT, + ORCHESTRATOR_GET_FINAL_ANSWER, ORCHESTRATOR_LEDGER_PROMPT, ORCHESTRATOR_PLAN_PROMPT, ORCHESTRATOR_SYNTHESIZE_PROMPT, @@ -26,6 +27,8 @@ @default_subscription class RoundRobinOrchestrator(BaseOrchestrator): + """A simple orchestrator that selects agents in a round-robin fashion.""" + def __init__( self, agents: List[AgentProxy], @@ -41,6 +44,9 @@ async def _select_next_agent(self, message: LLMMessage) -> AgentProxy: @default_subscription class LedgerOrchestrator(BaseOrchestrator): + """The LedgerOrchestrator is the orchestrator used by MagenticOne to solve tasks. + It uses a ledger (implemented as a JSON object generated by the LLM) to keep track of task progress and select the next agent that should speak.""" + DEFAULT_SYSTEM_MESSAGES = [ SystemMessage(ORCHESTRATOR_SYSTEM_MESSAGE), ] @@ -61,6 +67,7 @@ def __init__( max_time: float = float("inf"), max_stalls_before_replan: int = 3, max_replans: int = 3, + return_final_answer: bool = False, ) -> None: super().__init__(agents=agents, description=description, max_rounds=max_rounds, max_time=max_time) @@ -81,6 +88,7 @@ def __init__( self._stall_counter = 0 self._max_replans = max_replans self._replan_counter = 0 + self._return_final_answer = return_final_answer self._team_description = "" self._task = "" @@ -106,6 +114,7 @@ def _get_update_plan_prompt(self, team: str) -> str: return self._update_plan_prompt.format(team=team) async def _get_team_description(self) -> str: + # a single string description of all agents in the team team_description = "" for agent in self._agents: metadata = await agent.metadata @@ -129,6 +138,7 @@ def _get_message_str(self, message: LLMMessage) -> str: return result async def _initialize_task(self, task: str, cancellation_token: Optional[CancellationToken] = None) -> None: + # called the first time a task is received self._task = task self._team_description = await self._get_team_description() @@ -163,6 +173,8 @@ async def _initialize_task(self, task: str, cancellation_token: Optional[Cancell # At this point, the planning conversation is dropped. async def _update_facts_and_plan(self, cancellation_token: Optional[CancellationToken] = None) -> None: + # called when the orchestrator decides to replan + # Shallow-copy the conversation planning_conversation = [m for m in self._chat_history] @@ -189,9 +201,8 @@ async def _update_facts_and_plan(self, cancellation_token: Optional[Cancellation assert isinstance(response.content, str) self._plan = response.content - # At this point, the planning conversation is dropped.
- async def update_ledger(self, cancellation_token: Optional[CancellationToken] = None) -> Dict[str, Any]: + # updates the ledger at each turn max_json_retries = 10 team_description = await self._get_team_description() @@ -200,6 +211,7 @@ async def update_ledger(self, cancellation_token: Optional[CancellationToken] = ledger_user_messages: List[LLMMessage] = [UserMessage(content=ledger_prompt, source=self.metadata["type"])] + # retries in case the LLM does not return a valid JSON assert max_json_retries > 0 for _ in range(max_json_retries): ledger_response = await self._model_client.create( @@ -249,6 +261,20 @@ async def update_ledger(self, cancellation_token: Optional[CancellationToken] = raise ValueError("Failed to parse ledger information after multiple retries.") + async def _prepare_final_answer(self, cancellation_token: Optional[CancellationToken] = None) -> str: + # called when the task is complete + + final_message = UserMessage( + content=ORCHESTRATOR_GET_FINAL_ANSWER.format(task=self._task), source=self.metadata["type"] + ) + response = await self._model_client.create( + self._system_messages + self._chat_history + [final_message], cancellation_token=cancellation_token + ) + + assert isinstance(response.content, str) + + return response.content + async def _handle_broadcast(self, message: BroadcastMessage, ctx: MessageContext) -> None: self._chat_history.append(message.content) await super()._handle_broadcast(message, ctx) @@ -256,6 +282,7 @@ async def _handle_broadcast(self, message: BroadcastMessage, ctx: MessageContext async def _select_next_agent( self, message: LLMMessage, cancellation_token: Optional[CancellationToken] = None ) -> Optional[AgentProxy]: + # the main orchestrator loop # Check if the task is still unset, in which case this message contains the task string if len(self._task) == 0: await self._initialize_task(self._get_message_str(message), cancellation_token) @@ -310,6 +337,15 @@ async def _select_next_agent( "Request satisfied.", ) ) + if self._return_final_answer: + # generate a final message to summarize the conversation + final_answer = await self._prepare_final_answer(cancellation_token) + self.logger.info( + OrchestrationEvent( + f"{self.metadata['type']} (final answer)", + f"\n{final_answer}", + ) + ) return None # Stalled or stuck in a loop @@ -373,6 +409,7 @@ async def _select_next_agent( # If we goit this far, we were not starting, done, or stuck next_agent_name = ledger_dict["next_speaker"]["answer"] + # find the agent with the next agent name for agent in self._agents: if (await agent.metadata)["type"] == next_agent_name: # broadcast a new message diff --git a/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/orchestrator_prompts.py b/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/orchestrator_prompts.py index 24af2501a10d..b7c1686c1bee 100644 --- a/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/orchestrator_prompts.py +++ b/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/orchestrator_prompts.py @@ -112,3 +112,15 @@ {team} """ + +ORCHESTRATOR_GET_FINAL_ANSWER = """ +We are working on the following task: +{task} + +We have completed the task. + +The above messages contain the conversation that took place to complete the task. + +Based on the information gathered, provide the final answer to the original request. +The answer should be phrased as if you were speaking to the user. 
+""" diff --git a/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/user_proxy.py b/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/user_proxy.py index b4cb4a0d5e07..67ef2887b4b1 100755 --- a/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/user_proxy.py +++ b/python/packages/autogen-magentic-one/src/autogen_magentic_one/agents/user_proxy.py @@ -10,7 +10,7 @@ @default_subscription class UserProxy(BaseWorker): - """An agent that allows the user to play the role of an agent in the conversation.""" + """An agent that allows the user to play the role of an agent in the conversation via input.""" DEFAULT_DESCRIPTION = "A human user." diff --git a/python/packages/autogen-magentic-one/src/autogen_magentic_one/messages.py b/python/packages/autogen-magentic-one/src/autogen_magentic_one/messages.py index 16924dd0db5c..c46ba997b255 100644 --- a/python/packages/autogen-magentic-one/src/autogen_magentic_one/messages.py +++ b/python/packages/autogen-magentic-one/src/autogen_magentic_one/messages.py @@ -11,27 +11,34 @@ FunctionExecutionContent = List[FunctionExecutionResult] SystemContent = str +# the below are message types used in MagenticOne + +# used by all agents to send messages class BroadcastMessage(BaseModel): content: LLMMessage request_halt: bool = False +# used by orchestrator to obtain a response from an agent @dataclass class RequestReplyMessage: pass +# used by orchestrator to reset an agent @dataclass class ResetMessage: pass +# used by orchestrator to deactivate an agent @dataclass class DeactivateMessage: pass +# orchestrator events @dataclass class OrchestrationEvent: source: str @@ -47,6 +54,7 @@ class AgentEvent: message: str +# used by the web surfer agent @dataclass class WebSurferEvent: source: str diff --git a/python/packages/autogen-magentic-one/tests/browser_utils/test_requests_markdown_browser.py b/python/packages/autogen-magentic-one/tests/browser_utils/test_requests_markdown_browser.py index 4293f4cd23b2..b9919abf7a8e 100644 --- a/python/packages/autogen-magentic-one/tests/browser_utils/test_requests_markdown_browser.py +++ b/python/packages/autogen-magentic-one/tests/browser_utils/test_requests_markdown_browser.py @@ -128,10 +128,10 @@ def test_requests_markdown_browser() -> None: browser.visit_page(PLAIN_TEXT_URL) assert re.sub(r"\s+", " ", string=browser.page_content, flags=re.DOTALL).strip() == expected_results - # Disrectly download a ZIP file and compute its md5 + # Disrectly download a ZIP file and compute its sha256 response = requests.get(DOWNLOAD_URL, stream=True) response.raise_for_status() - expected_md5 = hashlib.md5(response.raw.read()).hexdigest() + expected_sha256 = hashlib.sha256(response.raw.read()).hexdigest() # Download it with the browser and check for a match viewport = browser.visit_page(DOWNLOAD_URL) @@ -139,10 +139,10 @@ def test_requests_markdown_browser() -> None: assert m is not None download_loc = m.group(1) with open(download_loc, "rb") as fh: - downloaded_md5 = hashlib.md5(fh.read()).hexdigest() + downloaded_sha256 = hashlib.sha256(fh.read()).hexdigest() # MD%s should match - assert expected_md5 == downloaded_md5 + assert expected_sha256 == downloaded_sha256 # Fetch a PDF viewport = browser.visit_page(PDF_URL) diff --git a/python/packages/autogen-studio/Dockerfile b/python/packages/autogen-studio/Dockerfile index bb475db7176b..e8e9dbd8182a 100644 --- a/python/packages/autogen-studio/Dockerfile +++ b/python/packages/autogen-studio/Dockerfile @@ -1,4 +1,4 @@ -FROM python:3.10 +FROM 
mcr.microsoft.com/devcontainers/python:3.10 WORKDIR /code diff --git a/python/packages/autogen-studio/autogenstudio/utils/utils.py b/python/packages/autogen-studio/autogenstudio/utils/utils.py index 40cd549cb06b..88f310d6ffc3 100644 --- a/python/packages/autogen-studio/autogenstudio/utils/utils.py +++ b/python/packages/autogen-studio/autogenstudio/utils/utils.py @@ -17,14 +17,14 @@ from ..version import APP_NAME -def md5_hash(text: str) -> str: +def sha256_hash(text: str) -> str: """ Compute the MD5 hash of a given text. :param text: The string to hash :return: The MD5 hash of the text """ - return hashlib.md5(text.encode()).hexdigest() + return hashlib.sha256(text.encode()).hexdigest() def check_and_cast_datetime_fields(obj: Any) -> Any: diff --git a/python/packages/autogen-studio/autogenstudio/web/app.py b/python/packages/autogen-studio/autogenstudio/web/app.py index bbd087f52ea2..d86e2dc439fd 100644 --- a/python/packages/autogen-studio/autogenstudio/web/app.py +++ b/python/packages/autogen-studio/autogenstudio/web/app.py @@ -17,7 +17,7 @@ from ..database.dbmanager import DBManager from ..datamodel import Agent, Message, Model, Response, Session, Skill, Workflow from ..profiler import Profiler -from ..utils import check_and_cast_datetime_fields, init_app_folders, md5_hash, test_model +from ..utils import check_and_cast_datetime_fields, init_app_folders, sha256_hash, test_model from ..version import VERSION from ..websocket_connection_manager import WebSocketConnectionManager @@ -453,7 +453,7 @@ async def run_session_workflow(message: Message, session_id: int, workflow_id: i ) # save incoming message dbmanager.upsert(message) - user_dir = os.path.join(folders["files_static_root"], "user", md5_hash(message.user_id)) + user_dir = os.path.join(folders["files_static_root"], "user", sha256_hash(message.user_id)) os.makedirs(user_dir, exist_ok=True) workflow = workflow_from_id(workflow_id, dbmanager=dbmanager) agent_response: Message = await managers["chat"].a_chat( diff --git a/python/uv.lock b/python/uv.lock index 673d82c6b195..fd386de0563b 100644 --- a/python/uv.lock +++ b/python/uv.lock @@ -498,7 +498,7 @@ docker-code-executor = [ { name = "docker" }, ] langchain-tools = [ - { name = "langchain" }, + { name = "langchain-core" }, ] [package.metadata] @@ -506,7 +506,7 @@ requires-dist = [ { name = "autogen-core", editable = "packages/autogen-core" }, { name = "azure-core", marker = "extra == 'azure-code-executor'" }, { name = "docker", marker = "extra == 'docker-code-executor'", specifier = "~=7.0" }, - { name = "langchain", marker = "extra == 'langchain-tools'", specifier = ">=0.3.1" }, + { name = "langchain-core", marker = "extra == 'langchain-tools'", specifier = "~=0.3.3" }, ] [[package]] @@ -1965,28 +1965,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c9/fb/108ecd1fe961941959ad0ee4e12ee7b8b1477247f30b1fdfd83ceaf017f0/jupyter_core-5.7.2-py3-none-any.whl", hash = "sha256:4f7315d2f6b4bcf2e3e7cb6e46772eba760ae459cd1f59d29eb57b0a01bd7409", size = 28965 }, ] -[[package]] -name = "langchain" -version = "0.3.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "aiohttp" }, - { name = "async-timeout", marker = "python_full_version < '3.11'" }, - { name = "langchain-core" }, - { name = "langchain-text-splitters" }, - { name = "langsmith" }, - { name = "numpy" }, - { name = "pydantic" }, - { name = "pyyaml" }, - { name = "requests" }, - { name = "sqlalchemy" }, - { name = "tenacity" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/a5/9e/4cb6a6ba04151a440e8ee527934df53b577181a7db6fce517faab110a1e0/langchain-0.3.1.tar.gz", hash = "sha256:54d6e3abda2ec056875a231a418a4130ba7576e629e899067e499bfc847b7586", size = 416751 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/51/3f/462c134228fbb4f65be0a9db6a651e2f1d7226d003a712f1bac455a141b7/langchain-0.3.1-py3-none-any.whl", hash = "sha256:94e5ee7464d4366e4b158aa5704953c39701ea237b9ed4b200096d49e83bb3ae", size = 1005223 }, -] - [[package]] name = "langchain-core" version = "0.3.6" @@ -2019,18 +1997,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/8c/de/865dedcb252db4725e6e458fb28845038217fdade1df40a9e41b0579c534/langchain_openai-0.2.0-py3-none-any.whl", hash = "sha256:9a1a69ba0706f23ec2941096ead0bc39202cac0e9782a5d6c8d92cb2280c2759", size = 51465 }, ] -[[package]] -name = "langchain-text-splitters" -version = "0.3.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "langchain-core" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/57/35/08ac1ca01c58da825f070bd1fdc9192a9ff52c0a048f74c93b05df70c127/langchain_text_splitters-0.3.0.tar.gz", hash = "sha256:f9fe0b4d244db1d6de211e7343d4abc4aa90295aa22e1f0c89e51f33c55cd7ce", size = 20234 } -wheels = [ - { url = "https://files.pythonhosted.org/packages/da/6a/d1303b722a3fa7a0a8c2f8f5307e42f0bdbded46d99cca436f3db0df5294/langchain_text_splitters-0.3.0-py3-none-any.whl", hash = "sha256:e84243e45eaff16e5b776cd9c81b6d07c55c010ebcb1965deb3d1792b7358e83", size = 25543 }, -] - [[package]] name = "langgraph" version = "0.2.28"