From 4f060e5efa29f2332b5c5f670779cb8f76116e42 Mon Sep 17 00:00:00 2001 From: Krishna Shedbalkar <60742358+krishnashed@users.noreply.github.com> Date: Sat, 12 Oct 2024 06:39:12 +0530 Subject: [PATCH] Autobuild Function calling (#3238) * [Fix] Precommit issues * [Fix] checks * [Fix] iterating through list_of_functions * [Fix] pre-commit checks * Update test/agentchat/contrib/test_agent_builder.py Co-authored-by: Chi Wang <4250911+sonichi@users.noreply.github.com> --------- Co-authored-by: Chi Wang Co-authored-by: Ryan Sweet Co-authored-by: Chi Wang <4250911+sonichi@users.noreply.github.com> --- autogen/agentchat/contrib/agent_builder.py | 79 +++- notebook/autobuild_function_calling.ipynb | 470 +++++++++++++++++++ test/agentchat/contrib/test_agent_builder.py | 141 +++++- 3 files changed, 671 insertions(+), 19 deletions(-) create mode 100644 notebook/autobuild_function_calling.ipynb diff --git a/autogen/agentchat/contrib/agent_builder.py b/autogen/agentchat/contrib/agent_builder.py index 430017d13fc9..7eaec3eef747 100644 --- a/autogen/agentchat/contrib/agent_builder.py +++ b/autogen/agentchat/contrib/agent_builder.py @@ -172,6 +172,26 @@ class AgentBuilder: ``` """ + AGENT_FUNCTION_MAP_PROMPT = """Consider the following function. + Function Name: {function_name} + Function Description: {function_description} + + The agent details are given in the format: {format_agent_details} + + Which one of the following agents should be able to execute this function, preferably an agent with programming background? + {agent_details} + + Hint: + # Only respond with the name of the agent that is most suited to execute the function and nothing else. + """ + + UPDATED_AGENT_SYSTEM_MESSAGE = """ + {agent_system_message} + + You have access to execute the function: {function_name}. + With following description: {function_description} + """ + def __init__( self, config_file_or_env: Optional[str] = "OAI_CONFIG_LIST", @@ -358,6 +378,7 @@ def build( self, building_task: str, default_llm_config: Dict, + list_of_functions: Optional[List[Dict]] = None, coding: Optional[bool] = None, code_execution_config: Optional[Dict] = None, use_oai_assistant: Optional[bool] = False, @@ -373,6 +394,7 @@ def build( coding: use to identify if the user proxy (a code interpreter) should be added. code_execution_config: specific configs for user proxy (e.g., last_n_messages, work_dir, ...). default_llm_config: specific configs for LLM (e.g., config_list, seed, temperature, ...). + list_of_functions: list of functions to be associated with Agents use_oai_assistant: use OpenAI assistant api instead of self-constructed agent. user_proxy: user proxy's class that can be used to replace the default user proxy. @@ -480,8 +502,9 @@ def build( "code_execution_config": code_execution_config, } ) + _config_check(self.cached_configs) - return self._build_agents(use_oai_assistant, user_proxy=user_proxy, **kwargs) + return self._build_agents(use_oai_assistant, list_of_functions, user_proxy=user_proxy, **kwargs) def build_from_library( self, @@ -653,13 +676,18 @@ def build_from_library( return self._build_agents(use_oai_assistant, user_proxy=user_proxy, **kwargs) def _build_agents( - self, use_oai_assistant: Optional[bool] = False, user_proxy: Optional[autogen.ConversableAgent] = None, **kwargs + self, + use_oai_assistant: Optional[bool] = False, + list_of_functions: Optional[List[Dict]] = None, + user_proxy: Optional[autogen.ConversableAgent] = None, + **kwargs, ) -> Tuple[List[autogen.ConversableAgent], Dict]: """ Build agents with generated configs. Args: use_oai_assistant: use OpenAI assistant api instead of self-constructed agent. + list_of_functions: list of functions to be associated to Agents user_proxy: user proxy's class that can be used to replace the default user proxy. Returns: @@ -695,6 +723,53 @@ def _build_agents( ) agent_list = agent_list + [user_proxy] + agent_details = [] + + for agent in agent_list[:-1]: + agent_details.append({"name": agent.name, "description": agent.description}) + + if list_of_functions: + for func in list_of_functions: + resp = ( + self.builder_model.create( + messages=[ + { + "role": "user", + "content": self.AGENT_FUNCTION_MAP_PROMPT.format( + function_name=func["name"], + function_description=func["description"], + format_agent_details='[{"name": "agent_name", "description": "agent description"}, ...]', + agent_details=str(json.dumps(agent_details)), + ), + } + ] + ) + .choices[0] + .message.content + ) + + autogen.agentchat.register_function( + func["function"], + caller=self.agent_procs_assign[resp][0], + executor=agent_list[0], + name=func["name"], + description=func["description"], + ) + + agents_current_system_message = [ + agent["system_message"] for agent in agent_configs if agent["name"] == resp + ][0] + + self.agent_procs_assign[resp][0].update_system_message( + self.UPDATED_AGENT_SYSTEM_MESSAGE.format( + agent_system_message=agents_current_system_message, + function_name=func["name"], + function_description=func["description"], + ) + ) + + print(f"Function {func['name']} is registered to agent {resp}.") + return agent_list, self.cached_configs.copy() def save(self, filepath: Optional[str] = None) -> str: diff --git a/notebook/autobuild_function_calling.ipynb b/notebook/autobuild_function_calling.ipynb new file mode 100644 index 000000000000..f414de4b84a4 --- /dev/null +++ b/notebook/autobuild_function_calling.ipynb @@ -0,0 +1,470 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# AutoBuild Agents function calling\n", + "By: [Krishna Shedbalkar](https://github.com/krishnashed/)\n", + "\n", + "In this notebook, we introduce a way for Agents created using `Autobuild` to do function calling. Developers can specify a function, function name and function description which will thereafter be assigned and executed by the most suitable agent created using AutoBuild." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Requirement\n", + "\n", + "AutoBuild require `pyautogen[autobuild]`, which can be installed by the following command:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%pip install pyautogen[autobuild]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 1: Prepare configuration and some useful functions\n", + "\n", + "Prepare a `config_file_or_env` for assistant agent to limit the choice of LLM you want to use in this task. This config can be a path of json file or a name of environment variable. A `default_llm_config` is also required for initialize the specific config of LLMs like seed, temperature, etc. Preventing UserProxy agent being called multiple times by adding `allow_repeat_speaker=agent_list[:-1]`" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import autogen\n", + "from autogen.agentchat.contrib.agent_builder import AgentBuilder\n", + "\n", + "config_file_or_env = \"OAI_CONFIG_LIST\"\n", + "config_list = autogen.config_list_from_json(config_file_or_env, filter_dict={\"model\": [\"gpt-4-1106-preview\", \"gpt-4\"]})\n", + "llm_config = {\n", + " \"config_list\": config_list,\n", + " \"timeout\": 120,\n", + "}\n", + "\n", + "\n", + "def start_task(execution_task: str, agent_list: list):\n", + " group_chat = autogen.GroupChat(agents=agent_list, messages=[], allow_repeat_speaker=agent_list[:-1], max_round=12)\n", + " manager = autogen.GroupChatManager(groupchat=group_chat, llm_config={\"config_list\": config_list})\n", + " agent_list[0].initiate_chat(manager, message=execution_task)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 2: Create a AgentBuilder\n", + "\n", + "Create a `AgentBuilder` with the specified `config_path_or_env`. AgentBuilder will use `gpt-4` in default to complete the whole process, you can specify the `builder_model` and `agent_model` to other OpenAI model to match your task. You can also specify an open-source LLM supporting by vLLM and FastChat, see blog for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "builder = AgentBuilder(\n", + " config_file_or_env=config_file_or_env, builder_model=\"gpt-4-1106-preview\", agent_model=\"gpt-4-1106-preview\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 3: Specify a building task\n", + "\n", + "Specify a building task with a general description. Building task will help build manager (a LLM) decide what agents should be built." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "building_task = \"Analyze and list the trending topics in arxiv papers related to GPT-4\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 4: Define functions\n", + "\n", + "Define functions to be executed by the Agents of AutoBuild, further specify details like `name`, `description` and `function` of all the functions in an array called `list_of_functions` which will be passed to `builder.build()`" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "import time\n", + "from datetime import datetime, timedelta\n", + "from typing import Dict\n", + "\n", + "import feedparser\n", + "\n", + "\n", + "def get_arxiv_paper_from_a_week(search_topic: str) -> Dict:\n", + " # arXiv API endpoint\n", + " url = \"http://export.arxiv.org/api/query?\"\n", + "\n", + " # Search parameters\n", + " max_results = 10\n", + "\n", + " query = (\n", + " f\"{url}search_query=all:{search_topic}&max_results={max_results}&sortBy=lastUpdatedDate&sortOrder=descending\"\n", + " )\n", + "\n", + " # Parse the feed\n", + " feed = feedparser.parse(query)\n", + "\n", + " now = datetime.now()\n", + " week_ago = now - timedelta(weeks=1)\n", + "\n", + " papers = []\n", + "\n", + " # Get papers from last week\n", + " for entry in feed.entries:\n", + " published_time = datetime.strptime(entry.published, \"%Y-%m-%dT%H:%M:%SZ\")\n", + " if published_time > week_ago:\n", + " list_of_authors = \", \".join(author.name for author in entry.authors)\n", + "\n", + " papers.append(\n", + " {\n", + " \"title\": entry.title,\n", + " \"authors\": list_of_authors,\n", + " \"published_on\": time.strftime(\"%B %d, %Y\", entry.published_parsed),\n", + " \"summary\": entry.summary,\n", + " \"link\": entry.link,\n", + " }\n", + " )\n", + "\n", + " return papers\n", + "\n", + "\n", + "list_of_functions = [\n", + " {\n", + " \"name\": \"get_arxiv_paper_from_a_week\",\n", + " \"description\": \"Get arxiv papers published in last week\",\n", + " \"function\": get_arxiv_paper_from_a_week,\n", + " }\n", + "]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 5: build group chat agents\n", + "\n", + "Use `build()` to let build manager (the specified `builder_model`) complete the group chat agents generation. Specify `list_of_functions` to be used by the Agents" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m==> Generating agents...\u001b[0m\n", + "['NLP_Expert', 'DataAnalysis_Expert', 'AIResearch_Expert'] are generated.\n", + "\u001b[32m==> Generating system message...\u001b[0m\n", + "Preparing system message for NLP_Expert\n", + "Preparing system message for DataAnalysis_Expert\n", + "Preparing system message for AIResearch_Expert\n", + "\u001b[32m==> Generating description...\u001b[0m\n", + "Preparing description for NLP_Expert\n", + "Preparing description for DataAnalysis_Expert\n", + "Preparing description for AIResearch_Expert\n", + "\u001b[32m==> Creating agents...\u001b[0m\n", + "Creating agent NLP_Expert...\n", + "Creating agent DataAnalysis_Expert...\n", + "Creating agent AIResearch_Expert...\n", + "Adding user console proxy...\n", + "Function get_arxiv_paper_from_a_week is registered to agent DataAnalysis_Expert.\n" + ] + } + ], + "source": [ + "agent_list, agent_configs = builder.build(building_task, llm_config, list_of_functions, max_agents=3)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here you can see that Function `exec_python` has been associated with `ArxivAPI_Expert` Agent." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 6: execute task\n", + "\n", + "Let agents generated in `build()` to complete the task collaboratively in a group chat." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mNLP_Expert\u001b[0m (to chat_manager):\n", + "\n", + "Analyze and list the trending topics in arxiv papers related to GPT-4\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32m\n", + "Next speaker: DataAnalysis_Expert\n", + "\u001b[0m\n", + "\u001b[33mDataAnalysis_Expert\u001b[0m (to chat_manager):\n", + "\n", + "\u001b[32m***** Suggested tool call (call_hkKs7wbCyAOMkC4QjOYMLgtd): get_arxiv_paper_from_a_week *****\u001b[0m\n", + "Arguments: \n", + "{\"search_topic\":\"GPT-4\"}\n", + "\u001b[32m********************************************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: NLP_Expert\n", + "\u001b[0m\n", + "\u001b[35m\n", + ">>>>>>>> EXECUTING FUNCTION get_arxiv_paper_from_a_week...\u001b[0m\n", + "\u001b[33mNLP_Expert\u001b[0m (to chat_manager):\n", + "\n", + "\u001b[33mNLP_Expert\u001b[0m (to chat_manager):\n", + "\n", + "\u001b[32m***** Response from calling tool (call_hkKs7wbCyAOMkC4QjOYMLgtd) *****\u001b[0m\n", + "[{\"title\": \"Self-Training with Direct Preference Optimization Improves\\n Chain-of-Thought Reasoning\", \"authors\": \"Tianduo Wang, Shichen Li, Wei Lu\", \"published_on\": \"July 25, 2024\", \"summary\": \"Effective training of language models (LMs) for mathematical reasoning tasks\\ndemands high-quality supervised fine-tuning data. Besides obtaining annotations\\nfrom human experts, a common alternative is sampling from larger and more\\npowerful LMs. However, this knowledge distillation approach can be costly and\\nunstable, particularly when relying on closed-source, proprietary LMs like\\nGPT-4, whose behaviors are often unpredictable. In this work, we demonstrate\\nthat the reasoning abilities of small-scale LMs can be enhanced through\\nself-training, a process where models learn from their own outputs. We also\\nshow that the conventional self-training can be further augmented by a\\npreference learning algorithm called Direct Preference Optimization (DPO). By\\nintegrating DPO into self-training, we leverage preference data to guide LMs\\ntowards more accurate and diverse chain-of-thought reasoning. We evaluate our\\nmethod across various mathematical reasoning tasks using different base models.\\nOur experiments show that this approach not only improves LMs' reasoning\\nperformance but also offers a more cost-effective and scalable solution\\ncompared to relying on large proprietary LMs.\", \"link\": \"http://arxiv.org/abs/2407.18248v1\"}, {\"title\": \"C2P: Featuring Large Language Models with Causal Reasoning\", \"authors\": \"Abdolmahdi Bagheri, Matin Alinejad, Kevin Bello, Alireza Akhondi-Asl\", \"published_on\": \"July 25, 2024\", \"summary\": \"Causal reasoning is the primary bottleneck that Large Language Models (LLMs)\\nmust overcome to attain human-level intelligence. To address this, we introduce\\nthe Causal Chain of Prompting (C2P) as the first reasoning framework that\\nequips current LLMs with causal reasoning capabilities. C2P operates\\nautonomously, avoiding reliance on external tools or modules during both the\\ncausal learning and reasoning phases, and can be seamlessly implemented during\\nthe training or fine-tuning of LLMs. Experimental results across various\\nbenchmark datasets demonstrate a significant improvement in causal learning and\\nsubsequent reasoning accuracy of LLMs. We illustrate how C2P enhances LLMs'\\nability to causally reason in real-world scenarios, addressing complex problems\\nin fields such as healthcare, medicine, economics, education, social sciences,\\nenvironmental science, and marketing. With few-shot learning, GPT-4 Turbo using\\nC2P with as few as six examples achieves significant performance improvements,\\nboasting over a 33% increase in reasoning accuracy over the most\\nstate-of-the-art LLMs, which perform nearly randomly in similar circumstances.\\nThis demonstrates the transformative potential of integrating C2P into LLM\\ntraining or fine-tuning processes, thereby empowering these models with\\nadvanced causal reasoning capabilities.\", \"link\": \"http://arxiv.org/abs/2407.18069v1\"}, {\"title\": \"Is the Digital Forensics and Incident Response Pipeline Ready for\\n Text-Based Threats in LLM Era?\", \"authors\": \"Avanti Bhandarkar, Ronald Wilson, Anushka Swarup, Mengdi Zhu, Damon Woodard\", \"published_on\": \"July 25, 2024\", \"summary\": \"In the era of generative AI, the widespread adoption of Neural Text\\nGenerators (NTGs) presents new cybersecurity challenges, particularly within\\nthe realms of Digital Forensics and Incident Response (DFIR). These challenges\\nprimarily involve the detection and attribution of sources behind advanced\\nattacks like spearphishing and disinformation campaigns. As NTGs evolve, the\\ntask of distinguishing between human and NTG-authored texts becomes critically\\ncomplex. This paper rigorously evaluates the DFIR pipeline tailored for\\ntext-based security systems, specifically focusing on the challenges of\\ndetecting and attributing authorship of NTG-authored texts. By introducing a\\nnovel human-NTG co-authorship text attack, termed CS-ACT, our study uncovers\\nsignificant vulnerabilities in traditional DFIR methodologies, highlighting\\ndiscrepancies between ideal scenarios and real-world conditions. Utilizing 14\\ndiverse datasets and 43 unique NTGs, up to the latest GPT-4, our research\\nidentifies substantial vulnerabilities in the forensic profiling phase,\\nparticularly in attributing authorship to NTGs. Our comprehensive evaluation\\npoints to factors such as model sophistication and the lack of distinctive\\nstyle within NTGs as significant contributors for these vulnerabilities. Our\\nfindings underscore the necessity for more sophisticated and adaptable\\nstrategies, such as incorporating adversarial learning, stylizing NTGs, and\\nimplementing hierarchical attribution through the mapping of NTG lineages to\\nenhance source attribution. This sets the stage for future research and the\\ndevelopment of more resilient text-based security systems.\", \"link\": \"http://arxiv.org/abs/2407.17870v1\"}, {\"title\": \"Cost-effective Instruction Learning for Pathology Vision and Language\\n Analysis\", \"authors\": \"Kaitao Chen, Mianxin Liu, Fang Yan, Lei Ma, Xiaoming Shi, Lilong Wang, Xiaosong Wang, Lifeng Zhu, Zhe Wang, Mu Zhou, Shaoting Zhang\", \"published_on\": \"July 25, 2024\", \"summary\": \"The advent of vision-language models fosters the interactive conversations\\nbetween AI-enabled models and humans. Yet applying these models into clinics\\nmust deal with daunting challenges around large-scale training data, financial,\\nand computational resources. Here we propose a cost-effective instruction\\nlearning framework for conversational pathology named as CLOVER. CLOVER only\\ntrains a lightweight module and uses instruction tuning while freezing the\\nparameters of the large language model. Instead of using costly GPT-4, we\\npropose well-designed prompts on GPT-3.5 for building generation-based\\ninstructions, emphasizing the utility of pathological knowledge derived from\\nthe Internet source. To augment the use of instructions, we construct a\\nhigh-quality set of template-based instructions in the context of digital\\npathology. From two benchmark datasets, our findings reveal the strength of\\nhybrid-form instructions in the visual question-answer in pathology. Extensive\\nresults show the cost-effectiveness of CLOVER in answering both open-ended and\\nclosed-ended questions, where CLOVER outperforms strong baselines that possess\\n37 times more training parameters and use instruction data generated from\\nGPT-4. Through the instruction tuning, CLOVER exhibits robustness of few-shot\\nlearning in the external clinical dataset. These findings demonstrate that\\ncost-effective modeling of CLOVER could accelerate the adoption of rapid\\nconversational applications in the landscape of digital pathology.\", \"link\": \"http://arxiv.org/abs/2407.17734v1\"}, {\"title\": \"My Ontologist: Evaluating BFO-Based AI for Definition Support\", \"authors\": \"Carter Benson, Alec Sculley, Austin Liebers, John Beverley\", \"published_on\": \"July 24, 2024\", \"summary\": \"Generative artificial intelligence (AI), exemplified by the release of\\nGPT-3.5 in 2022, has significantly advanced the potential applications of large\\nlanguage models (LLMs), including in the realms of ontology development and\\nknowledge graph creation. Ontologies, which are structured frameworks for\\norganizing information, and knowledge graphs, which combine ontologies with\\nactual data, are essential for enabling interoperability and automated\\nreasoning. However, current research has largely overlooked the generation of\\nontologies extending from established upper-level frameworks like the Basic\\nFormal Ontology (BFO), risking the creation of non-integrable ontology silos.\\nThis study explores the extent to which LLMs, particularly GPT-4, can support\\nontologists trained in BFO. Through iterative development of a specialized GPT\\nmodel named \\\"My Ontologist,\\\" we aimed to generate BFO-conformant ontologies.\\nInitial versions faced challenges in maintaining definition conventions and\\nleveraging foundational texts effectively. My Ontologist 3.0 showed promise by\\nadhering to structured rules and modular ontology suites, yet the release of\\nGPT-4o disrupted this progress by altering the model's behavior. Our findings\\nunderscore the importance of aligning LLM-generated ontologies with top-level\\nstandards and highlight the complexities of integrating evolving AI\\ncapabilities in ontology engineering.\", \"link\": \"http://arxiv.org/abs/2407.17657v1\"}, {\"title\": \"Can GPT-4 learn to analyze moves in research article abstracts?\", \"authors\": \"Danni Yu, Marina Bondi, Ken Hyland\", \"published_on\": \"July 22, 2024\", \"summary\": \"One of the most powerful and enduring ideas in written discourse analysis is\\nthat genres can be described in terms of the moves which structure a writer's\\npurpose. Considerable research has sought to identify these distinct\\ncommunicative acts, but analyses have been beset by problems of subjectivity,\\nreliability and the time-consuming need for multiple coders to confirm\\nanalyses. In this paper we employ the affordances of GPT-4 to automate the\\nannotation process by using natural language prompts. Focusing on abstracts\\nfrom articles in four applied linguistics journals, we devise prompts which\\nenable the model to identify moves effectively. The annotated outputs of these\\nprompts were evaluated by two assessors with a third addressing disagreements.\\nThe results show that an 8-shot prompt was more effective than one using two,\\nconfirming that the inclusion of examples illustrating areas of variability can\\nenhance GPT-4's ability to recognize multiple moves in a single sentence and\\nreduce bias related to textual position. We suggest that GPT-4 offers\\nconsiderable potential in automating this annotation process, when human actors\\nwith domain specific linguistic expertise inform the prompting process.\", \"link\": \"http://arxiv.org/abs/2407.15612v2\"}, {\"title\": \"I Could've Asked That: Reformulating Unanswerable Questions\", \"authors\": \"Wenting Zhao, Ge Gao, Claire Cardie, Alexander M. Rush\", \"published_on\": \"July 24, 2024\", \"summary\": \"When seeking information from unfamiliar documents, users frequently pose\\nquestions that cannot be answered by the documents. While existing large\\nlanguage models (LLMs) identify these unanswerable questions, they do not\\nassist users in reformulating their questions, thereby reducing their overall\\nutility. We curate CouldAsk, an evaluation benchmark composed of existing and\\nnew datasets for document-grounded question answering, specifically designed to\\nstudy reformulating unanswerable questions. We evaluate state-of-the-art\\nopen-source and proprietary LLMs on CouldAsk. The results demonstrate the\\nlimited capabilities of these models in reformulating questions. Specifically,\\nGPT-4 and Llama2-7B successfully reformulate questions only 26% and 12% of the\\ntime, respectively. Error analysis shows that 62% of the unsuccessful\\nreformulations stem from the models merely rephrasing the questions or even\\ngenerating identical questions. We publicly release the benchmark and the code\\nto reproduce the experiments.\", \"link\": \"http://arxiv.org/abs/2407.17469v1\"}]\n", + "\u001b[32m**********************************************************************\u001b[0m\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: DataAnalysis_Expert\n", + "\u001b[0m\n", + "\u001b[33mDataAnalysis_Expert\u001b[0m (to chat_manager):\n", + "\n", + "After reviewing the recent arXiv papers related to GPT-4, the following trending topics have been identified:\n", + "\n", + "1. **Chain-of-Thought Reasoning and Self-Training**: One study proposes self-training methods, coupled with Direct Preference Optimization (DPO), to improve chain-of-thought reasoning in smaller language models, addressing the limitations of depending on proprietary large language models like GPT-4 for knowledge distillation ([source](http://arxiv.org/abs/2407.18248v1)).\n", + "\n", + "2. **Causal Reasoning in LLMs**: Another research introduces the Causal Chain of Prompting (C2P) framework designed to equip large language models with causal reasoning capabilities. The study shows that this framework, when used with GPT-4, achieves significant performance improvements in various practical scenarios ([source](http://arxiv.org/abs/2407.18069v1)).\n", + "\n", + "3. **Digital Forensics and Text-Based Security Threats**: This paper evaluates how well the Digital Forensics and Incident Response (DFIR) pipeline can handle text-based threats in the LLM era, specifically in the context of detecting and attributing authorship to texts generated by neural text generators like GPT-4 ([source](http://arxiv.org/abs/2407.17870v1)).\n", + "\n", + "4. **Cost-Effective Instruction Learning**: A research team discusses a new cost-effective instruction learning framework for conversational pathology named CLOVER, which leverages well-designed prompts on GPT-3.5, demonstrating that savings can be made on computation and financial resources while applying language models in a clinical setting ([source](http://arxiv.org/abs/2407.17734v1)).\n", + "\n", + "5. **Ontology Generation and AI Alignment**: An investigation into the feasibility of GPT-4 supporting ontology development based on the Basic Formal Ontology (BFO). The findings suggest challenges in aligning the rapidly evolving capabilities of LLMs with well-structured ontology standards ([source](http://arxiv.org/abs/2407.17657v1)).\n", + "\n", + "6. **Automating Annotation Processes in Discourse Analysis**: Researchers employ GPT-4's capacity to automate the annotation of communicative moves in written discourse, using natural language prompts to identify structural elements in academic abstracts more effectively ([source](http://arxiv.org/abs/2407.15612v2)).\n", + "\n", + "7. **Reformulating Unanswerable Questions**: Another study creates a benchmark for evaluating LLMs' performance in reformulating unanswerable questions. The findings indicate that even state-of-the-art models like GPT-4 struggle with this task, often failing to significantly alter the original unanswerable question ([source](http://arxiv.org/abs/2407.17469v1)).\n", + "\n", + "These topics illustrate the varied applications of GPT-4 and highlight the research community's focus on enhancing reasoning, security, cost-effectiveness, and interoperability within LLMs. Additionally, the challenges of annotation and question reformulation show that there is still room for improvement in understanding and processing natural language through AI.\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: AIResearch_Expert\n", + "\u001b[0m\n", + "\u001b[33mAIResearch_Expert\u001b[0m (to chat_manager):\n", + "\n", + "Based on the synthesized information provided by the DataAnalysis_Expert, the current trending topics for GPT-4 research seem to cover a spectrum of cognitive and practical applications: from enhancing reasoning capabilities, improving training methodologies, tackling new cybersecurity issues, to the more nuanced tasks of ontology engineering and discourse annotation.\n", + "\n", + "A future direction that seems particularly pertinent is investigating the interplay between GPT-4's abilities and human intelligence. For example, how GPT-4 can support specialized professionals in tasks that require highly structured knowledge, such as legal document analysis, medical diagnosis, or engineering design. There's scope to explore how the communication between GPT-4 and humans could be streamlined for cooperative problem-solving.\n", + "\n", + "Another promising direction is delving into the area of affective computing: understanding emotions, sarcasm, and subtleties in text to improve human-computer interactions. This can lead to breakthrough applications in personalized digital assistants, education (tailored student feedback), and mental health (empathetic conversational agents).\n", + "\n", + "Finally, with the mentioned challenges in reformulation of unanswerable questions and ontology alignment, extensive research focused on understanding the limitations of GPT-4's language comprehension could result in more nuanced teaching mechanisms for AI, leading to more reliable autonomous decision-making applications.\n", + "\n", + "Would the NLP_Expert or DataAnalysis_Expert like to weigh in on some applications or future research directions specifically leveraging linguistic or data-driven nuances that GPT-4 may be capable of addressing?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: NLP_Expert\n", + "\u001b[0m\n", + "\u001b[33mNLP_Expert\u001b[0m (to chat_manager):\n", + "\n", + "Sure, I'd like to add some insights from the perspective of NLP.\n", + "\n", + "The field of representational learning is particularly important as we continue to seek ways to improve GPT-4's performance. More sophisticated embedding methods could capture nuanced semantic relationships, make fine-grained distinctions among synonyms, handle polysemy more effectively, and distinguish subtle connotations—a direction that might enhance GPT-4's capabilities in tasks like sentiment analysis, irony detection, and humor recognition.\n", + "\n", + "In terms of practical applications, one future direction lies in the domain of legal and ethical reasoning. As systems like GPT-4 are leveraged for legal research and analysis, it will be essential to teach the model to navigate complex ethical considerations and the nuanced language of legal literature. This could involve training on specialized datasets that include case law and legal precedence, as well as philosophical texts dealing with ethics.\n", + "\n", + "Moreover, the trend towards cross-domain applications — such as pathology vision-language analysis mentioned in the papers — suggests that GPT-4 could be tailored to handle multi-modal inputs more effectively. Here the interaction between visual data and textual information requires further exploration, particularly in the way that GPT-4 processes and generates language grounded in non-textual contexts.\n", + "\n", + "Lastly, the recurring theme of cost-effectiveness in model training opens up a broader discussion about sustainable AI practices. Finding ways to reduce the data, energy, and computational resources necessary for training and deploying models like GPT-4 isn't just a technical challenge, but also an environmental and economic one. This includes research into more efficient algorithms and architectures, as well as methodologies for leveraging smaller models to approximate the capabilities of more substantial systems.\n", + "\n", + "There are indeed a myriad of pathways for future research, all of which underscore the evolving nature of NLP and its entwinement with the continually advancing capabilities of models like GPT-4. Any further explorations or examinations on these topics from the group?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: MachineLearning_Expert\n", + "\u001b[0m\n", + "\u001b[33mMachineLearning_Expert\u001b[0m (to chat_manager):\n", + "\n", + "Considering the points made by the NLP_Expert and AIResearch_Expert, I'd like to contribute a take on how we might further the state-of-the-art in machine learning leveraging the capabilities of GPT-4.\n", + "\n", + "For one, the convergence of unsupervised, semi-supervised, and supervised learning methodologies might be beneficial for GPT-4's future iterations or similar models. The incorporation of unsupervised learning can help in better understanding context and meaning without substantial labeled datasets. This could improve GPT-4's potential in low-resource languages and niche applications where labeled data is scarce.\n", + "\n", + "Another realm of exploration could be few-shot and one-shot learning. As models become more capable, their ability to generalize from fewer examples is critical. Fine-tuning GPT-4's few-shot learning capabilities can have practical implications in personalized AI services where the model needs to adapt quickly to individual user needs and preferences.\n", + "\n", + "Additionally, given the recent trends in GPT-4 research, the incorporation of neuro-symbolic approaches may offer a promising avenue for enhancing reasoning capabilities. This hybrid approach combines neural networks' learning prowess with symbolic AI's rule-based reasoning - providing a pathway to improve GPT-4's causal reasoning, problem-solving, and comprehension of complex systems.\n", + "\n", + "Finally, as we progress toward AI models that comprehend and generate human-like text, the ethical dimension becomes paramount. It's crucial for future research to focus on models' accountability, interpretability, and fairness. By building robust mechanisms for transparency and control, we can ensure that advancements in GPT-4 and similar AI are developed responsibly and augment human abilities without unintended consequences.\n", + "\n", + "If the group agrees, these threads can weave together a forward-looking agenda for research in machine learning, focusing on resource efficiency, learning dexterity, cognitive depth, and ethical robustness. Are we in consensus to proceed on finalizing our analysis and concluding our discussion?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: AI_Research_Expert\n", + "\u001b[0m\n", + "\u001b[33mAI_Research_Expert\u001b[0m (to chat_manager):\n", + "\n", + "The interdisciplinary perspectives presented here create a comprehensive view of the potential for GPT-4 and its related research domains. We have touched upon cognitive enhancements in machine reasoning, representational learning, cross-domain applications, sustainable AI, few-shot learning, neuro-symbolic approaches, and ethical considerations in AI development.\n", + "\n", + "As an AI Research Expert, I second the synthesis of these insights and propose that our analysis has reached a natural conclusion with consensus on the future directions and implications of GPT-4's continuing evolution within the AI landscape.\n", + "\n", + "If there are no additional insights or questions from the group, I suggest we may consider our task complete. Shall we proceed to close our discussion?\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: Computer_terminal\n", + "\u001b[0m\n", + "\u001b[33mComputer_terminal\u001b[0m (to chat_manager):\n", + "\n", + "There is no code from the last 1 message for me to execute. Group chat manager should let other participants to continue the conversation. If the group chat manager want to end the conversation, you should let other participant reply me only with \"TERMINATE\"\n", + "\n", + "--------------------------------------------------------------------------------\n", + "\u001b[32m\n", + "Next speaker: AI_Research_Expert\n", + "\u001b[0m\n", + "\u001b[33mAI_Research_Expert\u001b[0m (to chat_manager):\n", + "\n", + "TERMINATE\n", + "\n", + "--------------------------------------------------------------------------------\n" + ] + } + ], + "source": [ + "start_task(execution_task=building_task, agent_list=agent_list)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Step 7 (Optional): clear all agents and prepare for the next task\n", + "\n", + "You can clear all agents generated in this task by the following code if your task is completed or the next task is largely different from the current task. If the agent's backbone is an open-source LLM, this process will also shut down the endpoint server. If necessary, you can use `recycle_endpoint=False` to retain the previous open-source LLMs' endpoint server." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[33mAll agents have been cleared.\u001b[0m\n" + ] + } + ], + "source": [ + "builder.clear_all_agents(recycle_endpoint=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Save & load configs\n", + "\n", + "You can save all necessary information of the built group chat agents. Here is a case for those agents generated in the above task:" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001b[32mBuilding config saved to ./save_config_8e0d96e24673563ecb572d92ed003d2a.json\u001b[0m\n" + ] + } + ], + "source": [ + "saved_path = builder.save()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.14" + }, + "orig_nbformat": 4, + "vscode": { + "interpreter": { + "hash": "e7370f93d1d0cde622a1f8e1c04877d8463912d04d973331ad4851f04de6915a" + } + } + }, + "nbformat": 4, + "nbformat_minor": 2 + } diff --git a/test/agentchat/contrib/test_agent_builder.py b/test/agentchat/contrib/test_agent_builder.py index e2e39e8ba43b..e0d8515c0fe0 100755 --- a/test/agentchat/contrib/test_agent_builder.py +++ b/test/agentchat/contrib/test_agent_builder.py @@ -3,15 +3,18 @@ import json import os import sys +from unittest.mock import MagicMock, patch import pytest +import autogen from autogen.agentchat.contrib.agent_builder import AgentBuilder sys.path.append(os.path.join(os.path.dirname(__file__), "..")) sys.path.append(os.path.join(os.path.dirname(__file__), "../..")) + from conftest import reason, skip_openai # noqa: E402 -from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST # noqa: E402 +from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST # noqa: E402 # noqa: E402 try: import chromadb @@ -22,6 +25,7 @@ skip = False here = os.path.abspath(os.path.dirname(__file__)) +llm_config = {"temperature": 0} def _config_check(config): @@ -37,10 +41,27 @@ def _config_check(config): assert agent_config.get("system_message", None) is not None -@pytest.mark.skipif( - skip_openai, - reason=reason, -) +# Function initializes a group chat with agents and starts a execution_task. +def start_task(execution_task: str, agent_list: list): + group_chat = autogen.GroupChat(agents=agent_list, messages=[], max_round=12) + manager = autogen.GroupChatManager( + groupchat=group_chat, + llm_config={"config_list": autogen.config_list_from_json(f"{KEY_LOC}/{OAI_CONFIG_LIST}"), **llm_config}, + ) + + agent_list[0].initiate_chat(manager, message=execution_task) + + +ask_ossinsight_mock = MagicMock() + + +# Function to test function calling +def ask_ossinsight(question: str) -> str: + ask_ossinsight_mock(question) + return "The repository microsoft/autogen has 123,456 stars on GitHub." + + +@pytest.mark.skipif(skip_openai, reason=reason) def test_build(): builder = AgentBuilder( config_file_or_env=OAI_CONFIG_LIST, @@ -69,6 +90,99 @@ def test_build(): assert len(agent_config["agent_configs"]) <= builder.max_agents +@pytest.mark.skipif(skip_openai or skip, reason=reason + "OR dependency not installed") +def test_build_assistant_with_function_calling(): + list_of_functions = [ + { + "name": "ossinsight_data_api", + "description": "This is an API endpoint allowing users (analysts) to input question about GitHub in text format to retrieve the related and structured data.", + "function": ask_ossinsight, + } + ] + + builder = AgentBuilder( + config_file_or_env=OAI_CONFIG_LIST, config_file_location=KEY_LOC, builder_model="gpt-4", agent_model="gpt-4" + ) + building_task = "How many stars microsoft/autogen has on GitHub?" + + agent_list, agent_config = builder.build( + building_task=building_task, + default_llm_config={"temperature": 0}, + code_execution_config={ + "last_n_messages": 2, + "work_dir": f"{here}/test_agent_scripts", + "timeout": 60, + "use_docker": "python:3", + }, + list_of_functions=list_of_functions, + ) + + _config_check(agent_config) + + # check number of agents + assert len(agent_config["agent_configs"]) <= builder.max_agents + + # Mock the 'ask_ossinsight' function in the '_main_' module using a context manager. + with patch(f"{__name__}.ask_ossinsight") as mocked_function: + # Execute 'start_task' which should trigger 'ask_ossinsight' due to the given execution task. + start_task( + execution_task="How many stars microsoft/autogen has on GitHub?", + agent_list=agent_list, + ) + + # Verify that 'ask_ossinsight' was called exactly once during the task execution. + mocked_function.assert_called() + + +@pytest.mark.skipif( + skip_openai, + reason="requested to skip", +) +def test_build_gpt_assistant_with_function_calling(): + list_of_functions = [ + { + "name": "ossinsight_data_api", + "description": "This is an API endpoint allowing users (analysts) to input question about GitHub in text format to retrieve the related and structured data.", + "function": ask_ossinsight, + } + ] + + builder = AgentBuilder( + config_file_or_env=OAI_CONFIG_LIST, config_file_location=KEY_LOC, builder_model="gpt-4", agent_model="gpt-4" + ) + + building_task = "Determine number of stars of GitHub repositories" + + agent_list, agent_config = builder.build( + building_task=building_task, + default_llm_config={"temperature": 0}, + code_execution_config={ + "last_n_messages": 2, + "work_dir": f"{here}/test_agent_scripts", + "timeout": 60, + "use_docker": "python:3", + }, + list_of_functions=list_of_functions, + use_oai_assistant=True, + ) + + _config_check(agent_config) + + # check number of agents + assert len(agent_config["agent_configs"]) <= builder.max_agents + + # Mock the 'ask_ossinsight' function in the '_main_' module using a context manager. + with patch(f"{__name__}.ask_ossinsight") as mocked_function: + # Execute 'start_task' which should trigger 'ask_ossinsight' due to the given execution task. + start_task( + execution_task="How many stars microsoft/autogen has on GitHub?", + agent_list=agent_list, + ) + + # Verify that 'ask_ossinsight' was called exactly once during the task execution. + mocked_function.assert_called() + + @pytest.mark.skipif( skip_openai or skip, reason=reason + "OR dependency not installed", @@ -122,10 +236,7 @@ def test_build_from_library(): assert len(agent_config["agent_configs"]) <= builder.max_agents -@pytest.mark.skipif( - skip_openai, - reason=reason, -) +@pytest.mark.skipif(skip_openai, reason=reason) def test_save(): builder = AgentBuilder( config_file_or_env=OAI_CONFIG_LIST, @@ -159,10 +270,7 @@ def test_save(): _config_check(saved_configs) -@pytest.mark.skipif( - skip_openai, - reason=reason, -) +@pytest.mark.skipif(skip_openai, reason=reason) def test_load(): builder = AgentBuilder( config_file_or_env=OAI_CONFIG_LIST, @@ -188,10 +296,7 @@ def test_load(): _config_check(loaded_agent_configs) -@pytest.mark.skipif( - skip_openai, - reason=reason, -) +@pytest.mark.skipif(skip_openai, reason=reason) def test_clear_agent(): builder = AgentBuilder( config_file_or_env=OAI_CONFIG_LIST, @@ -218,6 +323,8 @@ def test_clear_agent(): if __name__ == "__main__": test_build() + test_build_assistant_with_function_calling() + test_build_gpt_assistant_with_function_calling() test_build_from_library() test_save() test_load()