Skip to content

Commit

Permalink
Add headless browser to the WebSurferAgent, closes microsoft#1481
Browse files Browse the repository at this point in the history
  • Loading branch information
vijaykramesh committed Feb 5, 2024
1 parent 5b217c9 commit b8e400d
Show file tree
Hide file tree
Showing 4 changed files with 208 additions and 40 deletions.
28 changes: 17 additions & 11 deletions autogen/agentchat/contrib/web_surfer.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,23 @@
import json
import copy
import copy
import logging
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Union, Callable, Literal, Tuple
from autogen import Agent, ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat, OpenAIWrapper
from autogen.browser_utils import SimpleTextBrowser
from autogen.code_utils import content_str
from datetime import datetime
from autogen.token_count_utils import count_token, get_max_token_limit
from typing import Dict, List, Optional, Union, Callable, Literal, Tuple

from autogen import Agent, ConversableAgent, AssistantAgent, UserProxyAgent, OpenAIWrapper
from autogen.browser_utils import SimpleTextBrowser, HeadlessChromeBrowser
from autogen.oai.openai_utils import filter_config
from autogen.token_count_utils import count_token, get_max_token_limit

logger = logging.getLogger(__name__)


class WebSurferAgent(ConversableAgent):
"""(In preview) An agent that acts as a basic web surfer that can search the web and visit web pages."""
"""(In preview) An agent that acts as a basic web surfer that can search the web and visit web pages.
Defaults to a simple text-based browser.
Can be configured to use a headless Chrome browser by providing a browser_config dictionary with the key "headless" set to True.
"""

DEFAULT_PROMPT = (
"You are a helpful AI assistant with access to a web browser (via the provided functions). In fact, YOU ARE THE ONLY MEMBER OF YOUR PARTY WITH ACCESS TO A WEB BROWSER, so please help out where you can by performing web searches, navigating pages, and reporting what you find. Today's date is "
Expand Down Expand Up @@ -84,7 +86,11 @@ def __init__(
if browser_config is None:
self.browser = SimpleTextBrowser()
else:
self.browser = SimpleTextBrowser(**browser_config)
headless = browser_config.pop("headless")
if headless:
self.browser = HeadlessChromeBrowser(**browser_config)
else:
self.browser = SimpleTextBrowser(**browser_config)

# Create a copy of the llm_config for the inner monologue agents to use, and set them up with function calling
if llm_config is None: # Nothing to copy
Expand Down Expand Up @@ -214,7 +220,7 @@ def _browser_state():
current_page = self.browser.viewport_current_page
total_pages = len(self.browser.viewport_pages)

header += f"Viewport position: Showing page {current_page+1} of {total_pages}.\n"
header += f"Viewport position: Showing page {current_page + 1} of {total_pages}.\n"
return (header, self.browser.viewport)

def _informational_search(query):
Expand All @@ -225,7 +231,7 @@ def _informational_search(query):
def _navigational_search(query):
self.browser.visit_page(f"bing: {query}")

# Extract the first linl
# Extract the first link
m = re.search(r"\[.*?\]\((http.*?)\)", self.browser.page_content)
if m:
self.browser.visit_page(m.group(1))
Expand Down
150 changes: 142 additions & 8 deletions autogen/browser_utils.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
import json
import io
import mimetypes
import os
import requests
import re
import markdownify
import io
import uuid
import mimetypes
from urllib.parse import urljoin, urlparse

import markdownify
import requests
from bs4 import BeautifulSoup
from dataclasses import dataclass
from typing import Dict, List, Optional, Union, Callable, Literal, Tuple
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

# Optional PDF support
IS_PDF_CAPABLE = False
Expand All @@ -27,8 +28,57 @@
except ModuleNotFoundError:
pass

from abc import ABC, abstractmethod
from typing import Optional, Union, Dict


class AbstractBrowser(ABC):
    """An abstract base class for a web browser.

    Concrete implementations (e.g. a simple text browser or a Selenium-backed
    headless browser) must provide navigation, paging, and content accessors.
    """

    @abstractmethod
    def __init__(
        self,
        start_page: Optional[str] = "about:blank",
        viewport_size: Optional[int] = 1024 * 8,
        downloads_folder: Optional[str] = None,
        bing_api_key: Optional[str] = None,
        request_kwargs: Optional[Dict] = None,
    ):
        """Initialize the browser.

        Args:
            start_page: URI loaded when the browser starts.
            viewport_size: Size of one viewport "page"; applies to the
                standard URI types (text-based implementations).
            downloads_folder: Where downloaded files are stored, if supported.
            bing_api_key: API key used for `bing:` search addresses, if supported.
            request_kwargs: Extra keyword arguments forwarded to HTTP requests,
                if the implementation performs any.
        """
        pass

    @property
    @abstractmethod
    def address(self) -> str:
        """The URI of the currently loaded page."""
        pass

    @abstractmethod
    def set_address(self, uri_or_path: str) -> None:
        """Navigate to the given URI or path (implementations may treat a
        `bing:` prefix as a web search)."""
        pass

    @property
    @abstractmethod
    def viewport(self) -> str:
        """The content of the current viewport (the visible slice of the page)."""
        pass

    @property
    @abstractmethod
    def page_content(self) -> str:
        """The full text content of the current page."""
        pass

    @abstractmethod
    def page_down(self) -> None:
        """Move the viewport one page down."""
        pass

    @abstractmethod
    def page_up(self) -> None:
        """Move the viewport one page up."""
        pass

    @abstractmethod
    def visit_page(self, path_or_uri: str) -> None:
        """Visit the given page, updating browser state accordingly."""
        pass


class SimpleTextBrowser:
class SimpleTextBrowser(AbstractBrowser):
"""(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""

def __init__(
Expand Down Expand Up @@ -281,3 +331,87 @@ def _fetch_page(self, url):
except requests.exceptions.RequestException as e:
self.page_title = "Error"
self._set_page_content(str(e))


class HeadlessChromeBrowser(AbstractBrowser):
    """(In preview) A Selenium powered headless Chrome browser. Suitable for Agentic use."""

    def __init__(
        self,
        start_page: Optional[str] = "about:blank",
        viewport_size: Optional[int] = 1024 * 8,
        downloads_folder: Optional[str] = None,
        bing_api_key: Optional[str] = None,
        request_kwargs: Optional[Dict] = None,
    ):
        """Start a headless Chrome instance and load `start_page`.

        Args:
            start_page: URI loaded when the browser starts.
            viewport_size: Applies only to the standard uri types.
            downloads_folder: Where downloaded files are stored (not used by
                this implementation yet).
            bing_api_key: Stored for interface parity; `bing:` addresses are
                served by driving bing.com directly, not the API.
            request_kwargs: Stored for interface parity; not used by Selenium.
        """
        self.start_page = start_page
        self.driver = None
        self.viewport_size = viewport_size  # Applies only to the standard uri types
        self.downloads_folder = downloads_folder
        self.history = list()  # every address visited via set_address()
        self.page_title = None
        # NOTE(review): viewport_current_page/viewport_pages are initialized for
        # interface parity with SimpleTextBrowser but are never updated by this
        # implementation -- callers paging through viewports get 0 pages. TODO confirm.
        self.viewport_current_page = 0
        self.viewport_pages = list()
        self.bing_api_key = bing_api_key
        self.request_kwargs = request_kwargs

        self._start_browser()

    def _start_browser(self):
        """Launch headless Chrome and navigate to the configured start page."""
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.get(self.start_page)

    def close(self):
        """Shut down the underlying WebDriver and its Chrome process."""
        if self.driver is not None:
            try:
                self.driver.quit()
            finally:
                self.driver = None

    def __del__(self):
        # Best-effort cleanup so abandoned instances don't leak Chrome processes.
        try:
            self.close()
        except Exception:
            pass

    @property
    def address(self) -> str:
        """The URL of the page currently loaded in the driver."""
        return self.driver.current_url

    def set_address(self, uri_or_path):
        """Navigate to a URI, treating a `bing:` prefix as a Bing search."""
        self.history.append(uri_or_path)  # fix: history was created but never recorded
        if uri_or_path.startswith("bing:"):
            self._bing_search(uri_or_path[len("bing:") :].strip())
        else:
            self.driver.get(uri_or_path)

    @property
    def viewport(self) -> str:
        # Returns the content of the current viewport (currently the whole
        # processed page; see the viewport_pages note in __init__).
        return self.page_content

    @property
    def page_content(self) -> str:
        """The processed text of the current page's <body>."""
        html = self.driver.execute_script("return document.body.innerHTML;")
        return self._process_html(html)

    def _process_html(self, html: str) -> str:
        """Process the raw HTML content and return the processed text."""
        soup = BeautifulSoup(html, "html.parser")

        # Remove javascript and style blocks
        for script in soup(["script", "style"]):
            script.extract()

        # Convert to text
        text = soup.get_text()

        # Remove excessive blank lines
        text = re.sub(r"\n{2,}", "\n\n", text).strip()

        return text

    def _bing_search(self, query):
        """Drive bing.com's search box directly (does not use the Bing API)."""
        self.driver.get("https://www.bing.com")

        search_bar = self.driver.find_element(By.NAME, "q")
        search_bar.clear()
        search_bar.send_keys(query)
        search_bar.submit()

    def page_down(self):
        # fix: scroll by one viewport instead of jumping to the document end,
        # matching the "one page down" contract of AbstractBrowser.
        self.driver.execute_script("window.scrollBy(0, window.innerHeight);")

    def page_up(self):
        # fix: scroll by one viewport instead of jumping to the document top.
        self.driver.execute_script("window.scrollBy(0, -window.innerHeight);")

    def visit_page(self, path_or_uri):
        """Visit the given page (delegates to set_address)."""
        self.set_address(path_or_uri)
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
"teachable": ["chromadb"],
"lmm": ["replicate", "pillow"],
"graphs": ["networkx~=3.2.1", "matplotlib~=3.8.1"],
"websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate"],
"websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate", "selenium"],
"redis": ["redis"],
},
classifiers=[
Expand Down
68 changes: 48 additions & 20 deletions test/agentchat/contrib/test_web_surfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -110,32 +110,34 @@ def test_web_surfer_oai():
assert len(llm_config["config_list"]) > 0
assert len(summarizer_llm_config["config_list"]) > 0

page_size = 4096
web_surfer = WebSurferAgent(
"web_surfer",
llm_config=llm_config,
summarizer_llm_config=summarizer_llm_config,
browser_config={"viewport_size": page_size},
)
# run the test with both text and headless browsers
for useHeadlessBrowser in [False, True]:
page_size = 4096
web_surfer = WebSurferAgent(
"web_surfer",
llm_config=llm_config,
summarizer_llm_config=summarizer_llm_config,
browser_config={"viewport_size": page_size, "headless": useHeadlessBrowser},
)

user_proxy = UserProxyAgent(
"user_proxy",
human_input_mode="NEVER",
code_execution_config=False,
default_auto_reply="",
is_termination_msg=lambda x: True,
)
user_proxy = UserProxyAgent(
"user_proxy",
human_input_mode="NEVER",
code_execution_config=False,
default_auto_reply="",
is_termination_msg=lambda x: True,
)

# Make some requests that should test function calling
user_proxy.initiate_chat(web_surfer, message="Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'")
# Make some requests that should test function calling
user_proxy.initiate_chat(web_surfer, message="Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'")

user_proxy.initiate_chat(web_surfer, message="Please scroll down.")
user_proxy.initiate_chat(web_surfer, message="Please scroll down.")

user_proxy.initiate_chat(web_surfer, message="Please scroll up.")
user_proxy.initiate_chat(web_surfer, message="Please scroll up.")

user_proxy.initiate_chat(web_surfer, message="When was it founded?")
user_proxy.initiate_chat(web_surfer, message="When was it founded?")

user_proxy.initiate_chat(web_surfer, message="What's this page about?")
user_proxy.initiate_chat(web_surfer, message="What's this page about?")


@pytest.mark.skipif(
Expand Down Expand Up @@ -165,6 +167,32 @@ def test_web_surfer_bing():
assert "Address: https://en.wikipedia.org/wiki/" in response


@pytest.mark.skipif(
    skip_bing,
    reason="do not run if bing api key is not available",
)
def test_web_surfer_headless_bing():
    """Integration test of WebSurferAgent's headless Chrome backend against live Bing.

    Builds a WebSurferAgent with browser_config["headless"] set to True, then calls
    its registered search functions directly and checks the resulting page address
    and content. Skipped when the Bing API key is unavailable; requires Chrome and
    network access at runtime.
    """
    page_size = 4096
    web_surfer = WebSurferAgent(
        "web_surfer",
        llm_config=False,
        browser_config={"viewport_size": page_size, "headless": True},
    )

    # Sneak a peek at the function map, allowing us to call the functions for testing here
    function_map = web_surfer._user_proxy._function_map

    # Test informational queries
    response = function_map["informational_web_search"](BING_QUERY)
    assert "Address: https://www.bing.com/search?q=Microsoft&form=QBLH" in response
    assert "Microsoft – Cloud, Computers, Apps & Gaming" in response

    # Test navigational queries
    response = function_map["navigational_web_search"](BING_QUERY + " Wikipedia")
    assert "Address: https://www.bing.com/search?q=Microsoft+Wikipedia&form=QBLH" in response
    assert "Microsoft - Wikipedia" in response


if __name__ == "__main__":
"""Runs this file's tests from the command line."""
test_web_surfer()
Expand Down

0 comments on commit b8e400d

Please sign in to comment.