Add a web surfer agent that can search and browse the web. (#1093)

* Initial commit of WebSurfer. Adds the browser_utils, and related tests. WebSurfer will be added in a subsequent commit.

* Added the web surfer agent, and related tests.

* Added a notebook to show how WebSurferAgent works.

* Fixed a typo.

* Updated test_web_surfer for compatibility with #1110.

* Updated skip_oai logic.

* Fixed code formatting.

* More pre-commit fixes.

* Added block to contrib-openai.yml

* Added block to contrib-openai.yml

* Added hook for BING_API_KEY

* Temporarily commented out other tests, per request.

* Fixed indentation (maybe?)

* Restoring contrib-openai.yml
This commit is contained in:
afourney 2024-01-21 19:43:15 -08:00 committed by GitHub
parent ca56782a7f
commit 708eb4d884
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 1695 additions and 0 deletions

View File

@ -217,3 +217,42 @@ jobs:
with:
file: ./coverage.xml
flags: unittests
# Live WebSurfer test job: runs test_web_surfer.py against real OpenAI/Azure/Bing
# endpoints. Single OS + Python version to limit API spend; requires the secrets
# configured in the 'openai1' environment.
# NOTE(review): YAML nesting was lost in this view; keys below follow the file's
# original indentation.
WebSurfer:
strategy:
matrix:
os: [ubuntu-latest]
python-version: ["3.11"]
runs-on: ${{ matrix.os }}
environment: openai1
steps:
# checkout to pr branch
- name: Checkout
uses: actions/checkout@v3
with:
ref: ${{ github.event.pull_request.head.sha }}
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install packages and dependencies
run: |
docker --version
python -m pip install --upgrade pip wheel
pip install -e .[websurfer]
python -c "import autogen"
pip install coverage pytest
# Runs the live suite under coverage; all API keys come from repo secrets.
- name: Coverage
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }}
OAI_CONFIG_LIST: ${{ secrets.OAI_CONFIG_LIST }}
BING_API_KEY: ${{ secrets.BING_API_KEY }}
run: |
coverage run -a -m pytest test/agentchat/contrib/test_web_surfer.py
coverage xml
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
with:
file: ./coverage.xml
flags: unittests

View File

@ -196,6 +196,49 @@ jobs:
file: ./coverage.xml
flags: unittests
# Offline WebSurfer test job: runs browser_utils + web_surfer tests with
# --skip-openai across the full OS/Python matrix (no API keys needed).
# NOTE(review): YAML nesting was lost in this view; keys below follow the file's
# original indentation.
WebSurfer:
runs-on: ${{ matrix.os }}
strategy:
fail-fast: false
matrix:
os: [ubuntu-latest, macos-latest, windows-2019]
python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: ${{ matrix.python-version }}
- name: Install packages and dependencies for all tests
run: |
python -m pip install --upgrade pip wheel
pip install pytest
- name: Install packages and dependencies for WebSurfer
run: |
pip install -e .[websurfer]
# Docker-based code execution is only available on the Linux runner.
- name: Set AUTOGEN_USE_DOCKER based on OS
shell: bash
run: |
if [[ ${{ matrix.os }} != ubuntu-latest ]]; then
echo "AUTOGEN_USE_DOCKER=False" >> $GITHUB_ENV
fi
- name: Test WebSurfer
if: matrix.python-version != '3.10' # diversify the python versions
run: |
pytest test/test_browser_utils.py test/agentchat/contrib/test_web_surfer.py --skip-openai
- name: Coverage
if: matrix.python-version == '3.10'
run: |
pip install coverage>=5.3
coverage run -a -m pytest test/test_browser_utils.py test/agentchat/contrib/test_web_surfer.py --skip-openai
coverage xml
- name: Upload coverage to Codecov
if: matrix.python-version == '3.10'
uses: codecov/codecov-action@v3
with:
file: ./coverage.xml
flags: unittests
LMMTest:
runs-on: ${{ matrix.os }}
strategy:

View File

@ -0,0 +1,356 @@
import json
import copy
import logging
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Union, Callable, Literal, Tuple
from autogen import Agent, ConversableAgent, AssistantAgent, UserProxyAgent, GroupChatManager, GroupChat, OpenAIWrapper
from autogen.browser_utils import SimpleTextBrowser
from autogen.code_utils import content_str
from datetime import datetime
from autogen.token_count_utils import count_token, get_max_token_limit
from autogen.oai.openai_utils import filter_config
logger = logging.getLogger(__name__)
class WebSurferAgent(ConversableAgent):
    """(In preview) An agent that acts as a basic web surfer that can search the web and visit web pages.

    The agent keeps an "inner monologue": a private AssistantAgent/UserProxyAgent pair
    that drives a SimpleTextBrowser through OpenAI function calls, then relays the final
    result back to the outer conversation.
    """

    DEFAULT_PROMPT = (
        "You are a helpful AI assistant with access to a web browser (via the provided functions). In fact, YOU ARE THE ONLY MEMBER OF YOUR PARTY WITH ACCESS TO A WEB BROWSER, so please help out where you can by performing web searches, navigating pages, and reporting what you find. Today's date is "
        + datetime.now().date().isoformat()
    )

    DEFAULT_DESCRIPTION = "A helpful assistant with access to a web browser. Ask them to perform web searches, open pages, navigate to Wikipedia, answer questions from pages, and or generate summaries."

    def __init__(
        self,
        name,
        system_message: Optional[Union[str, List]] = DEFAULT_PROMPT,
        description: Optional[str] = DEFAULT_DESCRIPTION,
        is_termination_msg: Optional[Callable[[Dict], bool]] = None,
        max_consecutive_auto_reply: Optional[int] = None,
        human_input_mode: Optional[str] = "TERMINATE",
        function_map: Optional[Dict[str, Callable]] = None,
        code_execution_config: Optional[Union[Dict, Literal[False]]] = None,
        llm_config: Optional[Union[Dict, Literal[False]]] = None,
        summarizer_llm_config: Optional[Union[Dict, Literal[False]]] = None,
        default_auto_reply: Optional[Union[str, Dict, None]] = "",
        browser_config: Optional[Union[Dict, None]] = None,
    ):
        """Create a WebSurferAgent.

        Args:
            name: Agent name (also used to derive the inner monologue agents' names).
            system_message: System prompt for the inner assistant (defaults to DEFAULT_PROMPT).
            description: Description shown to other agents (defaults to DEFAULT_DESCRIPTION).
            is_termination_msg, max_consecutive_auto_reply, human_input_mode, function_map,
            code_execution_config, default_auto_reply: passed through unchanged to
                ConversableAgent.
            llm_config: LLM configuration for the inner assistant; False disables LLMs.
            summarizer_llm_config: LLM configuration for page summarization / Q&A.
                If None, it is derived from llm_config (preferring large-context
                gpt-3.5-turbo models); False disables summarization entirely.
            browser_config: Keyword arguments forwarded to SimpleTextBrowser.
        """
        super().__init__(
            name=name,
            system_message=system_message,
            description=description,
            is_termination_msg=is_termination_msg,
            max_consecutive_auto_reply=max_consecutive_auto_reply,
            human_input_mode=human_input_mode,
            function_map=function_map,
            code_execution_config=code_execution_config,
            llm_config=llm_config,
            default_auto_reply=default_auto_reply,
        )

        # If the summarizer_llm_config is None, derive it from the llm_config.
        if summarizer_llm_config is None:
            if llm_config is None:  # Nothing to copy
                self.summarizer_llm_config = None
            elif llm_config is False:  # LLMs disabled
                self.summarizer_llm_config = False
            else:  # Create a suitable config
                self.summarizer_llm_config = copy.deepcopy(llm_config)
                if "config_list" in self.summarizer_llm_config:
                    # Prefer cheap, large-context models for summarization.
                    preferred_models = filter_config(
                        self.summarizer_llm_config["config_list"],
                        {"model": ["gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-16k"]},
                    )
                    if len(preferred_models) == 0:
                        logger.warning(
                            "The summarizer did not find the preferred model (gpt-3.5-turbo-16k) in the config list. "
                            "Semantic operations on webpages (summarization or Q&A) might be costly or ineffective."
                        )
                    else:
                        self.summarizer_llm_config["config_list"] = preferred_models
        else:
            self.summarizer_llm_config = summarizer_llm_config

        # Create the summarizer client.
        # BUGFIX: also skip on None -- OpenAIWrapper(**None) raises a TypeError.
        self.summarization_client = None
        if self.summarizer_llm_config is not False and self.summarizer_llm_config is not None:
            self.summarization_client = OpenAIWrapper(**self.summarizer_llm_config)

        # Create the browser.
        if browser_config is None:
            self.browser = SimpleTextBrowser()
        else:
            self.browser = SimpleTextBrowser(**browser_config)

        # Copy the llm_config for the inner monologue agents, and equip them with
        # function-calling schemas for driving the browser.
        if llm_config is None:  # Nothing to copy
            inner_llm_config = None
        elif llm_config is False:  # LLMs disabled
            inner_llm_config = False
        else:
            inner_llm_config = copy.deepcopy(llm_config)
            # BUGFIX: "required" belongs *inside* the "parameters" JSON-Schema object.
            # It was previously emitted as a sibling of "parameters", where the
            # OpenAI function-calling API ignores it.
            inner_llm_config["functions"] = [
                {
                    "name": "informational_web_search",
                    "description": "Perform an INFORMATIONAL web search query then return the search results.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query": {
                                "type": "string",
                                "description": "The informational web search query to perform.",
                            }
                        },
                        "required": ["query"],
                    },
                },
                {
                    "name": "navigational_web_search",
                    "description": "Perform a NAVIGATIONAL web search query then immediately navigate to the top result. Useful, for example, to navigate to a particular Wikipedia article or other known destination. Equivalent to Google's \"I'm Feeling Lucky\" button.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query": {
                                "type": "string",
                                "description": "The navigational web search query to perform.",
                            }
                        },
                        "required": ["query"],
                    },
                },
                {
                    "name": "visit_page",
                    "description": "Visit a webpage at a given URL and return its text.",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "url": {
                                "type": "string",
                                "description": "The relative or absolute url of the webpage to visit.",
                            }
                        },
                        "required": ["url"],
                    },
                },
                {
                    "name": "page_up",
                    "description": "Scroll the viewport UP one page-length in the current webpage and return the new viewport content.",
                    "parameters": {"type": "object", "properties": {}, "required": []},
                },
                {
                    "name": "page_down",
                    "description": "Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content.",
                    "parameters": {"type": "object", "properties": {}, "required": []},
                },
            ]

            # Enable semantic operations only when a summarizer client is available.
            if self.summarization_client is not None:
                inner_llm_config["functions"].append(
                    {
                        "name": "answer_from_page",
                        "description": "Uses AI to read the page and directly answer a given question based on the content.",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "question": {
                                    "type": "string",
                                    "description": "The question to directly answer.",
                                },
                                "url": {
                                    "type": "string",
                                    "description": "[Optional] The url of the page. (Defaults to the current page)",
                                },
                            },
                            "required": ["question"],
                        },
                    }
                )
                inner_llm_config["functions"].append(
                    {
                        "name": "summarize_page",
                        "description": "Uses AI to summarize the content found at a given url. If the url is not provided, the current page is summarized.",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "url": {
                                    "type": "string",
                                    "description": "[Optional] The url of the page to summarize. (Defaults to current page)",
                                },
                            },
                            "required": [],
                        },
                    }
                )

        # Set up the inner monologue: an assistant that decides which browser action to
        # take, and a user proxy that executes it.
        self._assistant = AssistantAgent(
            self.name + "_inner_assistant",
            system_message=system_message,
            llm_config=inner_llm_config,
            is_termination_msg=lambda m: False,
        )

        self._user_proxy = UserProxyAgent(
            self.name + "_inner_user_proxy",
            human_input_mode="NEVER",
            code_execution_config=False,
            default_auto_reply="",
            is_termination_msg=lambda m: False,
        )

        # Helper functions (closures over self.browser).
        def _browser_state():
            # One-line status header (address, title, viewport position) + viewport text.
            header = f"Address: {self.browser.address}\n"
            if self.browser.page_title is not None:
                header += f"Title: {self.browser.page_title}\n"

            current_page = self.browser.viewport_current_page
            total_pages = len(self.browser.viewport_pages)
            header += f"Viewport position: Showing page {current_page+1} of {total_pages}.\n"
            return (header, self.browser.viewport)

        def _informational_search(query):
            self.browser.visit_page(f"bing: {query}")
            header, content = _browser_state()
            return header.strip() + "\n=======================\n" + content

        def _navigational_search(query):
            self.browser.visit_page(f"bing: {query}")

            # Extract the first link and navigate straight to it ("I'm Feeling Lucky").
            m = re.search(r"\[.*?\]\((http.*?)\)", self.browser.page_content)
            if m:
                self.browser.visit_page(m.group(1))

            # Return where we ended up
            header, content = _browser_state()
            return header.strip() + "\n=======================\n" + content

        def _visit_page(url):
            self.browser.visit_page(url)
            header, content = _browser_state()
            return header.strip() + "\n=======================\n" + content

        def _page_up():
            self.browser.page_up()
            header, content = _browser_state()
            return header.strip() + "\n=======================\n" + content

        def _page_down():
            self.browser.page_down()
            header, content = _browser_state()
            return header.strip() + "\n=======================\n" + content

        def _summarize_page(question, url):
            if url is not None and url != self.browser.address:
                self.browser.visit_page(url)

            # We are likely going to need to fix this later, but summarize only as
            # many tokens as fit in the summarizer's context window.
            limit = 4096
            try:
                limit = get_max_token_limit(self.summarizer_llm_config["config_list"][0]["model"])
            # BUGFIX: a config without "config_list" (KeyError) or with an empty list
            # (IndexError) previously escaped this guard and crashed.
            except (ValueError, TypeError, KeyError, IndexError):
                pass  # limit is unknown

            if limit < 16000:
                logger.warning(
                    f"The token limit ({limit}) of the WebSurferAgent.summarizer_llm_config, is below the recommended 16k."
                )

            # Greedily take whole lines until we would exceed the limit (leaving
            # 1024 tokens of headroom for the summary itself).
            buffer = ""
            for line in re.split(r"([\r\n]+)", self.browser.page_content):
                tokens = count_token(buffer + line)
                if tokens + 1024 > limit:  # Leave room for our summary
                    break
                buffer += line

            buffer = buffer.strip()
            if len(buffer) == 0:
                return "Nothing to summarize."

            messages = [
                {
                    "role": "system",
                    "content": "You are a helpful assistant that can summarize long documents to answer questions.",
                }
            ]

            prompt = f"Please summarize the following into one or two paragraphs:\n\n{buffer}"
            if question is not None:
                prompt = f"Please summarize the following into one or two paragraphs with respect to '{question}':\n\n{buffer}"

            messages.append(
                {"role": "user", "content": prompt},
            )

            response = self.summarization_client.create(context=None, messages=messages)
            extracted_response = self.summarization_client.extract_text_or_completion_object(response)[0]
            return str(extracted_response)

        self._user_proxy.register_function(
            function_map={
                "informational_web_search": _informational_search,
                "navigational_web_search": _navigational_search,
                "visit_page": _visit_page,
                "page_up": _page_up,
                "page_down": _page_down,
                "answer_from_page": lambda question=None, url=None: _summarize_page(question, url),
                "summarize_page": lambda question=None, url=None: _summarize_page(None, url),
            }
        )

        # Replace the inherited reply pipeline: surf first, then fall back to the
        # standard code-execution / function-call / termination handlers.
        self._reply_func_list = []
        self.register_reply([Agent, None], WebSurferAgent.generate_surfer_reply)
        self.register_reply([Agent, None], ConversableAgent.generate_code_execution_reply)
        self.register_reply([Agent, None], ConversableAgent.generate_function_call_reply)
        self.register_reply([Agent, None], ConversableAgent.check_termination_and_human_reply)

    def generate_surfer_reply(
        self,
        messages: Optional[List[Dict]] = None,
        sender: Optional[Agent] = None,
        config: Optional[OpenAIWrapper] = None,
    ) -> Tuple[bool, Union[str, Dict, None]]:
        """Generate a reply by delegating the last message to the inner browsing monologue.

        Returns:
            (True, reply): the inner user proxy's reply (e.g., a function-call result)
            when it produced one, otherwise the inner assistant's final message content.
        """
        if messages is None:
            messages = self._oai_messages[sender]

        self._user_proxy.reset()
        self._assistant.reset()

        # Clone all but the last message to give the inner assistant conversational context.
        self._assistant.chat_messages[self._user_proxy] = list(messages[:-1])

        # Remind the agent where it is
        self._user_proxy.send(
            f"Your browser is currently open to the page '{self.browser.page_title}' at the address '{self.browser.address}'.",
            self._assistant,
            request_reply=False,
            silent=True,
        )

        self._user_proxy.send(messages[-1]["content"], self._assistant, request_reply=True, silent=True)
        agent_reply = self._user_proxy.chat_messages[self._assistant][-1]
        proxy_reply = self._user_proxy.generate_reply(
            messages=self._user_proxy.chat_messages[self._assistant], sender=self._assistant
        )

        if proxy_reply == "":  # Was the default reply; surface the assistant's answer instead.
            return True, None if agent_reply is None else agent_reply["content"]
        else:
            return True, None if proxy_reply is None else proxy_reply["content"]

283
autogen/browser_utils.py Normal file
View File

@ -0,0 +1,283 @@
import json
import os
import requests
import re
import markdownify
import io
import uuid
import mimetypes
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from dataclasses import dataclass
from typing import Dict, List, Optional, Union, Callable, Literal, Tuple
# Optional PDF support
IS_PDF_CAPABLE = False
try:
import pdfminer
import pdfminer.high_level
IS_PDF_CAPABLE = True
except ModuleNotFoundError:
pass
# Other optional dependencies
try:
import pathvalidate
except ModuleNotFoundError:
pass
class SimpleTextBrowser:
    """(In preview) An extremely simple text-based web browser comparable to Lynx. Suitable for Agentic use."""

    def __init__(
        self,
        start_page: Optional[str] = "about:blank",
        viewport_size: Optional[int] = 1024 * 8,
        downloads_folder: Optional[Union[str, None]] = None,
        bing_api_key: Optional[Union[str, None]] = None,
        request_kwargs: Optional[Union[Dict, None]] = None,
    ):
        """Create the browser.

        Args:
            start_page: The first page to load (defaults to the empty "about:blank").
            viewport_size: Approximate number of characters per viewport "page"
                (applies only to standard http/https content).
            downloads_folder: Where to save unsupported content types; None disables downloads.
            bing_api_key: Key for the Bing Web Search v7 API (used for "bing:" addresses).
            request_kwargs: Extra keyword arguments forwarded to every requests.get() call.
        """
        self.start_page: str = start_page
        self.viewport_size = viewport_size  # Applies only to the standard uri types
        self.downloads_folder = downloads_folder
        self.history: List[str] = list()
        self.page_title: Optional[str] = None
        self.viewport_current_page: int = 0
        self.viewport_pages: List[Tuple[int, int]] = list()
        # BUGFIX: these attributes must be assigned *before* set_address(), because a
        # "bing:" or http(s) start page triggers _bing_search()/_fetch_page(), which
        # read them; assigning _page_content afterwards also clobbered fetched content.
        self.bing_api_key = bing_api_key
        self.request_kwargs = request_kwargs
        self._page_content: str = ""
        self.set_address(start_page)

    @property
    def address(self) -> str:
        """Return the address of the current page."""
        return self.history[-1]

    def set_address(self, uri_or_path: str) -> None:
        """Navigate to a new address, handling "about:blank", "bing:" queries, and
        http(s)/relative URLs, then reset the viewport to the first page."""
        self.history.append(uri_or_path)

        # Handle special URIs
        if uri_or_path == "about:blank":
            self._set_page_content("")
        elif uri_or_path.startswith("bing:"):
            self._bing_search(uri_or_path[len("bing:") :].strip())
        else:
            if not uri_or_path.startswith("http:") and not uri_or_path.startswith("https:"):
                # Resolve relative paths against the previous address.
                uri_or_path = urljoin(self.address, uri_or_path)
                self.history[-1] = uri_or_path  # Update the address with the fully-qualified path
            self._fetch_page(uri_or_path)

        self.viewport_current_page = 0

    @property
    def viewport(self) -> str:
        """Return the content of the current viewport."""
        bounds = self.viewport_pages[self.viewport_current_page]
        return self.page_content[bounds[0] : bounds[1]]

    @property
    def page_content(self) -> str:
        """Return the full contents of the current page."""
        return self._page_content

    def _set_page_content(self, content: str) -> None:
        """Set the text content of the current page and recompute viewport pagination."""
        # BUGFIX: the original annotation said `-> str`, but nothing is returned.
        self._page_content = content
        self._split_pages()
        if self.viewport_current_page >= len(self.viewport_pages):
            self.viewport_current_page = len(self.viewport_pages) - 1

    def page_down(self) -> None:
        """Advance the viewport one page (clamped to the last page)."""
        self.viewport_current_page = min(self.viewport_current_page + 1, len(self.viewport_pages) - 1)

    def page_up(self) -> None:
        """Move the viewport back one page (clamped to the first page)."""
        self.viewport_current_page = max(self.viewport_current_page - 1, 0)

    def visit_page(self, path_or_uri: str) -> str:
        """Update the address, visit the page, and return the content of the viewport."""
        self.set_address(path_or_uri)
        return self.viewport

    def _split_pages(self) -> None:
        """Partition the page content into (start, end) index pairs of ~viewport_size
        characters each, breaking on whitespace where possible."""
        # Split only regular pages
        if not self.address.startswith("http:") and not self.address.startswith("https:"):
            self.viewport_pages = [(0, len(self._page_content))]
            return

        # Handle empty pages
        if len(self._page_content) == 0:
            self.viewport_pages = [(0, 0)]
            return

        # Break the viewport into pages
        self.viewport_pages = []
        start_idx = 0
        while start_idx < len(self._page_content):
            end_idx = min(start_idx + self.viewport_size, len(self._page_content))
            # Adjust to end on a space
            while end_idx < len(self._page_content) and self._page_content[end_idx - 1] not in [" ", "\t", "\r", "\n"]:
                end_idx += 1
            self.viewport_pages.append((start_idx, end_idx))
            start_idx = end_idx

    def _bing_api_call(self, query: str) -> Dict:
        """Issue a Bing Web Search v7 request for `query` and return the parsed JSON.

        Raises:
            ValueError: if no Bing API key was configured.
            requests.HTTPError: on a non-2xx response.
        """
        # Make sure the key was set
        if self.bing_api_key is None:
            raise ValueError("Missing Bing API key.")

        # Prepare the request parameters
        request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}

        if "headers" not in request_kwargs:
            request_kwargs["headers"] = {}
        request_kwargs["headers"]["Ocp-Apim-Subscription-Key"] = self.bing_api_key

        if "params" not in request_kwargs:
            request_kwargs["params"] = {}
        request_kwargs["params"]["q"] = query
        request_kwargs["params"]["textDecorations"] = False
        request_kwargs["params"]["textFormat"] = "raw"

        request_kwargs["stream"] = False

        # Make the request
        response = requests.get("https://api.bing.microsoft.com/v7.0/search", **request_kwargs)
        response.raise_for_status()
        return response.json()

    def _bing_search(self, query: str) -> None:
        """Run a Bing search and render the results as a markdown list into the page."""
        results = self._bing_api_call(query)

        web_snippets = list()
        idx = 0
        for page in results["webPages"]["value"]:
            idx += 1
            web_snippets.append(f"{idx}. [{page['name']}]({page['url']})\n{page['snippet']}")
            if "deepLinks" in page:
                for dl in page["deepLinks"]:
                    idx += 1
                    web_snippets.append(
                        f"{idx}. [{dl['name']}]({dl['url']})\n{dl['snippet'] if 'snippet' in dl else ''}"
                    )

        news_snippets = list()
        if "news" in results:
            for page in results["news"]["value"]:
                idx += 1
                news_snippets.append(f"{idx}. [{page['name']}]({page['url']})\n{page['description']}")

        self.page_title = f"{query} - Search"

        content = (
            f"A Bing search for '{query}' found {len(web_snippets) + len(news_snippets)} results:\n\n## Web Results\n"
            + "\n\n".join(web_snippets)
        )
        if len(news_snippets) > 0:
            content += "\n\n## News Results:\n" + "\n\n".join(news_snippets)
        self._set_page_content(content)

    def _fetch_page(self, url: str) -> None:
        """Fetch `url` and render it into the page: HTML is converted to markdown
        (with special handling for Wikipedia), plain text and PDF are passed through,
        and other content types are saved to downloads_folder when configured.
        Network errors become an error page rather than raising."""
        try:
            # Prepare the request parameters
            request_kwargs = self.request_kwargs.copy() if self.request_kwargs is not None else {}
            request_kwargs["stream"] = True

            # Send a HTTP request to the URL
            response = requests.get(url, **request_kwargs)
            response.raise_for_status()

            # If the HTTP request returns a status code 200, proceed
            if response.status_code == 200:
                # Normalize the content type to one of the handled values.
                content_type = response.headers.get("content-type", "")
                for ct in ["text/html", "text/plain", "application/pdf"]:
                    if ct in content_type.lower():
                        content_type = ct
                        break

                if content_type == "text/html":
                    # Get the content of the response
                    html = ""
                    for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
                        html += chunk

                    soup = BeautifulSoup(html, "html.parser")

                    # Remove javascript and style blocks
                    for script in soup(["script", "style"]):
                        script.extract()

                    # Convert to markdown -- Wikipedia gets special attention to get a clean version of the page
                    if url.startswith("https://en.wikipedia.org/"):
                        body_elm = soup.find("div", {"id": "mw-content-text"})
                        title_elm = soup.find("span", {"class": "mw-page-title-main"})

                        if body_elm:
                            # What's the title
                            # BUGFIX: soup.title can be None (no <title> tag); fall back to the url.
                            main_title = soup.title.string if soup.title else url
                            if title_elm and len(title_elm) > 0:
                                main_title = title_elm.string
                            webpage_text = (
                                "# " + main_title + "\n\n" + markdownify.MarkdownConverter().convert_soup(body_elm)
                            )
                        else:
                            webpage_text = markdownify.MarkdownConverter().convert_soup(soup)
                    else:
                        webpage_text = markdownify.MarkdownConverter().convert_soup(soup)

                    # Convert newlines
                    webpage_text = re.sub(r"\r\n", "\n", webpage_text)

                    # Remove excessive blank lines
                    # BUGFIX: guard against pages with no <title> tag.
                    self.page_title = soup.title.string if soup.title else None
                    self._set_page_content(re.sub(r"\n{2,}", "\n\n", webpage_text).strip())
                elif content_type == "text/plain":
                    # Get the content of the response
                    plain_text = ""
                    for chunk in response.iter_content(chunk_size=512, decode_unicode=True):
                        plain_text += chunk

                    self.page_title = None
                    self._set_page_content(plain_text)
                elif IS_PDF_CAPABLE and content_type == "application/pdf":
                    pdf_data = io.BytesIO(response.raw.read())
                    self.page_title = None
                    self._set_page_content(pdfminer.high_level.extract_text(pdf_data))
                elif self.downloads_folder is not None:
                    # Try producing a safe filename
                    fname = None
                    try:
                        fname = pathvalidate.sanitize_filename(os.path.basename(urlparse(url).path)).strip()
                    except NameError:
                        pass  # pathvalidate is an optional dependency

                    # No suitable name, so make one
                    # BUGFIX: sanitize_filename(...).strip() can yield "", which also needs a generated name.
                    if not fname:
                        extension = mimetypes.guess_extension(content_type)
                        if extension is None:
                            extension = ".download"
                        fname = str(uuid.uuid4()) + extension

                    # Open a file for writing
                    download_path = os.path.abspath(os.path.join(self.downloads_folder, fname))
                    with open(download_path, "wb") as fh:
                        for chunk in response.iter_content(chunk_size=512):
                            fh.write(chunk)

                    # Return a page describing what just happened
                    self.page_title = "Download complete."
                    self._set_page_content(f"Downloaded '{url}' to '{download_path}'.")
                else:
                    self.page_title = f"Error - Unsupported Content-Type '{content_type}'"
                    self._set_page_content(self.page_title)
            else:
                self.page_title = "Error"
                self._set_page_content("Failed to retrieve " + url)
        except requests.exceptions.RequestException as e:
            self.page_title = "Error"
            self._set_page_content(str(e))

View File

@ -0,0 +1,627 @@
{
"cells": [
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# WebSurferAgent\n",
"\n",
"AutoGen provides a proof-of-concept WebSurferAgent that can command a simple text-based browser (similar to [Lynx](https://en.wikipedia.org/wiki/Lynx_(web_browser))) to search the web, visit pages, navigate within pages, download files, etc. The browsing is stateful, meaning that browsing history, viewport state, and other details are maintained throughout the conversation. \n",
"\n",
"This work was largely inspired by OpenAI's [WebGPT](https://openai.com/research/webgpt) project from December 2021. \n",
"\n",
"## Requirements\n",
"\n",
"AutoGen requires `Python>=3.8`. To run this notebook example, please install AutoGen with the optional `websurfer` dependencies:\n",
"```bash\n",
"pip install \"pyautogen[websurfer]\"\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# %pip install --quiet \"pyautogen[websurfer]\""
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Set your API Endpoint\n",
"\n",
"The [`config_list_from_json`](https://microsoft.github.io/autogen/docs/reference/oai/openai_utils#config_list_from_json) function loads a list of configurations from an environment variable or a json file.\n",
"\n",
"It first looks for environment variable \"OAI_CONFIG_LIST\" which needs to be a valid json string. If that variable is not found, it then looks for a json file named \"OAI_CONFIG_LIST\". It filters the configs by models (you can filter by other keys as well).\n",
"\n",
"The WebSurferAgent uses a combination of models. GPT-4 and GPT-3.5-turbo-16k are recommended.\n",
"\n",
"Your json config should look something like the following:\n",
"```json\n",
"[\n",
" {\n",
" \"model\": \"gpt-4\",\n",
" \"api_key\": \"<your OpenAI API key here>\"\n",
" },\n",
" {\n",
" \"model\": \"gpt-3.5-turbo-16k\",\n",
" \"api_key\": \"<your OpenAI API key here>\"\n",
" }\n",
"]\n",
"```\n",
"\n",
"If you open this notebook in colab, you can upload your files by clicking the file icon on the left panel and then choose \"upload file\" icon.\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import autogen # noqa: E402\n",
"\n",
"llm_config = {\n",
" \"timeout\": 600,\n",
" \"cache_seed\": 44, # change the seed for different trials\n",
" \"config_list\": autogen.config_list_from_json(\n",
" \"OAI_CONFIG_LIST\",\n",
" filter_dict={\"model\": [\"gpt-4\", \"gpt-4-0613\", \"gpt-4-32k\", \"gpt-4-32k-0613\", \"gpt-4-1106-preview\"]},\n",
" ),\n",
" \"temperature\": 0,\n",
"}\n",
"\n",
"summarizer_llm_config = {\n",
" \"timeout\": 600,\n",
" \"cache_seed\": 44, # change the seed for different trials\n",
" \"config_list\": autogen.config_list_from_json(\n",
" \"OAI_CONFIG_LIST\",\n",
" filter_dict={\"model\": [\"gpt-3.5-turbo-1106\", \"gpt-3.5-turbo-16k-0613\", \"gpt-3.5-turbo-16k\"]},\n",
" ),\n",
" \"temperature\": 0,\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Configure Bing\n",
"\n",
"For WebSurferAgent to be reasonably useful, it needs to be able to search the web -- and that means it needs a Bing API key. \n",
"You can read more about how to get an API on the [Bing Web Search API](https://www.microsoft.com/en-us/bing/apis/bing-web-search-api) page.\n",
"\n",
"Once you have your key, either set it as the `BING_API_KEY` system environment variable, or simply input your key below.\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"import os # noqa: E402\n",
"\n",
"bing_api_key = os.environ[\"BING_API_KEY\"]"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Construct Agents\n",
"\n",
"We now create our WebSurferAgent, and a UserProxyAgent to surf the web. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from autogen.agentchat.contrib.web_surfer import WebSurferAgent # noqa: E402\n",
"\n",
"web_surfer = WebSurferAgent(\n",
" \"web_surfer\",\n",
" llm_config=llm_config,\n",
" summarizer_llm_config=summarizer_llm_config,\n",
" browser_config={\"viewport_size\": 4096, \"bing_api_key\": bing_api_key},\n",
")\n",
"\n",
"user_proxy = autogen.UserProxyAgent(\n",
" \"user_proxy\",\n",
" human_input_mode=\"NEVER\",\n",
" code_execution_config=False,\n",
" default_auto_reply=\"\",\n",
" is_termination_msg=lambda x: True,\n",
")"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example 1: Search, summarize\n",
"- Search for information about Microsoft AutoGen\n",
"- Summarize the results\n",
"- Visit the Getting Started Docs page"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33muser_proxy\u001b[0m (to web_surfer):\n",
"\n",
"\n",
"Search the web for information about Microsoft AutoGen\n",
"\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[31m\n",
">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
"\u001b[35m\n",
">>>>>>>> EXECUTING FUNCTION informational_web_search...\u001b[0m\n",
"\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n",
"\n",
"Address: bing: Microsoft AutoGen\n",
"Title: Microsoft AutoGen - Search\n",
"Viewport position: Showing page 1 of 1.\n",
"=======================\n",
"A Bing search for 'Microsoft AutoGen' found 10 results:\n",
"\n",
"## Web Results\n",
"1. [AutoGen: Enabling next-generation large language model applications](https://www.microsoft.com/en-us/research/blog/autogen-enabling-next-generation-large-language-model-applications/)\n",
"AutoGen is a Python package that simplifies the orchestration, optimization, and automation of large language model applications. It enables customizable and conversable agents that integrate with humans, tools, and other agents to solve tasks using GPT-4 and other advanced LLMs. Learn how to use AutoGen for code-based question answering, supply-chain optimization, conversational chess, and more.\n",
"\n",
"2. [GitHub - microsoft/autogen: Enable Next-Gen Large Language Model ...](https://github.com/microsoft/autogen)\n",
"AutoGen is a Python library that enables the development of large language model applications using multiple agents that can converse with each other to solve tasks. It supports various conversation patterns, enhanced LLM inference, and customizable and conversable agents based on OpenAI models.\n",
"\n",
"3. [Getting Started | AutoGen](https://microsoft.github.io/autogen/docs/Getting-Started/)\n",
"AutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. They can operate in various modes that employ combinations of LLMs, human inputs, and tools. Main Features\n",
"\n",
"4. [AutoGen | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/)\n",
"AutoGen is a tool that enables next-gen large language model applications by providing a high-level abstraction for building diverse and enhanced LLM workflows. It offers a collection of working systems for various domains and complexities, as well as enhanced LLM inference and optimization APIs.\n",
"\n",
"5. [AutoGen - Microsoft Research](https://www.microsoft.com/en-us/research/project/autogen/)\n",
"AutoGen is an open-source library for building next-generation LLM applications with multiple agents, teachability and personalization. It supports agents that can be backed by various LLM configurations, code generation and execution, and human proxy agent integration.\n",
"\n",
"6. [Installation | AutoGen](https://microsoft.github.io/autogen/docs/Installation/)\n",
"Installation Setup Virtual Environment When not using a docker container, we recommend using a virtual environment to install AutoGen. This will ensure that the dependencies for AutoGen are isolated from the rest of your system. Option 1: venv You can create a virtual environment with venv as below: python3 -m venv pyautogen\n",
"\n",
"7. [AutoGen: Downloads - Microsoft Research](https://www.microsoft.com/en-us/research/project/autogen/downloads/)\n",
"AutoGen allows developers to build LLM applications via multiple agents that can converse with each other to accomplish tasks.\n",
"\n",
"8. [Multi-agent Conversation Framework | AutoGen - microsoft.github.io](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat/)\n",
"AutoGen offers a unified multi-agent conversation framework as a high-level abstraction of using foundation models. It features capable, customizable and conversable agents which integrate LLMs, tools, and humans via automated agent chat.\n",
"\n",
"9. [[2308.08155] AutoGen: Enabling Next-Gen LLM Applications via Multi ...](https://arxiv.org/abs/2308.08155)\n",
"AutoGen is an open-source framework that allows developers to create and customize agents that can converse with each other to perform tasks using various types of language models (LLMs). The framework supports natural language and code-based conversation patterns, and is effective for diverse applications such as mathematics, coding, question answering, and more.\n",
"\n",
"10. [How to setup and use the new Microsoft AutoGen AI agent](https://www.geeky-gadgets.com/microsoft-autogen/)\n",
"Learn how to use AutoGen, a tool that simplifies the automation and optimization of complex language model applications using multiple agents that can converse with each other. AutoGen supports diverse conversation patterns, human participation, and the tuning of expensive LLMs like ChatGPT and GPT-4.\n",
"\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"source": [
"task1 = \"\"\"\n",
"Search the web for information about Microsoft AutoGen\n",
"\"\"\"\n",
"\n",
"user_proxy.initiate_chat(web_surfer, message=task1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33muser_proxy\u001b[0m (to web_surfer):\n",
"\n",
"Summarize these results\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[31m\n",
">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
"\u001b[35m\n",
">>>>>>>> EXECUTING FUNCTION summarize_page...\u001b[0m\n",
"\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n",
"\n",
"AutoGen is a Python package and framework developed by Microsoft that simplifies the orchestration, optimization, and automation of large language model (LLM) applications. It enables the development of customizable and conversable agents that can solve tasks using advanced LLMs like GPT-4. AutoGen supports various conversation patterns, enhanced LLM inference, and seamless integration with humans, tools, and other agents. It offers a high-level abstraction for building diverse and enhanced LLM workflows and provides a collection of working systems for different domains and complexities. AutoGen is open-source and supports natural language and code-based conversation patterns for applications such as question answering, coding, mathematics, and more.\n",
"\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"source": [
"task2 = \"Summarize these results\"\n",
"user_proxy.initiate_chat(web_surfer, message=task2, clear_history=False)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33muser_proxy\u001b[0m (to web_surfer):\n",
"\n",
"Click the 'Getting Started' result\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[31m\n",
">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
"\u001b[35m\n",
">>>>>>>> EXECUTING FUNCTION navigational_web_search...\u001b[0m\n",
"\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n",
"\n",
"Address: https://microsoft.github.io/autogen/docs/Getting-Started/\n",
"Title: Getting Started | AutoGen\n",
"Viewport position: Showing page 1 of 2.\n",
"=======================\n",
"Getting Started | AutoGen\n",
"\n",
"[Skip to main content](#)[![AutoGen](/autogen/img/ag.svg)![AutoGen](/autogen/img/ag.svg)**AutoGen**](/autogen/)[Docs](/autogen/docs/Getting-Started)[SDK](/autogen/docs/reference/agentchat/conversable_agent)[Blog](/autogen/blog)[FAQ](/autogen/docs/FAQ)[Examples](/autogen/docs/Examples)Resources* [Ecosystem](/autogen/docs/Ecosystem)\n",
"* [Gallery](/autogen/docs/Gallery)\n",
"[GitHub](https://github.com/microsoft/autogen)🌜🌞`ctrl``K`* [Getting Started](/autogen/docs/Getting-Started)\n",
"* [Installation](/autogen/docs/Installation)\n",
"* [Use Cases](#)\n",
"* [Contributing](/autogen/docs/Contribute)\n",
"* [Research](/autogen/docs/Research)\n",
"On this pageGetting Started\n",
"===============\n",
"\n",
"AutoGen is a framework that enables development of LLM applications using multiple agents that can converse with each other to solve tasks. AutoGen agents are customizable, conversable, and seamlessly allow human participation. They can operate in various modes that employ combinations of LLMs, human inputs, and tools.\n",
"\n",
"![AutoGen Overview](/autogen/assets/images/autogen_agentchat-250ca64b77b87e70d34766a080bf6ba8.png)\n",
"\n",
"### Main Features[](#main-features \"Direct link to heading\")\n",
"\n",
"* AutoGen enables building next-gen LLM applications based on [multi-agent conversations](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat) with minimal effort. It simplifies the orchestration, automation, and optimization of a complex LLM workflow. It maximizes the performance of LLM models and overcomes their weaknesses.\n",
"* It supports [diverse conversation patterns](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#supporting-diverse-conversation-patterns) for complex workflows. With customizable and conversable agents, developers can use AutoGen to build a wide range of conversation patterns concerning conversation autonomy,\n",
"the number of agents, and agent conversation topology.\n",
"* It provides a collection of working systems with different complexities. These systems span a [wide range of applications](https://microsoft.github.io/autogen/docs/Use-Cases/agent_chat#diverse-applications-implemented-with-autogen) from various domains and complexities. This demonstrates how AutoGen can easily support diverse conversation patterns.\n",
"* AutoGen provides [enhanced LLM inference](https://microsoft.github.io/autogen/docs/Use-Cases/enhanced_inference#api-unification). It offers utilities like API unification and caching, and advanced usage patterns, such as error handling, multi-config inference, context programming, etc.\n",
"\n",
"AutoGen is powered by collaborative [research studies](/autogen/docs/Research) from Microsoft, Penn State University, and University of Washington.\n",
"\n",
"### Quickstart[](#quickstart \"Direct link to heading\")\n",
"\n",
"Install from pip: `pip install pyautogen`. Find more options in [Installation](/autogen/docs/Installation).\n",
"For [code execution](/autogen/docs/FAQ#code-execution), we strongly recommend installing the python docker package, and using docker.\n",
"\n",
"#### Multi-Agent Conversation Framework[](#multi-agent-conversation-framework \"Direct link to heading\")\n",
"\n",
"Autogen enables the next-gen LLM applications with a generic multi-agent conversation framework. It offers customizable and conversable agents which integrate LLMs, tools, and humans.\n",
"By automating chat among multiple capable agents, one can easily make them collectively perform tasks autonomously or with human feedback, including tasks that require using tools via code. For [example](https://github.com/microsoft/autogen/blob/main/test/twoagent.py),\n",
"\n",
"```\n",
"from autogen import AssistantAgent, UserProxyAgent, config\\_list\\_from\\_json \n",
" \n",
"# Load LLM inference endpoints from an env variable or a file \n",
"# See https://microsoft.github.io/autogen/docs/FAQ#set-your-api-endpoints \n",
"# and OAI\\_CONFIG\\_LIST\\_sample.json \n",
"config\\_list = config\\_list\\_from\\_json(env\\_or\\_file=\"OAI\\_CONFIG\\_LIST\") \n",
"assistant = AssistantAgent(\"assistant\", llm\\_config={\"config\\_list\": config\\_list}) \n",
"user\\_proxy = UserProxyAgent(\"user\\_proxy\", code\\_execution\\_config={\"work\\_dir\": \"coding\"}) \n",
"user\\_proxy.initiate\\_chat(assistant, \n",
"\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"source": [
"task3 = \"Click the 'Getting Started' result\"\n",
"user_proxy.initiate_chat(web_surfer, message=task3, clear_history=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Example 2: Navigational search, scroll, answer questions\n",
"- Search for Microsoft's Wikipedia page, then navigate to it"\n",
"- Scroll down\n",
"- Answer questions about the content"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33muser_proxy\u001b[0m (to web_surfer):\n",
"\n",
"Find Microsoft's Wikipedia page.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[31m\n",
">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
"\u001b[35m\n",
">>>>>>>> EXECUTING FUNCTION navigational_web_search...\u001b[0m\n",
"\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n",
"\n",
"Address: https://en.wikipedia.org/wiki/Microsoft\n",
"Title: Microsoft - Wikipedia\n",
"Viewport position: Showing page 1 of 64.\n",
"=======================\n",
"# Microsoft\n",
"\n",
"American multinational technology corporation\n",
"\n",
"Microsoft Corporation| [A square divided into four sub-squares, colored red-orange, green, yellow and blue (clockwise), with the company name appearing to its right](/wiki/File:Microsoft_logo_(2012).svg) |\n",
"| Building 92 on the [Microsoft Redmond campus](/wiki/Microsoft_Redmond_campus \"Microsoft Redmond campus\") |\n",
"| Type | [Public](/wiki/Public_company \"Public company\") |\n",
"| [Traded as](/wiki/Ticker_symbol \"Ticker symbol\") | * [Nasdaq](/wiki/Nasdaq \"Nasdaq\"): [MSFT](https://www.nasdaq.com/market-activity/stocks/msft)\n",
"* [Nasdaq-100](/wiki/Nasdaq-100 \"Nasdaq-100\") component\n",
"* [DJIA](/wiki/Dow_Jones_Industrial_Average \"Dow Jones Industrial Average\") component\n",
"* [S&P 100](/wiki/S%26P_100 \"S&P 100\") component\n",
"* [S&P 500](/wiki/S%26P_500 \"S&P 500\") component\n",
" |\n",
"| [ISIN](/wiki/International_Securities_Identification_Number \"International Securities Identification Number\") | [US5949181045](https://isin.toolforge.org/?language=en&isin=US5949181045) |\n",
"| Industry | [Information technology](/wiki/Information_technology \"Information technology\") |\n",
"| Founded | April 4, 1975; 48 years ago (1975-04-04) in [Albuquerque, New Mexico](/wiki/Albuquerque,_New_Mexico \"Albuquerque, New Mexico\"), U.S. |\n",
"| Founders | * [Bill Gates](/wiki/Bill_Gates \"Bill Gates\")\n",
"* [Paul Allen](/wiki/Paul_Allen \"Paul Allen\")\n",
" |\n",
"| Headquarters | [One Microsoft Way](/wiki/Microsoft_campus \"Microsoft campus\")[Redmond, Washington](/wiki/Redmond,_Washington \"Redmond, Washington\"), U.S. |\n",
"| Area served | Worldwide |\n",
"| Key people | * [Satya Nadella](/wiki/Satya_Nadella \"Satya Nadella\")([Chairman](/wiki/Chairman \"Chairman\") & [CEO](/wiki/Chief_executive_officer \"Chief executive officer\"))\n",
"* [Brad Smith](/wiki/Brad_Smith_(American_lawyer) \"Brad Smith (American lawyer)\")([Vice Chairman](/wiki/Vice-Chairman \"Vice-Chairman\") & [President](/wiki/President_(corporate_title) \"President (corporate title)\"))\n",
"* [Bill Gates](/wiki/Bill_Gates \"Bill Gates\")([technical adviser](/wiki/Adviser \"Adviser\"))\n",
" |\n",
"| Products | * [Software development](/wiki/Software_development \"Software development\")\n",
"* [Computer hardware](/wiki/Computer_hardware \"Computer hardware\")\n",
"* [Consumer electronics](/wiki/Consumer_electronics \"Consumer electronics\")\n",
"* [Social networking service](/wiki/Social_networking_service \"Social networking service\")\n",
"* [Cloud computing](/wiki/Cloud_computing \"Cloud computing\")\n",
"* [Video games](/wiki/Video_game_industry \"Video game industry\")\n",
"* [Internet](/wiki/Internet \"Internet\")\n",
"* [Corporate venture capital](/wiki/Corporate_venture_capital \"Corporate venture capital\")\n",
" |\n",
"| Brands | \n",
"* [Windows](/wiki/Microsoft_Windows \"Microsoft Windows\")\n",
"* [Microsoft 365](/wiki/Microsoft_365 \"Microsoft 365\")\n",
"* [Skype](/wiki/Skype \"Skype\")\n",
"* [Visual Studio](/wiki/Visual_Studio \"Visual Studio\")\n",
"* [Xbox](/wiki/Xbox \"Xbox\")\n",
"* [Dynamics](/wiki/Microsoft_Dynamics_365 \"Microsoft Dynamics 365\")\n",
"* [Surface](/wiki/Microsoft_Surface \"Microsoft Surface\")\n",
"\n",
" |\n",
"| Services | \n",
"* [Edge](/wiki/Microsoft_Edge \"Microsoft Edge\")\n",
"* [Azure](/wiki/Microsoft_Azure \"Microsoft Azure\")\n",
"* [Bing](/wiki/Microsoft_Bing \"Microsoft Bing\")\n",
"* [LinkedIn](/wiki/LinkedIn \"LinkedIn\")\n",
"* [Yammer](/wiki/Yammer \"Yammer\")\n",
"* [Microsoft 365](/wiki/Microsoft_365 \"Microsoft 365\")\n",
"* [OneDrive](/wiki/OneDrive \"OneDrive\")\n",
"* [Outlook](/wiki/Microsoft_Outlook \"Microsoft Outlook\")\n",
"* [GitHub](/wiki/GitHub \"GitHub\")\n",
"* [Microsoft Store](/wiki/Microsoft_Store_(digital) \"Microsoft Store (digital)\")\n",
"* [Windows Update](/wiki/Windows_Update \"Windows Update\")\n",
"* [Xbox Game Pass](/wiki/Xbox_Game_Pass \"Xbox Game Pass\")\n",
"* [Xbox network](/wiki/Xbox_network \"Xbox network\")\n",
"\n",
" |\n",
"| Revenue | Increase [US$](/wiki/United_States_dollar \"United States dollar\")211.9 billion (2023) |\n",
"| [Operating income](/wiki/Earnings_before_interest_and_taxes \"Earnings before interest and taxes\") | Increase US$88.5 billion (2023) |\n",
"| [Net income](/wiki/Net_income \"Net income\") | Increase US$73.4 billion (2023) |\n",
"| [Total assets](/wiki/Asset \"Asset\") | Increase US$411.9 billion (2023) |\n",
"| [Total equity](/wiki/Equity_(finance) \"Equity \n",
"\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"source": [
"task4 = \"\"\"Find Microsoft's Wikipedia page.\"\"\"\n",
"user_proxy.initiate_chat(web_surfer, message=task4, clear_history=False)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33muser_proxy\u001b[0m (to web_surfer):\n",
"\n",
"Scroll down.\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[31m\n",
">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
"\u001b[35m\n",
">>>>>>>> EXECUTING FUNCTION page_down...\u001b[0m\n",
"\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n",
"\n",
"Address: https://en.wikipedia.org/wiki/Microsoft\n",
"Title: Microsoft - Wikipedia\n",
"Viewport position: Showing page 2 of 64.\n",
"=======================\n",
"(finance)\") | Increase US$206.2 billion (2023) |\n",
"| Number of employees | 238,000 (2023) |\n",
"| [Divisions](/wiki/Division_(business) \"Division (business)\") | \n",
"* [Microsoft Engineering Groups](/wiki/Microsoft_engineering_groups \"Microsoft engineering groups\")\n",
"* [Microsoft Digital Crimes Unit](/wiki/Microsoft_Digital_Crimes_Unit \"Microsoft Digital Crimes Unit\")\n",
"* [Microsoft Press](/wiki/Microsoft_Press \"Microsoft Press\")\n",
"* [Microsoft Japan](/wiki/Microsoft_Japan \"Microsoft Japan\")\n",
"* [Microsoft Gaming](/wiki/Microsoft_Gaming \"Microsoft Gaming\")\n",
"\n",
" |\n",
"| [Subsidiaries](/wiki/Subsidiary \"Subsidiary\") | \n",
"* [GitHub](/wiki/GitHub \"GitHub\")\n",
"* [LinkedIn](/wiki/LinkedIn \"LinkedIn\")\n",
"* [Metaswitch](/wiki/Metaswitch \"Metaswitch\")\n",
"* [Nuance Communications](/wiki/Nuance_Communications \"Nuance Communications\")\n",
"* [RiskIQ](/wiki/RiskIQ \"RiskIQ\")\n",
"* [Skype Technologies](/wiki/Skype_Technologies \"Skype Technologies\")\n",
"* [OpenAI](/wiki/OpenAI \"OpenAI\") (49%)[[1]](#cite_note-1)\n",
"* [Xamarin](/wiki/Xamarin \"Xamarin\")\n",
"* [Xandr](/wiki/Xandr \"Xandr\")\n",
"\n",
" |\n",
"| |\n",
"| [ASN](/wiki/Autonomous_System_Number \"Autonomous System Number\") | * [8075](https://bgp.tools/as/8075)\n",
" |\n",
"| |\n",
"| Website | [microsoft.com](https://www.microsoft.com/) |\n",
"| **Footnotes / references**Financials as of June 30, 2023[[update]](https://en.wikipedia.org/w/index.php?title=Microsoft&action=edit)[[2]](#cite_note-2) |\n",
"\n",
"| | | |\n",
"| --- | --- | --- |\n",
"| \n",
"\n",
"| | |\n",
"| --- | --- |\n",
"| [Bill Gates in 2023](/wiki/File:Bill_Gates_2017_(cropped).jpg) | This article is part of a series about\n",
"[Bill Gates](/wiki/Bill_Gates \"Bill Gates\") |\n",
"\n",
" |\n",
"| * [Awards and honors](/wiki/Bill_Gates#Recognition \"Bill Gates\")\n",
"* [Philanthropy](/wiki/Bill_Gates#Philanthropy \"Bill Gates\")\n",
"* [Political positions](/wiki/Bill_Gates#Political_positions \"Bill Gates\")\n",
"* [Public image](/wiki/Bill_Gates#Public_image \"Bill Gates\")\n",
"* [Residence](/wiki/Bill_Gates%27s_house \"Bill Gates's house\")\n",
"\n",
"---\n",
"\n",
"Companies* [Traf-O-Data](/wiki/Traf-O-Data \"Traf-O-Data\")\n",
"* Microsoft ([criticism](/wiki/Criticism_of_Microsoft \"Criticism of Microsoft\"))\n",
"* [BEN](/wiki/Branded_Entertainment_Network \"Branded Entertainment Network\")\n",
"* [Cascade Investment](/wiki/Cascade_Investment \"Cascade Investment\")\n",
"* [TerraPower](/wiki/TerraPower \"TerraPower\")\n",
"* [Gates Ventures](/wiki/Gates_Ventures \"Gates Ventures\")\n",
"\n",
"---\n",
"\n",
"Charitable organizations* [Bill & Melinda Gates Foundation](/wiki/Bill_%26_Melinda_Gates_Foundation \"Bill & Melinda Gates Foundation\")\n",
"* [Match for Africa](/wiki/Match_for_Africa \"Match for Africa\")\n",
"* [The Giving Pledge](/wiki/The_Giving_Pledge \"The Giving Pledge\")\n",
"* [OER Project](/wiki/OER_Project \"OER Project\")\n",
"* [Breakthrough Energy](/wiki/Breakthrough_Energy \"Breakthrough Energy\")\n",
"* [Mission Innovation](/wiki/Mission_Innovation \"Mission Innovation\")\n",
"\n",
"---\n",
"\n",
"Writings* \"[An Open Letter to Hobbyists](/wiki/An_Open_Letter_to_Hobbyists \"An Open Letter to Hobbyists\")\"\n",
"* *[The Road Ahead](/wiki/The_Road_Ahead_(Gates_book) \"The Road Ahead (Gates book)\")*\n",
"* *[Business @ the Speed of Thought](/wiki/Business_@_the_Speed_of_Thought \"Business @ the Speed of Thought\")*\n",
"* *[How to Avoid a Climate Disaster](/wiki/How_to_Avoid_a_Climate_Disaster \"How to Avoid a Climate Disaster\")*\n",
"* *[How to Prevent the Next Pandemic](/wiki/How_to_Prevent_the_Next_Pandemic \"How to Prevent the Next Pandemic\")*\n",
"\n",
"---\n",
"\n",
"Related* [Bill Gates' flower fly](/wiki/Bill_Gates%27_flower_fly \"Bill Gates' flower fly\")\n",
"* [Codex Leicester](/wiki/Codex_Leicester \"Codex Leicester\")\n",
"* *[Lost on the Grand Banks](/wiki/Lost_on_the_Grand_Banks \"Lost on the Grand Banks\")*\n",
"* [History of Microsoft](/wiki/History_of_Microsoft \"History of Microsoft\")\n",
"* [Timeline of Microsoft](/wiki/Timeline_of_Microsoft \"Timeline of Microsoft\")\n",
"* [Paul Allen](/wiki/Paul_Allen \"Paul Allen\")\n",
"\n",
"---\n",
"\n",
" |\n",
"| * [v](/wiki/Template:Bill_Gates_series \"Template:Bill Gates series\")\n",
"* [t](/wiki/Template_talk:Bill_Gates_series \"Template talk:Bill Gates series\")\n",
"* [e](/wiki/Special:EditPage/Template:Bill_Gates_series \"Special:EditPage/Template:Bill Gates series\")\n",
" |\n",
"\n",
"**Microsoft Corporation** is an American multinational [technology corporation](/wiki/Technology_company \n",
"\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"source": [
"task5 = \"\"\"Scroll down.\"\"\"\n",
"user_proxy.initiate_chat(web_surfer, message=task5, clear_history=False)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[33muser_proxy\u001b[0m (to web_surfer):\n",
"\n",
"Where was the first office location, and when did they move to Redmond?\n",
"\n",
"--------------------------------------------------------------------------------\n",
"\u001b[31m\n",
">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
"\u001b[35m\n",
">>>>>>>> EXECUTING FUNCTION answer_from_page...\u001b[0m\n",
"\u001b[33mweb_surfer\u001b[0m (to user_proxy):\n",
"\n",
"Microsoft's first office location was in Albuquerque, New Mexico, where it was founded on April 4, 1975. However, Microsoft later moved its headquarters to Redmond, Washington in January 1979. Since then, Redmond has been the main office location for Microsoft.\n",
"\n",
"--------------------------------------------------------------------------------\n"
]
}
],
"source": [
"task6 = \"\"\"Where was the first office location, and when did they move to Redmond?\"\"\"\n",
"user_proxy.initiate_chat(web_surfer, message=task6, clear_history=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -52,6 +52,7 @@ setuptools.setup(
"teachable": ["chromadb"],
"lmm": ["replicate", "pillow"],
"graphs": ["networkx~=3.2.1", "matplotlib~=3.8.1"],
"websurfer": ["beautifulsoup4", "markdownify", "pdfminer.six", "pathvalidate"],
"redis": ["redis"],
},
classifiers=[

View File

@ -0,0 +1,172 @@
import os
import sys
import re
import pytest
from autogen import ConversableAgent, UserProxyAgent, config_list_from_json
from autogen.oai.openai_utils import filter_config

# Make the shared test helpers importable regardless of where pytest is invoked from.
sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
from conftest import skip_openai  # noqa: E402

sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
from test_assistant_agent import KEY_LOC, OAI_CONFIG_LIST  # noqa: E402

# Fixed remote fixtures used by the navigation tests below.
# NOTE(review): these depend on live, externally-hosted content staying stable — the
# tests will break if the blog post title or URL changes upstream.
BLOG_POST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen"
BING_QUERY = "Microsoft"

# Skip every test in this file if the optional websurfer dependencies are not installed.
try:
    from autogen.agentchat.contrib.web_surfer import WebSurferAgent
except ImportError:
    skip_all = True
else:
    skip_all = False

# Skip the OpenAI-backed test if the openai package is missing, or if the suite was
# launched in skip-openai mode (via conftest's skip_openai flag).
try:
    from openai import OpenAI
except ImportError:
    skip_oai = True
else:
    skip_oai = False or skip_openai  # "False or" is redundant; equivalent to skip_openai

# Skip Bing-backed tests unless a Bing API key is available in the environment.
try:
    BING_API_KEY = os.environ["BING_API_KEY"]
except KeyError:
    skip_bing = True
else:
    skip_bing = False

# Only resolve an OAI config list when the OpenAI-backed test will actually run,
# so key-less environments can still run the offline tests.
if not skip_oai:
    config_list = config_list_from_json(env_or_file=OAI_CONFIG_LIST, file_location=KEY_LOC)
@pytest.mark.skipif(
    skip_all,
    reason="do not run if dependency is not installed",
)
def test_web_surfer():
    """Exercise WebSurferAgent's browsing functions without any LLM or Bing key.

    Drives the agent's registered function map directly (navigation and scrolling),
    then confirms the search / Q&A / summarization paths fail fast with the expected
    errors when their credentials or LLM configs are absent.
    """
    page_size = 4096
    web_surfer = WebSurferAgent("web_surfer", llm_config=False, browser_config={"viewport_size": page_size})

    # Sneak a peek at the function map, allowing us to call the functions for testing here
    function_map = web_surfer._user_proxy._function_map

    # Test some basic navigations
    response = function_map["visit_page"](BLOG_POST_URL)
    assert f"Address: {BLOG_POST_URL}".strip() in response
    assert f"Title: {BLOG_POST_TITLE}".strip() in response

    # Test scrolling
    m = re.search(r"\bViewport position: Showing page 1 of (\d+).", response)
    assert m is not None, "Expected a viewport position header in the page response"
    total_pages = int(m.group(1))

    response = function_map["page_down"]()
    assert (
        f"Viewport position: Showing page 2 of {total_pages}." in response
    )  # Assumes the content is longer than one screen

    response = function_map["page_up"]()
    assert f"Viewport position: Showing page 1 of {total_pages}." in response

    # Try to scroll too far back up
    response = function_map["page_up"]()
    assert f"Viewport position: Showing page 1 of {total_pages}." in response

    # Try to scroll too far down
    for _ in range(total_pages + 1):
        response = function_map["page_down"]()
    assert f"Viewport position: Showing page {total_pages} of {total_pages}." in response

    # Test web search -- we don't have a key in this case, so we expect it to raise an error (but it means the code path is correct)
    with pytest.raises(ValueError, match="Missing Bing API key."):
        response = function_map["informational_web_search"](BING_QUERY)

    with pytest.raises(ValueError, match="Missing Bing API key."):
        response = function_map["navigational_web_search"](BING_QUERY)

    # Test Q&A and summarization -- we don't have a key so we expect it to fail (but it means the code path is correct)
    with pytest.raises(AttributeError, match="'NoneType' object has no attribute 'create'"):
        response = function_map["answer_from_page"]("When was it founded?")

    with pytest.raises(AttributeError, match="'NoneType' object has no attribute 'create'"):
        response = function_map["summarize_page"]()
@pytest.mark.skipif(
    skip_oai,
    reason="do not run if oai is not installed",
)
def test_web_surfer_oai():
    """End-to-end smoke test: a WebSurferAgent backed by live OpenAI models, driven
    by a fully-scripted UserProxyAgent that never asks for human input."""
    llm_config = {"config_list": config_list, "timeout": 180, "cache_seed": None}

    # The summarizer uses a narrowed config: only the long-context 3.5-turbo variants.
    summarizer_llm_config = {
        "config_list": filter_config(
            config_list, {"model": ["gpt-3.5-turbo-1106", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo-16k"]}
        ),
        "timeout": 180,
        "cache_seed": None,
    }

    # Both configs must have resolved at least one usable model entry.
    assert len(llm_config["config_list"]) > 0
    assert len(summarizer_llm_config["config_list"]) > 0

    web_surfer = WebSurferAgent(
        "web_surfer",
        llm_config=llm_config,
        summarizer_llm_config=summarizer_llm_config,
        browser_config={"viewport_size": 4096},
    )

    user_proxy = UserProxyAgent(
        "user_proxy",
        human_input_mode="NEVER",
        code_execution_config=False,
        default_auto_reply="",
        is_termination_msg=lambda x: True,
    )

    # Issue a scripted sequence of requests that should exercise function calling.
    for request in (
        "Please visit the page 'https://en.wikipedia.org/wiki/Microsoft'",
        "Please scroll down.",
        "Please scroll up.",
        "When was it founded?",
        "What's this page about?",
    ):
        user_proxy.initiate_chat(web_surfer, message=request)
@pytest.mark.skipif(
    skip_bing,
    reason="do not run if bing api key is not available",
)
def test_web_surfer_bing():
    """Run live Bing searches through a key-equipped WebSurferAgent and check the rendered results."""
    surfer = WebSurferAgent(
        "web_surfer",
        llm_config=False,
        browser_config={"viewport_size": 4096, "bing_api_key": BING_API_KEY},
    )

    # Sneak a peek at the function map, allowing us to call the functions for testing here
    functions = surfer._user_proxy._function_map

    # Informational search: should land on a one-page Bing results view.
    result = functions["informational_web_search"](BING_QUERY)
    for expected in (
        f"Address: bing: {BING_QUERY}",
        f"Title: {BING_QUERY} - Search",
        "Viewport position: Showing page 1 of 1.",
        f"A Bing search for '{BING_QUERY}' found ",
    ):
        assert expected in result

    # Navigational search: should follow the top hit straight to Wikipedia.
    result = functions["navigational_web_search"](BING_QUERY + " Wikipedia")
    assert "Address: https://en.wikipedia.org/wiki/" in result
if __name__ == "__main__":
    # Run this file's tests directly from the command line.
    # (The OpenAI- and Bing-backed tests require credentials; uncomment to run locally.)
    test_web_surfer()
    # test_web_surfer_oai()
    # test_web_surfer_bing()

173
test/test_browser_utils.py Normal file
View File

@ -0,0 +1,173 @@
import pytest
import os
import sys
import requests
import hashlib
import re

# NOTE(review): this bare `agentchat.*` import assumes the test/ directory is on
# sys.path (e.g. via the pytest rootdir) — no explicit sys.path.append precedes it;
# confirm it resolves under all invocation modes.
from agentchat.test_assistant_agent import KEY_LOC  # noqa: E402

# Remote fixtures exercised by the browser tests. These depend on live, externally
# hosted content (blog post, Wikipedia, GitHub, arXiv) remaining stable upstream.
BLOG_POST_URL = "https://microsoft.github.io/autogen/blog/2023/04/21/LLM-tuning-math"
BLOG_POST_TITLE = "Does Model and Inference Parameter Matter in LLM Applications? - A Case Study for MATH | AutoGen"
BLOG_POST_STRING = "Large language models (LLMs) are powerful tools that can generate natural language texts for various applications, such as chatbots, summarization, translation, and more. GPT-4 is currently the state of the art LLM in the world. Is model selection irrelevant? What about inference parameters?"

WIKIPEDIA_URL = "https://en.wikipedia.org/wiki/Microsoft"
WIKIPEDIA_TITLE = "Microsoft - Wikipedia"
WIKIPEDIA_STRING = "Redmond"

PLAIN_TEXT_URL = "https://raw.githubusercontent.com/microsoft/autogen/main/README.md"
IMAGE_URL = "https://github.com/afourney.png"

PDF_URL = "https://arxiv.org/pdf/2308.08155.pdf"
PDF_STRING = "Figure 1: AutoGen enables diverse LLM-based applications using multi-agent conversations."

BING_QUERY = "Microsoft"
BING_TITLE = f"{BING_QUERY} - Search"
BING_STRING = f"A Bing search for '{BING_QUERY}' found"

# Skip all tests if the optional browser dependencies are not installed.
try:
    from autogen.browser_utils import SimpleTextBrowser
except ImportError:
    skip_all = True
else:
    skip_all = False

# Skip Bing tests unless an API key is available in the environment.
try:
    BING_API_KEY = os.environ["BING_API_KEY"]
except KeyError:
    skip_bing = True
else:
    skip_bing = False
def _rm_folder(path):
    """Delete every regular file directly inside *path*, then remove the folder itself.

    Assumes a flat layout: subdirectories are not recursed into, and a leftover
    subdirectory would cause os.rmdir to fail because the folder would not be empty.
    """
    file_paths = [os.path.join(path, name) for name in os.listdir(path)]
    for file_path in file_paths:
        if os.path.isfile(file_path):
            os.unlink(file_path)
    os.rmdir(path)
@pytest.mark.skipif(
    skip_all,
    reason="do not run if dependency is not installed",
)
def test_simple_text_browser():
    """Exercise SimpleTextBrowser end-to-end against live web content.

    Covers page visiting, viewport pagination and scrolling, Wikipedia and
    plain-text handling, file downloads (verified by MD5), and PDF extraction.
    Requires network access; uses a scratch downloads folder under KEY_LOC.
    """
    # Create a downloads folder (removing any leftover ones from prior tests)
    downloads_folder = os.path.join(KEY_LOC, "downloads")
    if os.path.isdir(downloads_folder):
        _rm_folder(downloads_folder)
    os.mkdir(downloads_folder)
    # Instantiate the browser
    user_agent = "python-requests/" + requests.__version__
    viewport_size = 1024
    browser = SimpleTextBrowser(
        downloads_folder=downloads_folder,
        viewport_size=viewport_size,
        request_kwargs={
            "headers": {"User-Agent": user_agent},
        },
    )
    # Test that we can visit a page and find what we expect there
    top_viewport = browser.visit_page(BLOG_POST_URL)
    assert browser.viewport == top_viewport
    assert browser.page_title.strip() == BLOG_POST_TITLE.strip()
    assert BLOG_POST_STRING in browser.page_content
    # Check if page splitting works
    approx_pages = int(len(browser.page_content) / viewport_size + 0.5)  # May be fewer, since it aligns to word breaks
    assert len(browser.viewport_pages) <= approx_pages
    assert abs(len(browser.viewport_pages) - approx_pages) <= 1  # allow only a small deviation
    assert browser.viewport_pages[0][0] == 0
    assert browser.viewport_pages[-1][1] == len(browser.page_content)
    # Make sure we can reconstruct the full contents from the split pages
    buffer = ""
    for bounds in browser.viewport_pages:
        buffer += browser.page_content[bounds[0] : bounds[1]]
    assert buffer == browser.page_content
    # Test scrolling (scroll all the way to the bottom)
    for i in range(1, len(browser.viewport_pages)):
        browser.page_down()
        assert browser.viewport_current_page == i
    # Test scrolling beyond the limits (position should pin to the last page)
    for i in range(0, 5):
        browser.page_down()
        assert browser.viewport_current_page == len(browser.viewport_pages) - 1
    # Test scrolling back up toward the top, one page at a time
    for i in range(len(browser.viewport_pages) - 2, 0, -1):
        browser.page_up()
        assert browser.viewport_current_page == i
    # Test scrolling beyond the limits (position should pin to the first page)
    for i in range(0, 5):
        browser.page_up()
        assert browser.viewport_current_page == 0
    # Test Wikipedia handling
    assert WIKIPEDIA_STRING in browser.visit_page(WIKIPEDIA_URL)
    assert WIKIPEDIA_TITLE.strip() == browser.page_title.strip()
    # Visit a plain-text file
    response = requests.get(PLAIN_TEXT_URL)
    response.raise_for_status()
    expected_results = response.text
    browser.visit_page(PLAIN_TEXT_URL)
    assert browser.page_content.strip() == expected_results.strip()
    # Directly download an image, and compute its md5
    response = requests.get(IMAGE_URL, stream=True)
    response.raise_for_status()
    expected_md5 = hashlib.md5(response.raw.read()).hexdigest()
    # Visit an image causing it to be downloaded by the SimpleTextBrowser, then compute its md5
    viewport = browser.visit_page(IMAGE_URL)
    m = re.search(r"Downloaded '(.*?)' to '(.*?)'", viewport)
    fetched_url = m.group(1)
    download_loc = m.group(2)
    assert fetched_url == IMAGE_URL
    with open(download_loc, "rb") as fh:
        downloaded_md5 = hashlib.md5(fh.read()).hexdigest()
    # MD5s should match
    assert expected_md5 == downloaded_md5
    # Fetch a PDF
    viewport = browser.visit_page(PDF_URL)
    assert PDF_STRING in viewport
    # Clean up
    _rm_folder(downloads_folder)
@pytest.mark.skipif(
    skip_bing,
    reason="do not run bing tests if key is missing",
)
def test_bing_search():
    """Run a live Bing query through SimpleTextBrowser and sanity-check the rendered results page."""
    # Build a browser configured with the real Bing API key from the environment.
    browser = SimpleTextBrowser(
        bing_api_key=BING_API_KEY,
        viewport_size=1024,
        request_kwargs={
            "headers": {"User-Agent": "python-requests/" + requests.__version__},
        },
    )

    # A "bing:" address triggers a search; the rendered page announces the results.
    assert BING_STRING in browser.visit_page("bing: " + BING_QUERY)
    assert browser.page_title == BING_TITLE
    # The results page is short enough to fit in a single viewport spanning all content.
    assert len(browser.viewport_pages) == 1
    assert browser.viewport_pages[0] == (0, len(browser.page_content))
if __name__ == "__main__":
    # Run this file's tests directly from the command line.
    # (test_bing_search requires BING_API_KEY; its skipif guard does not apply here.)
    test_simple_text_browser()
    test_bing_search()

View File

@ -39,6 +39,7 @@ Links to notebook examples:
- Function Inception: Enable AutoGen agents to update/remove functions during conversations. - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_inception_function.ipynb)
- Agent Chat with Whisper - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_video_transcript_translate_with_whisper.ipynb)
- Constrained Responses via Guidance - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_guidance.ipynb)
- Browse the Web with Agents - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_surfer.ipynb)
1. **Human Involvement**
- Simple example in ChatGPT style [View example](https://github.com/microsoft/autogen/blob/main/samples/simple_chat.py)
- Auto Code Generation, Execution, Debugging and **Human Feedback** - [View Notebook](https://github.com/microsoft/autogen/blob/main/notebook/agentchat_human_feedback.ipynb)