add examples: multimodal tool use with qwen2-vl

gewenbin.gwb 2024-08-09 15:50:41 +08:00 committed by 兼欣
parent 3c4f8d00af
commit 8f24dbf6a7
9 changed files with 225 additions and 175 deletions

examples/assistant_angry_girlfriend.py (deleted)

@@ -1,57 +0,0 @@
"""A comfort game implemented by assistant"""
from qwen_agent.agents import Assistant
from qwen_agent.gui import WebUI
def init_agent_service():
llm_cfg = {'model': 'qwen-max'}
system = ('我们来玩角色扮演游戏。你扮演用户的女友。由用户开始发言根据他的发言你初始化一个心情值0到100并作出回应。'
'用户的任务是哄你开心,你根据每次用户说的话调整心情,每次回复开头加上(当前心情:分数)。')
bot = Assistant(llm=llm_cfg, name='虚拟女友', description='哄哄机器人', system_message=system)
return bot
def test(query: str = '你今天真好看'):
# Define the agent
bot = init_agent_service()
# Chat
messages = [{'role': 'user', 'content': query}]
for response in bot.run(messages=messages):
print('bot response:', response)
def app_tui():
# Define the agent
bot = init_agent_service()
# Chat
messages = []
while True:
query = input('user question: ')
messages.append({'role': 'user', 'content': query})
response = []
for response in bot.run(messages=messages):
print('bot response:', response)
messages.extend(response)
def app_gui():
agent = init_agent_service()
chatbot_config = {
'prompt.suggestions': [
'你今天真好看!',
'晚上去吃好吃的嘛~',
'宝贝,你又瘦啦!',
]
}
WebUI(agent, chatbot_config=chatbot_config).run(messages=[{'role': 'assistant', 'content': [{'text': '还不快来哄哄我!'}]}])
if __name__ == '__main__':
# test()
# app_tui()
app_gui()

examples/assistant_growing_girl.py (deleted)

@@ -1,93 +0,0 @@
"""A girl's growth story novelist implemented by assistant"""
import os
from typing import Optional
from qwen_agent.agents import Assistant
from qwen_agent.gui import WebUI
ROOT_RESOURCE = os.path.join(os.path.dirname(__file__), 'resource')
def init_agent_service():
llm_cfg = {'model': 'qwen-max'}
tools = ['image_gen']
bot = Assistant(
llm=llm_cfg,
name='漫画家',
description='根据女孩的成长阶段画出图片,串成一个故事',
function_list=tools,
system_message='你扮演一个漫画家,根据我给你的女孩的不同阶段,使用工具画出每个阶段女孩的的图片,'
'并串成一个故事讲述出来。要求图片背景丰富',
)
return bot
def test(
query='请用image_gen开始创作',
file: Optional[str] = os.path.join(ROOT_RESOURCE, 'growing_girl.pdf'),
):
# Define the agent
bot = init_agent_service()
# Chat
messages = []
if not file:
messages.append({'role': 'user', 'content': query})
else:
messages.append({'role': 'user', 'content': [{'text': query}, {'file': file}]})
for response in bot.run(messages):
print('bot response:', response)
def app_tui():
# Define the agent
bot = init_agent_service()
# Chat
messages = []
while True:
# Query example: 请用image_gen开始创作
query = input('user question: ')
# File example: resource/growing_girl.pdf
file = input('file url (press enter if no file): ').strip()
if not query:
print('user question cannot be empty')
continue
if not file:
messages.append({'role': 'user', 'content': query})
else:
messages.append({'role': 'user', 'content': [{'text': query}, {'file': file}]})
response = []
for response in bot.run(messages):
print('bot response:', response)
messages.extend(response)
def app_gui():
# Define the agent
bot = init_agent_service()
file = os.path.join(ROOT_RESOURCE, 'growing_girl.pdf')
chatbot_config = {
'prompt.suggestions': [
{
'text': '画一个女孩的成长故事',
'files': [file]
},
{
'text': '画一个女孩的成长故事,从小学到大学',
'files': [file]
},
'画出女人的一生,要反映出人类的深刻本质',
]
}
WebUI(bot, chatbot_config=chatbot_config).run()
if __name__ == '__main__':
# test()
# app_tui()
app_gui()

examples/qwen2vl_assistant_tooluse.py (new)

@@ -0,0 +1,137 @@
import os
import re
import uuid
from io import BytesIO
from pprint import pprint
from typing import List, Union

import requests
from PIL import Image

from qwen_agent.agents import FnCallAgent
from qwen_agent.llm.schema import ContentItem
from qwen_agent.tools.base import BaseToolWithFileAccess, register_tool

ROOT_RESOURCE = os.path.join(os.path.dirname(__file__), 'resource')


@register_tool('crop_and_resize')
class CropResize(BaseToolWithFileAccess):
    description = ('A magnifier tool: crops a local region of an image and enlarges it to show more detail. '
                   'Call it when you cannot make out the details directly.')
    parameters = [
        {
            'name': 'image',
            'type': 'string',
            'description': 'Local path or URL of the input image',
            'required': True
        },
        {
            'name': 'rectangle',
            'type': 'string',
            'description': ('The image region to crop, given by its top-left and bottom-right corners '
                            '(origin at the top-left of the image, x-axis pointing right, y-axis pointing down). '
                            'Format: (x1,y1),(x2,y2)'),
            'required': True
        },
    ]

    def _extract_coordinates(self, text):
        # Try the "(x1,y1),(x2,y2)" format first.
        pattern = r'\((\d+),\s*(\d+)\)'
        matches = re.findall(pattern, text)
        coordinates = [(int(x), int(y)) for x, y in matches]
        if len(coordinates) >= 2:
            x1, y1 = coordinates[0]
            x2, y2 = coordinates[1]
            return x1, y1, x2, y2

        # Fall back to the "(x1,y1,x2,y2)" format.
        pattern = r'\((\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)'
        matches = re.findall(pattern, text)
        coordinates = [(int(x1), int(y1), int(x2), int(y2)) for x1, y1, x2, y2 in matches]
        return coordinates[0]

    def _expand_box(self, x1, y1, x2, y2, factor=1):
        # Scale the box around its center by the given factor.
        xc = (x1 + x2) / 2
        yc = (y1 + y2) / 2
        w = x2 - x1
        h = y2 - y1
        w_new = w * factor
        h_new = h * factor
        return xc - w_new / 2, yc - h_new / 2, xc + w_new / 2, yc + h_new / 2

    def call(self, params: Union[str, dict], files: List[str] = None, **kwargs) -> List[ContentItem]:
        super().call(params=params, files=files)
        params = self._verify_json_format_args(params)
        image_arg = params['image']  # local path or url
        rectangle = params['rectangle']

        # Open the image
        if image_arg.startswith('http'):
            response = requests.get(image_arg)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))
        elif os.path.exists(image_arg):
            image = Image.open(image_arg)
        else:
            image = Image.open(os.path.join(self.work_dir, image_arg))

        # The model outputs coordinates on a 0-1000 normalized grid; map them to pixels.
        coordinates = self._extract_coordinates(rectangle)
        x1, y1, x2, y2 = self._expand_box(*coordinates, factor=1.35)
        w, h = image.size
        x1, y1 = round(x1 / 1000 * w), round(y1 / 1000 * h)
        x2, y2 = round(x2 / 1000 * w), round(y2 / 1000 * h)

        # Clamp to the image bounds
        x1, y1, x2, y2 = max(x1, 0), max(y1, 0), min(x2, w), min(y2, h)
        cropped_image = image.crop((x1, y1, x2, y2))

        # Save the crop
        output_path = os.path.abspath(os.path.join(self.work_dir, f'{uuid.uuid4()}.png'))
        cropped_image.save(output_path)

        return [
            ContentItem(image=output_path),
            ContentItem(text=f' The URL of this enlarged local region is {output_path} '),
        ]


def test():
    llm_cfg_vl = {
        # Using Qwen2-VL deployed at any openai-compatible service such as vLLM:
        # 'model_type': 'qwenvl_oai',
        # 'model': 'Qwen/Qwen2-VL-72B-Instruct',
        # 'model_server': 'http://localhost:8000/v1',  # api_base
        # 'api_key': 'EMPTY',

        # Using Qwen2-VL provided by Alibaba Cloud DashScope:
        # 'model_type': 'qwenvl_dashscope',
        # 'model': 'qwen2-vl-72b-instruct',
        # 'api_key': os.getenv('DASHSCOPE_API_KEY'),

        # TODO: Use qwen2-vl instead once qwen2-vl is released.
        'model_type': 'qwenvl_dashscope',
        'model': 'qwen-vl-max',
        'api_key': os.getenv('DASHSCOPE_API_KEY'),
        'generate_cfg': dict(max_retries=10),
    }

    agent = FnCallAgent(function_list=['crop_and_resize'], llm=llm_cfg_vl)
    messages = [{
        'role': 'user',
        'content': [
            {
                'image': os.path.abspath(os.path.join(ROOT_RESOURCE, 'screenshot_with_plot.jpeg'))
            },
            {
                'text': 'Call the tool to zoom in on the table on the right.'
            },
        ],
    }]
    response = agent.run_nonstream(messages=messages)
    pprint(response, indent=4)


if __name__ == '__main__':
    test()
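
To sanity-check the new tool without a model in the loop, the crop logic can be exercised directly. A minimal sketch, assuming it is run from the repository root so the examples package is importable; the rectangle is a made-up region on the 0-1000 normalized grid the tool expects:

import json

from examples.qwen2vl_assistant_tooluse import CropResize

tool = CropResize()
result = tool.call(
    params=json.dumps({
        'image': 'examples/resource/screenshot_with_plot.jpeg',
        'rectangle': '(500,0),(1000,1000)',  # hypothetical: roughly the right half of the image
    }),
    files=['examples/resource/screenshot_with_plot.jpeg'],
)
for item in result:
    print(item)  # the saved crop image, then its path as text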

Qwen2-VL function-calling example (new file)

@@ -0,0 +1,81 @@
import json
import urllib.parse

from qwen_agent.llm import get_chat_model
from qwen_agent.llm.schema import ContentItem


def image_gen(prompt: str) -> str:
    # pollinations.ai renders an image for any prompt embedded in the URL path.
    prompt = urllib.parse.quote(prompt)
    image_url = f'https://image.pollinations.ai/prompt/{prompt}'
    return image_url
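
Because image_gen simply URL-encodes the prompt into the pollinations.ai endpoint, it can be tried standalone before wiring it into the function-calling loop. A small sketch with a made-up prompt:

url = image_gen('a girl and a golden retriever playing on a beach, watercolor')
print(url)
# e.g. https://image.pollinations.ai/prompt/a%20girl%20and%20a%20golden%20retriever%20...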
def test():
    # Config for the model
    llm_cfg_oai = {
        # Using Qwen2-VL deployed at any openai-compatible service such as vLLM:
        'model_type': 'qwenvl_oai',
        'model': 'Qwen/Qwen2-VL-72B-Instruct',
        'model_server': 'http://localhost:8000/v1',  # api_base
        'api_key': 'EMPTY',
    }
    llm = get_chat_model(llm_cfg_oai)

    # Initial conversation
    messages = [{
        'role': 'user',
        'content': [{
            'image': 'https://dashscope.oss-cn-beijing.aliyuncs.com/images/dog_and_girl.jpeg'
        }, {
            'text': 'What is in the picture? Please draw a picture with the same content and a similar style.'
        }]
    }]

    functions = [
        {
            'name': 'image_gen',
            'description': 'AI painting (image generation) service: input a text description and it returns the URL '
                           'of an image drawn from that text.',
            'parameters': [{
                'name': 'prompt',
                'type': 'string',
                'description': 'A detailed description of the desired image content, e.g. people, environment, '
                               'actions and other details. Use English.',
                'required': True
            }]
        },
    ]

    print('# Assistant Response 1:')
    responses = []
    for responses in llm.chat(messages=messages, functions=functions, stream=True):
        print(responses)
    messages.extend(responses)

    for rsp in responses:
        if rsp.get('function_call', None):
            func_name = rsp['function_call']['name']
            if func_name == 'image_gen':
                func_args = json.loads(rsp['function_call']['arguments'])
                image_url = image_gen(func_args['prompt'])
                print('# Function Response:')
                func_rsp = {
                    'role': 'function',
                    'name': func_name,
                    # A multimodal function result: the generated image plus its URL as text.
                    'content': [ContentItem(image=image_url),
                                ContentItem(text=f' The URL of this image is {image_url} ')],
                }
                messages.append(func_rsp)
                print(func_rsp)
            else:
                raise NotImplementedError

    print('# Assistant Response 2:')
    responses = []
    for responses in llm.chat(messages=messages, functions=functions, stream=True):
        print(responses)
    messages.extend(responses)


if __name__ == '__main__':
    test()

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary image file (349 KiB) not shown.

@@ -7,9 +7,6 @@ sys.path.insert(0, os.path.abspath(os.path.join(__file__, '../../..')))  # noqa
 ROOT_RESOURCE = os.path.abspath(os.path.join(__file__, '../../../examples/resource'))  # noqa
 
 from examples.assistant_add_custom_tool import test as assistant_add_custom_tool  # noqa
-from examples.assistant_angry_girlfriend import test as assistant_angry_girlfriend  # noqa
-# from examples.assistant_doctor import test as assistant_doctor
-from examples.assistant_growing_girl import test as assistant_growing_girl  # noqa
 from examples.assistant_weather_bot import test as assistant_weather_bot  # noqa
 from examples.function_calling import test as function_calling  # noqa
 from examples.function_calling_in_parallel import test as parallel_function_calling  # noqa
@@ -19,6 +16,7 @@ from examples.group_chat_demo import test as group_chat_demo  # noqa
 from examples.llm_riddles import test as llm_riddles  # noqa
 from examples.llm_vl_mix_text import test as llm_vl_mix_text  # noqa
 from examples.multi_agent_router import test as multi_agent_router  # noqa
+from examples.qwen2vl_assistant_tooluse import test as qwen2vl_assistant_tooluse  # noqa
 from examples.react_data_analysis import test as react_data_analysis  # noqa
 from examples.visual_storytelling import test as visual_storytelling  # noqa
@@ -34,26 +32,6 @@ def test_assistant_weather_bot(query, file):
     assistant_weather_bot(query=query, file=file)
 
-@pytest.mark.parametrize('query', ['You look great today'])
-def test_assistant_angry_girlfriend(query):
-    assistant_angry_girlfriend(query=query)
-
-# @pytest.mark.parametrize('query', ['Doctor, can you check whether I am healthy?'])
-# @pytest.mark.parametrize('file', [
-#     None,
-#     'https://pic4.zhimg.com/80/v2-2c8eedf3e12386fedcd5589cf5575717_720w.webp'
-# ])
-# def test_assistant_doctor(query, file):
-#     assistant_doctor(query=query, file=file)
-
-@pytest.mark.parametrize('query', ['Please start creating with image_gen'])
-@pytest.mark.parametrize('file', [None, os.path.join(ROOT_RESOURCE, 'growing_girl.pdf')])
-def test_assistant_growing_girl(query, file):
-    assistant_growing_girl(query=query, file=file)
 
 def test_llm_vl_mix_text():
     llm_vl_mix_text()
@@ -104,3 +82,7 @@ def test_group_chat_chess(query):
 def test_group_chat_demo():
     group_chat_demo()
+
+
+def test_qwen2vl_assistant_tooluse():
+    qwen2vl_assistant_tooluse()

@@ -16,8 +16,8 @@ def test_memory():
     mem = Memory(llm=llm_cfg)
     messages = [
         Message('user', [
-            ContentItem(text="a girl's growth journey"),
-            ContentItem(file=str(Path(__file__).resolve().parent.parent.parent / 'examples/resource/growing_girl.pdf'))
+            ContentItem(text='how to flip images'),
+            ContentItem(file=str(Path(__file__).resolve().parent.parent.parent / 'examples/resource/doc.pdf'))
         ])
     ]
     *_, last = mem.run(messages, max_ref_token=4000, parser_page_size=500)