mirror of https://github.com/microsoft/autogen.git
Message "content" now supports both `str` and `List` in Agents (#713)
* Change "content" type in ConversableAgent
* content and system_message support str and List; update all other agents
* content_str now also takes None as input
* Group Chat now works with LMM too
* Style: newline for import in ConversableAgent
* Add test for groupchat + LMM
* Resolve comments:
  1. Undo AssistantAgent changes
  2. Modify the asserts and raises in the `content_str` function and update the test accordingly
* Undo AssistantAgent
* Update comments and add assertion for LMM
* Fix typo in docstring for content_str
* Remove "None" from conversable_agent.py
* Lint message-to-dict in multimodal_conversable_agent.py
* Address lint issues
* Linting
* Move LMM test into contrib test
* Resolve 2 comments
* Move img_utils into contrib folder
* Resolve img_utils path issues
This commit is contained in:
parent
77e1d28c1b
commit
c19f234149
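For orientation (an editor's illustration, not part of the diff): after this change, both `system_message` and message `content` accept either a plain string or a GPT-4V-style list of typed entries, and `content_str` flattens the latter for text-only consumers. A minimal sketch, assuming autogen is installed:

    from autogen import ConversableAgent
    from autogen.code_utils import content_str

    # system_message may now be a typed list instead of a plain string.
    agent = ConversableAgent(
        name="assistant",
        llm_config=False,
        system_message=[{"type": "text", "text": "You are a helpful AI Assistant."}],
    )

    # content_str flattens list-form content; images become an <image> token.
    assert content_str(agent.system_message) == "You are a helpful AI Assistant."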
@@ -1,60 +0,0 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions

name: ContribTests

on:
  pull_request:
    branches: ['main', 'dev/v0.2']
    paths:
      - 'autogen/img_utils.py'
      - 'autogen/agentchat/contrib/multimodal_conversable_agent.py'
      - 'autogen/agentchat/contrib/llava_agent.py'
      - 'test/test_img_utils.py'
      - 'test/agentchat/contrib/test_lmm.py'
      - 'test/agentchat/contrib/test_llava.py'
      - '.github/workflows/lmm-test.yml'
      - 'setup.py'

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

jobs:
  LMMTest:

    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest, windows-2019]
        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install packages and dependencies for all tests
        run: |
          python -m pip install --upgrade pip wheel
          pip install pytest
      - name: Install packages and dependencies for LMM
        run: |
          pip install -e .[lmm]
          pip uninstall -y openai
      - name: Test LMM and LLaVA
        run: |
          pytest test/test_img_utils.py test/agentchat/contrib/test_lmm.py test/agentchat/contrib/test_llava.py
      - name: Coverage
        if: matrix.python-version == '3.10'
        run: |
          pip install coverage>=5.3
          coverage run -a -m pytest test/test_img_utils.py test/agentchat/contrib/test_lmm.py test/agentchat/contrib/test_llava.py
          coverage xml
      - name: Upload coverage to Codecov
        if: matrix.python-version == '3.10'
        uses: codecov/codecov-action@v3
        with:
          file: ./coverage.xml
          flags: unittests
@@ -136,3 +136,40 @@ jobs:
      - name: Test TeachableAgent
        run: |
          pytest test/agentchat/contrib/test_teachable_agent.py

  LMMTest:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, macos-latest, windows-2019]
        python-version: ["3.8", "3.9", "3.10", "3.11"]
    steps:
      - uses: actions/checkout@v3
      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python-version }}
      - name: Install packages and dependencies for all tests
        run: |
          python -m pip install --upgrade pip wheel
          pip install pytest
      - name: Install packages and dependencies for LMM
        run: |
          pip install -e .[lmm]
          pip uninstall -y openai
      - name: Test LMM and LLaVA
        run: |
          pytest test/agentchat/contrib/test_img_utils.py test/agentchat/contrib/test_lmm.py test/agentchat/contrib/test_llava.py
      - name: Coverage
        if: matrix.python-version == '3.10'
        run: |
          pip install coverage>=5.3
          coverage run -a -m pytest test/agentchat/contrib/test_img_utils.py test/agentchat/contrib/test_lmm.py test/agentchat/contrib/test_llava.py
          coverage xml
      - name: Upload coverage to Codecov
        if: matrix.python-version == '3.10'
        uses: codecov/codecov-action@v3
        with:
          file: ./coverage.xml
          flags: unittests
@@ -167,6 +167,7 @@ wolfram.txt

# DB on disk for TeachableAgent
tmp/
test/my_tmp/*

# Storage for the AgentEval output
test/test_files/agenteval-in-out/out/
@@ -1,6 +1,7 @@
from .conversable_agent import ConversableAgent
from typing import Callable, Dict, Literal, Optional, Union

from .conversable_agent import ConversableAgent


class AssistantAgent(ConversableAgent):
    """(In preview) Assistant agent, designed to solve a task with LLM.
@@ -10,9 +10,9 @@ import requests
from regex import R

from autogen.agentchat.agent import Agent
from autogen.agentchat.contrib.img_utils import get_image_data, llava_formater
from autogen.agentchat.contrib.multimodal_conversable_agent import MultimodalConversableAgent
from autogen.code_utils import content_str
from autogen.img_utils import get_image_data, llava_formater

try:
    from termcolor import colored
@@ -1,8 +1,9 @@
import copy
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

from autogen import OpenAIWrapper
from autogen.agentchat import Agent, ConversableAgent
from autogen.img_utils import gpt4v_formatter
from autogen.agentchat.contrib.img_utils import gpt4v_formatter

try:
    from termcolor import colored
@@ -41,19 +42,14 @@ class MultimodalConversableAgent(ConversableAgent):
            *args,
            **kwargs,
        )

        # call the setter to handle special format.
        self.update_system_message(system_message)
        self._is_termination_msg = (
            is_termination_msg
            if is_termination_msg is not None
            else (lambda x: any([item["text"] == "TERMINATE" for item in x.get("content") if item["type"] == "text"]))
            else (lambda x: content_str(x.get("content")) == "TERMINATE")
        )

    @property
    def system_message(self) -> List:
        """Return the system message."""
        return self._oai_system_message[0]["content"]

    def update_system_message(self, system_message: Union[Dict, List, str]):
        """Update the system message.
@@ -64,44 +60,29 @@ class MultimodalConversableAgent(ConversableAgent):
        self._oai_system_message[0]["role"] = "system"

    @staticmethod
    def _message_to_dict(message: Union[Dict, List, str]):
        """Convert a message to a dictionary.
    def _message_to_dict(message: Union[Dict, List, str]) -> Dict:
        """Convert a message to a dictionary. This implementation
        handles the GPT-4V formatting for easier prompts.

        The message can be a string or a dictionary. The string will be put in the "content" field of the new dictionary.
        The message can be a string, a dictionary, or a list of dictionaries:
        - If it's a string, it will be cast into a list and placed in the 'content' field.
        - If it's a list, it will be directly placed in the 'content' field.
        - If it's a dictionary, it is already in message dict format. The 'content' field of this dictionary
          will be processed using the gpt4v_formatter.
        """
        if isinstance(message, str):
            return {"content": gpt4v_formatter(message)}
        if isinstance(message, list):
            return {"content": message}
        else:
        if isinstance(message, dict):
            assert "content" in message, "The message dict must have a `content` field"
            if isinstance(message["content"], str):
                message = copy.deepcopy(message)
                message["content"] = gpt4v_formatter(message["content"])
            try:
                content_str(message["content"])
            except (TypeError, ValueError) as e:
                print("The `content` field should be compatible with the content_str function!")
                raise e
        return message

    def _print_received_message(self, message: Union[Dict, str], sender: Agent):
        # print the message received
        print(colored(sender.name, "yellow"), "(to", f"{self.name}):\n", flush=True)
        if message.get("role") == "function":
            func_print = f"***** Response from calling function \"{message['name']}\" *****"
            print(colored(func_print, "green"), flush=True)
            print(content_str(message["content"]), flush=True)
            print(colored("*" * len(func_print), "green"), flush=True)
        else:
            content = message.get("content")
            if content is not None:
                if "context" in message:
                    content = OpenAIWrapper.instantiate(
                        content,
                        message["context"],
                        self.llm_config and self.llm_config.get("allow_format_str_template", False),
                    )
                print(content_str(content), flush=True)
            if "function_call" in message:
                func_print = f"***** Suggested function Call: {message['function_call'].get('name', '(No function name found)')} *****"
                print(colored(func_print, "green"), flush=True)
                print(
                    "Arguments: \n",
                    message["function_call"].get("arguments", "(No arguments found)"),
                    flush=True,
                    sep="",
                )
                print(colored("*" * len(func_print), "green"), flush=True)
        print("\n", "-" * 80, flush=True, sep="")
        raise ValueError(f"Unsupported message type: {type(message)}")
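A small illustration of the validation step above: a dict message must carry a `content` field that `content_str` can process (a sketch, assuming autogen is installed; the URL is a placeholder):

    from autogen.code_utils import content_str

    msg = {
        "content": [
            {"type": "text", "text": "Describe the image: "},
            {"type": "image_url", "image_url": {"url": "http://example.com/a.png"}},
        ]
    }
    # _message_to_dict runs this same check and re-raises on a bad format.
    content_str(msg["content"])  # a TypeError/ValueError would surface here
    print(content_str(msg["content"]))  # -> "Describe the image: <image>"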
@@ -1,18 +1,14 @@
import asyncio
from collections import defaultdict
import copy
import json
import logging
from collections import defaultdict
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Type, Union

from autogen import OpenAIWrapper
from autogen.code_utils import DEFAULT_MODEL, UNKNOWN, content_str, execute_code, extract_code, infer_lang

from .agent import Agent
from autogen.code_utils import (
    DEFAULT_MODEL,
    UNKNOWN,
    execute_code,
    extract_code,
    infer_lang,
)

try:
    from termcolor import colored
@@ -50,7 +46,7 @@ class ConversableAgent(Agent):
    def __init__(
        self,
        name: str,
        system_message: Optional[str] = "You are a helpful AI Assistant.",
        system_message: Optional[Union[str, List]] = "You are a helpful AI Assistant.",
        is_termination_msg: Optional[Callable[[Dict], bool]] = None,
        max_consecutive_auto_reply: Optional[int] = None,
        human_input_mode: Optional[str] = "TERMINATE",
@@ -62,7 +58,7 @@ class ConversableAgent(Agent):
        """
        Args:
            name (str): name of the agent.
            system_message (str): system message for the ChatCompletion inference.
            system_message (str or list): system message for the ChatCompletion inference.
            is_termination_msg (function): a function that takes a message in the form of a dictionary
                and returns a boolean value indicating if this received message is a termination message.
                The dict can contain the following keys: "content", "role", "name", "function_call".
@@ -105,8 +101,11 @@ class ConversableAgent(Agent):
        self._oai_messages = defaultdict(list)
        self._oai_system_message = [{"content": system_message, "role": "system"}]
        self._is_termination_msg = (
            is_termination_msg if is_termination_msg is not None else (lambda x: x.get("content") == "TERMINATE")
            is_termination_msg
            if is_termination_msg is not None
            else (lambda x: content_str(x.get("content")) == "TERMINATE")
        )

        if llm_config is False:
            self.llm_config = False
            self.client = None
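Why the default termination check above now routes through `content_str`: a multimodal message whose content is a typed list would never compare equal to the string "TERMINATE" directly. A small illustration (assuming autogen is installed):

    from autogen.code_utils import content_str

    is_termination = lambda x: content_str(x.get("content")) == "TERMINATE"

    # Works for plain-string content...
    assert is_termination({"content": "TERMINATE"})
    # ...and for multimodal list content, which the old
    # `x.get("content") == "TERMINATE"` comparison would have missed.
    assert is_termination({"content": [{"type": "text", "text": "TERMINATE"}]})
    # None content no longer breaks the check: content_str(None) == "".
    assert not is_termination({"content": None})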
@@ -190,15 +189,15 @@ class ConversableAgent(Agent):
        )

    @property
    def system_message(self):
    def system_message(self) -> Union[str, List]:
        """Return the system message."""
        return self._oai_system_message[0]["content"]

    def update_system_message(self, system_message: str):
    def update_system_message(self, system_message: Union[str, List]):
        """Update the system message.

        Args:
            system_message (str): system message for the ChatCompletion inference.
            system_message (str or List): system message for the ChatCompletion inference.
        """
        self._oai_system_message[0]["content"] = system_message

@@ -258,7 +257,7 @@ class ConversableAgent(Agent):
        return None if self._code_execution_config is False else self._code_execution_config.get("use_docker")

    @staticmethod
    def _message_to_dict(message: Union[Dict, str]):
    def _message_to_dict(message: Union[Dict, str]) -> Dict:
        """Convert a message to a dictionary.

        The message can be a string or a dictionary. The string will be put in the "content" field of the new dictionary.
@@ -314,7 +313,7 @@ class ConversableAgent(Agent):
        Args:
            message (dict or str): message to be sent.
                The message could contain the following fields:
                - content (str): Required, the content of the message. (Can be None)
                - content (str or List): Required, the content of the message. (Can be None)
                - function_call (str): the name of the function to be called.
                - name (str): the name of the function to be called.
                - role (str): the role of the message, any role that is not "function"
@@ -363,7 +362,7 @@ class ConversableAgent(Agent):
        Args:
            message (dict or str): message to be sent.
                The message could contain the following fields:
                - content (str): Required, the content of the message. (Can be None)
                - content (str or List): Required, the content of the message. (Can be None)
                - function_call (str): the name of the function to be called.
                - name (str): the name of the function to be called.
                - role (str): the role of the message, any role that is not "function"
@@ -419,7 +418,7 @@ class ConversableAgent(Agent):
                    message["context"],
                    self.llm_config and self.llm_config.get("allow_format_str_template", False),
                )
            print(content, flush=True)
            print(content_str(content), flush=True)
        if "function_call" in message:
            function_call = dict(message["function_call"])
            func_print = (
@@ -435,7 +434,7 @@ class ConversableAgent(Agent):
            print(colored("*" * len(func_print), "green"), flush=True)
        print("\n", "-" * 80, flush=True, sep="")

    def _process_received_message(self, message, sender, silent):
    def _process_received_message(self, message: Union[Dict, str], sender: Agent, silent: bool):
        message = self._message_to_dict(message)
        # When the agent receives a message, the role of the message is "user". (If 'role' exists and is 'function', it will remain unchanged.)
        valid = self._append_oai_message(message, "user", sender)
@@ -681,7 +680,7 @@ class ConversableAgent(Agent):
        messages: Optional[List[Dict]] = None,
        sender: Optional[Agent] = None,
        config: Optional[Any] = None,
    ):
    ) -> Tuple[bool, Union[Dict, None]]:
        """Generate a reply using function call."""
        if config is None:
            config = self
@@ -698,7 +697,7 @@ class ConversableAgent(Agent):
        messages: Optional[List[Dict]] = None,
        sender: Optional[Agent] = None,
        config: Optional[Any] = None,
    ):
    ) -> Tuple[bool, Union[Dict, None]]:
        """Generate a reply using async function call."""
        if config is None:
            config = self
@@ -720,8 +719,26 @@ class ConversableAgent(Agent):
        messages: Optional[List[Dict]] = None,
        sender: Optional[Agent] = None,
        config: Optional[Any] = None,
    ) -> Tuple[bool, Union[str, Dict, None]]:
        """Check if the conversation should be terminated, and if human reply is provided."""
    ) -> Tuple[bool, Union[str, None]]:
        """Check if the conversation should be terminated, and if human reply is provided.

        This method checks for conditions that require the conversation to be terminated, such as reaching
        a maximum number of consecutive auto-replies or encountering a termination message. Additionally,
        it prompts for and processes human input based on the configured human input mode, which can be
        'ALWAYS', 'NEVER', or 'TERMINATE'. The method also manages the consecutive auto-reply counter
        for the conversation and prints relevant messages based on the human input received.

        Args:
            - messages (Optional[List[Dict]]): A list of message dictionaries, representing the conversation history.
            - sender (Optional[Agent]): The agent object representing the sender of the message.
            - config (Optional[Any]): Configuration object, defaults to the current instance if not provided.

        Returns:
            - Tuple[bool, Union[str, Dict, None]]: A tuple containing a boolean indicating if the conversation
              should be terminated, and a human reply which can be a string, a dictionary, or None.
        """
        # Function implementation...

        if config is None:
            config = self
        if messages is None:
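As a usage note for the input modes named in the docstring above, a minimal sketch (parameter names are from this diff; the agent name and values are placeholders):

    from autogen import ConversableAgent

    # 'NEVER' disables human input; termination then relies on
    # max_consecutive_auto_reply or the is_termination_msg check.
    agent = ConversableAgent(
        name="assistant",
        system_message="You are a helpful AI Assistant.",
        human_input_mode="NEVER",      # or "ALWAYS" / "TERMINATE"
        max_consecutive_auto_reply=3,  # cap auto-replies before stopping
        llm_config=False,              # no LLM needed for this sketch
    )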
@@ -791,8 +808,24 @@ class ConversableAgent(Agent):
        messages: Optional[List[Dict]] = None,
        sender: Optional[Agent] = None,
        config: Optional[Any] = None,
    ) -> Tuple[bool, Union[str, Dict, None]]:
        """(async) Check if the conversation should be terminated, and if human reply is provided."""
    ) -> Tuple[bool, Union[str, None]]:
        """(async) Check if the conversation should be terminated, and if human reply is provided.

        This method checks for conditions that require the conversation to be terminated, such as reaching
        a maximum number of consecutive auto-replies or encountering a termination message. Additionally,
        it prompts for and processes human input based on the configured human input mode, which can be
        'ALWAYS', 'NEVER', or 'TERMINATE'. The method also manages the consecutive auto-reply counter
        for the conversation and prints relevant messages based on the human input received.

        Args:
            - messages (Optional[List[Dict]]): A list of message dictionaries, representing the conversation history.
            - sender (Optional[Agent]): The agent object representing the sender of the message.
            - config (Optional[Any]): Configuration object, defaults to the current instance if not provided.

        Returns:
            - Tuple[bool, Union[str, Dict, None]]: A tuple containing a boolean indicating if the conversation
              should be terminated, and a human reply which can be a string, a dictionary, or None.
        """
        if config is None:
            config = self
        if messages is None:
@@ -962,8 +995,20 @@ class ConversableAgent(Agent):
                return reply
        return self._default_auto_reply

    def _match_trigger(self, trigger, sender):
        """Check if the sender matches the trigger."""
    def _match_trigger(self, trigger: Union[None, str, type, Agent, Callable, List], sender: Agent) -> bool:
        """Check if the sender matches the trigger.

        Args:
            - trigger (Union[None, str, type, Agent, Callable, List]): The condition to match against the sender.
              Can be `None`, string, type, `Agent` instance, callable, or a list of these.
            - sender (Agent): The sender object or type to be matched against the trigger.

        Returns:
            - bool: Returns `True` if the sender matches the trigger, otherwise `False`.

        Raises:
            - ValueError: If the trigger type is unsupported.
        """
        if trigger is None:
            return sender is None
        elif isinstance(trigger, str):
@@ -971,9 +1016,12 @@ class ConversableAgent(Agent):
        elif isinstance(trigger, type):
            return isinstance(sender, trigger)
        elif isinstance(trigger, Agent):
            # return True if the sender is the same type (class) as the trigger
            return trigger == sender
        elif isinstance(trigger, Callable):
            return trigger(sender)
            rst = trigger(sender)
            assert rst in [True, False], f"trigger {trigger} must return a boolean value."
            return rst
        elif isinstance(trigger, list):
            return any(self._match_trigger(t, sender) for t in trigger)
        else:
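How the trigger forms above are typically exercised, via `register_reply` (a hedged sketch: `register_reply` and its `trigger` parameter exist on autogen's ConversableAgent, but the reply function and agent names here are invented for illustration):

    from autogen import Agent, ConversableAgent

    def print_and_pass(recipient, messages=None, sender=None, config=None):
        # Hypothetical reply function: log and defer to later reply handlers.
        print(f"{recipient.name} got a message from {sender.name}")
        return False, None

    bob = ConversableAgent(name="bob", llm_config=False)
    # Trigger forms from the docstring above: a name string, an Agent class,
    # a specific Agent instance, a callable, or a list of any of these.
    bob.register_reply(trigger="alice", reply_func=print_and_pass)
    bob.register_reply(trigger=Agent, reply_func=print_and_pass)
    # A callable trigger must return a strict boolean (the new assert enforces this).
    bob.register_reply(trigger=lambda sender: sender.name.startswith("a"), reply_func=print_and_pass)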
@@ -1095,7 +1143,7 @@ class ConversableAgent(Agent):
            result.append(char)
        return "".join(result)

    def execute_function(self, func_call):
    def execute_function(self, func_call) -> Tuple[bool, Dict[str, str]]:
        """Execute a function call and return the result.

        Override this function to modify the way to execute a function call.
@@ -1195,7 +1243,10 @@ class ConversableAgent(Agent):
        """Generate the initial message for the agent.

        Override this function to customize the initial message based on user's request.
        If not overridden, "message" needs to be provided in the context.
        If not overriden, "message" needs to be provided in the context.

        Args:
            **context: any context information, and "message" parameter needs to be provided.
        """
        return context["message"]

@@ -1,9 +1,11 @@
import logging
import sys
import random
import re
import sys
from dataclasses import dataclass
from typing import Dict, List, Optional, Union
import re

from ..code_utils import content_str
from .agent import Agent
from .conversable_agent import ConversableAgent

@@ -50,6 +52,14 @@ class GroupChat:
        """Reset the group chat."""
        self.messages.clear()

    def append(self, message: Dict):
        """Append a message to the group chat.
        We cast the content to str here so that it can be managed by text-based
        model.
        """
        message["content"] = content_str(message["content"])
        self.messages.append(message)

    def agent_by_name(self, name: str) -> Agent:
        """Returns the agent with a given name."""
        return self.agents[self.agent_names.index(name)]
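The new `GroupChat.append` above is what lets a text-only speaker-selection model manage multimodal turns: list-form content is flattened before it enters the shared history. A small illustration (assuming autogen is installed; the URL is a placeholder):

    import autogen

    groupchat = autogen.GroupChat(agents=[], messages=[], max_round=5)
    groupchat.append(
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "What do you see? "},
                {"type": "image_url", "image_url": {"url": "http://example.com/a.png"}},
            ],
        }
    )
    # The stored message is plain text, with images reduced to a placeholder token.
    assert groupchat.messages[-1]["content"] == "What do you see? <image>"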
@@ -64,7 +74,7 @@ class GroupChat:
            if self.agents[(offset + i) % len(self.agents)] in agents:
                return self.agents[(offset + i) % len(self.agents)]

    def select_speaker_msg(self, agents: List[Agent]):
    def select_speaker_msg(self, agents: List[Agent]) -> str:
        """Return the message for selecting the next speaker."""
        return f"""You are in a role play game. The following roles are available:
{self._participant_roles(agents)}.
@@ -72,7 +82,7 @@ class GroupChat:
Read the following conversation.
Then select the next role from {[agent.name for agent in agents]} to play. Only return the role."""

    def manual_select_speaker(self, agents: List[Agent]) -> Agent:
    def manual_select_speaker(self, agents: List[Agent]) -> Union[Agent, None]:
        """Manually select the next speaker."""

        print("Please select the next speaker from the following list:")
@@ -190,19 +200,26 @@ Then select the next role from {[agent.name for agent in agents]} to play. Only

        roles = []
        for agent in agents:
            if agent.system_message.strip() == "":
            if content_str(agent.system_message).strip() == "":
                logger.warning(
                    f"The agent '{agent.name}' has an empty system_message, and may not work well with GroupChat."
                )
            roles.append(f"{agent.name}: {agent.system_message}")
        return "\n".join(roles)

    def _mentioned_agents(self, message_content: str, agents: List[Agent]) -> Dict:
        """
        Finds and counts agent mentions in the string message_content, taking word boundaries into account.
    def _mentioned_agents(self, message_content: Union[str, List], agents: List[Agent]) -> Dict:
        """Counts the number of times each agent is mentioned in the provided message content.

        Returns: A dictionary mapping agent names to mention counts (to be included, at least one mention must occur)
        Args:
            message_content (Union[str, List]): The content of the message, either as a single string or a list of strings.
            agents (List[Agent]): A list of Agent objects, each having a 'name' attribute to be searched in the message content.

        Returns:
            Dict: a counter for mentioned agents.
        """
        # Cast message content to str
        message_content = content_str(message_content)

        mentions = dict()
        for agent in agents:
            regex = (
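A hedged sketch of the word-boundary mention counting described above (the actual regex in `_mentioned_agents` is truncated in this diff, so the pattern below is an assumption, not the repository's exact expression):

    import re

    def count_mentions(message_content, agent_names):
        # Flatten multimodal content first (mirrors content_str), then count
        # whole-word occurrences of each agent name.
        if isinstance(message_content, list):
            message_content = "".join(
                item["text"] if item["type"] == "text" else "<image>" for item in message_content
            )
        mentions = {}
        for name in agent_names:
            count = len(re.findall(rf"\b{re.escape(name)}\b", message_content))
            if count > 0:  # only agents mentioned at least once are included
                mentions[name] = count
        return mentions

    print(count_mentions("Ask Bob, then Bob again.", ["Bob", "Alice"]))  # {'Bob': 2}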
@@ -224,7 +241,7 @@ class GroupChatManager(ConversableAgent):
        # unlimited consecutive auto reply by default
        max_consecutive_auto_reply: Optional[int] = sys.maxsize,
        human_input_mode: Optional[str] = "NEVER",
        system_message: Optional[str] = "Group chat manager.",
        system_message: Optional[Union[str, List]] = "Group chat manager.",
        **kwargs,
    ):
        super().__init__(
@@ -256,12 +273,12 @@ class GroupChatManager(ConversableAgent):
            # set the name to speaker's name if the role is not function
            if message["role"] != "function":
                message["name"] = speaker.name
            groupchat.messages.append(message)

            groupchat.append(message)

            if self._is_termination_msg(message):
                # The conversation is over
                break

            # broadcast the message to all agents except the speaker
            for agent in groupchat.agents:
                if agent != speaker:
@@ -306,7 +323,8 @@ class GroupChatManager(ConversableAgent):
            # set the name to speaker's name if the role is not function
            if message["role"] != "function":
                message["name"] = speaker.name
            groupchat.messages.append(message)

            groupchat.append(message)

            if self._is_termination_msg(message):
                # The conversation is over
@@ -1,5 +1,6 @@
from typing import Callable, Dict, List, Literal, Optional, Union

from .conversable_agent import ConversableAgent
from typing import Callable, Dict, Literal, Optional, Union


class UserProxyAgent(ConversableAgent):
@@ -25,7 +26,7 @@ class UserProxyAgent(ConversableAgent):
        code_execution_config: Optional[Union[Dict, Literal[False]]] = None,
        default_auto_reply: Optional[Union[str, Dict, None]] = "",
        llm_config: Optional[Union[Dict, Literal[False]]] = False,
        system_message: Optional[str] = "",
        system_message: Optional[Union[str, List]] = "",
    ):
        """
        Args:
@@ -66,7 +67,7 @@ class UserProxyAgent(ConversableAgent):
                Please refer to [OpenAIWrapper.create](/docs/reference/oai/client#create)
                for available options.
                Default to false, which disables llm-based auto reply.
            system_message (str): system message for ChatCompletion inference.
            system_message (str or List): system message for ChatCompletion inference.
                Only used when llm_config is not False. Use it to reprogram the agent.
        """
        super().__init__(
@@ -38,16 +38,44 @@ PATH_SEPARATOR = WIN32 and "\\" or "/"
logger = logging.getLogger(__name__)


def content_str(content: Union[str, List]) -> str:
    if type(content) is str:
def content_str(content: Union[str, List, None]) -> str:
    """Converts `content` into a string format.

    This function processes content that may be a string, a list of mixed text and image URLs, or None,
    and converts it into a string. Text is directly appended to the result string, while image URLs are
    represented by a placeholder image token. If the content is None, an empty string is returned.

    Args:
        - content (Union[str, List, None]): The content to be processed. Can be a string, a list of dictionaries
          representing text and image URLs, or None.

    Returns:
        str: A string representation of the input content. Image URLs are replaced with an image token.

    Note:
        - The function expects each dictionary in the list to have a "type" key that is either "text" or "image_url".
          For "text" type, the "text" key's value is appended to the result. For "image_url", an image token is appended.
        - This function is useful for handling content that may include both text and image references, especially
          in contexts where images need to be represented as placeholders.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if not isinstance(content, list):
        raise TypeError(f"content must be None, str, or list, but got {type(content)}")

    rst = ""
    for item in content:
        if not isinstance(item, dict):
            raise TypeError("Wrong content format: every element should be dict if the content is a list.")
        assert "type" in item, "Wrong content format. Missing 'type' key in content's dict."
        if item["type"] == "text":
            rst += item["text"]
        else:
            assert isinstance(item, dict) and item["type"] == "image_url", "Wrong content format."
        elif item["type"] == "image_url":
            rst += "<image>"
        else:
            raise ValueError(f"Wrong content format: unknown type {item['type']} within the content")
    return rst

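To make the new validation behavior above concrete, a few illustrative calls (assuming the new `content_str` is importable):

    from autogen.code_utils import content_str

    assert content_str(None) == ""
    assert content_str("hello") == "hello"
    assert content_str(
        [{"type": "text", "text": "hi "}, {"type": "image_url", "image_url": {"url": "x"}}]
    ) == "hi <image>"

    # Invalid inputs now fail loudly with typed exceptions instead of bare asserts:
    # content_str(42)                 -> TypeError (not None, str, or list)
    # content_str(["hi"])             -> TypeError (list items must be dicts)
    # content_str([{"type": "bad"}])  -> ValueError (unknown type)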
@@ -39,7 +39,7 @@
    "import autogen\n",
    "from autogen import AssistantAgent, Agent, UserProxyAgent, ConversableAgent\n",
    "\n",
    "from autogen.img_utils import get_image_data, _to_pil\n",
    "from autogen.agentchat.contrib.img_utils import get_image_data, _to_pil\n",
    "from termcolor import colored\n",
    "import random"
]
@@ -91,38 +91,6 @@
{
    "cell_type": "code",
    "execution_count": 3,
    "id": "57462351",
    "metadata": {},
    "outputs": [
        {
            "data": {
                "text/plain": [
                    "['openai']"
                ]
            },
            "execution_count": 3,
            "metadata": {},
            "output_type": "execute_result"
        }
    ],
    "source": [
        "# Remove the `api_type` param as it is not needed for 4V\n",
        "[config.pop(\"api_type\", None) for config in config_list_4v]"
    ]
},
{
    "cell_type": "code",
    "execution_count": 4,
    "id": "e23df0dd",
    "metadata": {},
    "outputs": [],
    "source": [
        "# image_agent._oai_messages[user_proxy]"
    ]
},
{
    "cell_type": "code",
    "execution_count": 5,
    "id": "67157629",
    "metadata": {
        "scrolled": false
@@ -180,7 +148,7 @@
},
{
    "cell_type": "code",
    "execution_count": 6,
    "execution_count": 4,
    "id": "73a2b234",
    "metadata": {
        "scrolled": false
@@ -236,7 +204,7 @@
},
{
    "cell_type": "code",
    "execution_count": 7,
    "execution_count": 5,
    "id": "e8eca993",
    "metadata": {},
    "outputs": [],
@@ -339,7 +307,7 @@
},
{
    "cell_type": "code",
    "execution_count": 8,
    "execution_count": 6,
    "id": "977b9017",
    "metadata": {
        "scrolled": false
@@ -724,7 +692,7 @@
},
{
    "cell_type": "code",
    "execution_count": 9,
    "execution_count": 7,
    "id": "f0a58827",
    "metadata": {},
    "outputs": [],
@@ -736,10 +704,100 @@
{
    "cell_type": "code",
    "execution_count": null,
    "id": "b95bf449",
    "id": "c6206648",
    "metadata": {},
    "outputs": [],
    "source": []
},
{
    "cell_type": "markdown",
    "id": "a95d87c2",
    "metadata": {},
    "source": [
        "## Group Chat Example with Multimodal Agent"
    ]
},
{
    "cell_type": "code",
    "execution_count": 8,
    "id": "56bd5742",
    "metadata": {
        "scrolled": false
    },
    "outputs": [
        {
            "name": "stdout",
            "output_type": "stream",
            "text": [
                "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n",
                "\n",
                "Describe the image:\n",
                " <img https://th.bing.com/th/id/R.422068ce8af4e15b0634fe2540adea7a?rik=y4OcXBE%2fqutDOw&pid=ImgRaw&r=0>.\n",
                "\n",
                "--------------------------------------------------------------------------------\n",
                "\u001b[31m\n",
                ">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
                "\u001b[33mimage-explainer-1\u001b[0m (to chat_manager):\n",
                "\n",
                "In a soft-focus world, a caramel-colored puppy with a coat of curly fur sits serenely, its innocent eyes gazing into the distance. Adorned with a collar that hosts a vibrant, multicolored bandana and a shiny tag engraved with the name \"Webster,\" the pup exudes a sense of youthful curiosity and gentle charm. Behind this bundle of joy, the muted backdrop of a home's interior whispers tales of comfort and domesticity, with a pair of black boots resting by the door, hinting at the comings and goings of human life amidst which this little creature finds its love and belonging.\n",
                "\n",
                "--------------------------------------------------------------------------------\n",
                "\u001b[31m\n",
                ">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
                "\u001b[33mimage-explainer-2\u001b[0m (to chat_manager):\n",
                "\n",
                "The image shows a young, caramel-colored puppy with curly fur sitting on the floor. The puppy is wearing a blue collar with a colorful bandana and a tag that appears to have the name \"Webster\" engraved on it. In the background, there are black boots near a white door, suggesting an indoor, home setting. The focus is on the puppy, making the background appear softly blurred. The puppy's expression is gentle, with a hint of curiosity in its eyes.\n",
                "\n",
                "--------------------------------------------------------------------------------\n",
                "\u001b[31m\n",
                ">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
                "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n",
                "\n",
                "\n",
                "\n",
                "--------------------------------------------------------------------------------\n",
                "\u001b[31m\n",
                ">>>>>>>> USING AUTO REPLY...\u001b[0m\n",
                "\u001b[33mUser_proxy\u001b[0m (to chat_manager):\n",
                "\n",
                "\n",
                "\n",
                "--------------------------------------------------------------------------------\n"
            ]
        }
    ],
    "source": [
        "agent1 = MultimodalConversableAgent(\n",
        "    name=\"image-explainer-1\",\n",
        "    max_consecutive_auto_reply=10,\n",
        "    llm_config={\"config_list\": config_list_4v, \"temperature\": 0.5, \"max_tokens\": 300},\n",
        "    system_message=\"Your image description is poetic and engaging.\",\n",
        ")\n",
        "agent2 = MultimodalConversableAgent(\n",
        "    name=\"image-explainer-2\",\n",
        "    max_consecutive_auto_reply=10,\n",
        "    llm_config={\"config_list\": config_list_4v, \"temperature\": 0.5, \"max_tokens\": 300},\n",
        "    system_message=\"Your image description is factual and to the point.\",\n",
        ")\n",
        "\n",
        "user_proxy = autogen.UserProxyAgent(\n",
        "    name=\"User_proxy\",\n",
        "    system_message=\"Ask both image explainer 1 and 2 for their description.\",\n",
        "    human_input_mode=\"TERMINATE\",  # Try between ALWAYS, NEVER, and TERMINATE\n",
        "    max_consecutive_auto_reply=10,\n",
        ")\n",
        "\n",
        "# We set max_round to 5\n",
        "groupchat = autogen.GroupChat(agents=[agent1, agent2, user_proxy], \n",
        "                              messages=[], \n",
        "                              max_round=5)\n",
        "group_chat_manager = autogen.GroupChatManager(groupchat=groupchat, \n",
        "                                              llm_config=gpt4_llm_config)\n",
        "\n",
        "user_proxy.initiate_chat(group_chat_manager,\n",
        "                         message=f\"\"\"Describe the image:\n",
        "                         <img https://th.bing.com/th/id/R.422068ce8af4e15b0634fe2540adea7a?rik=y4OcXBE%2fqutDOw&pid=ImgRaw&r=0>.\"\"\")"
    ]
}
],
"metadata": {
@@ -10,7 +10,7 @@ import requests
try:
    from PIL import Image

    from autogen.img_utils import extract_img_paths, get_image_data, gpt4v_formatter, llava_formater
    from autogen.agentchat.contrib.img_utils import extract_img_paths, get_image_data, gpt4v_formatter, llava_formater
except ImportError:
    skip = True
else:
@@ -71,7 +71,7 @@ class TestLlavaFormater(unittest.TestCase):
        result = llava_formater(prompt)
        self.assertEqual(result, expected_output)

    @patch("autogen.img_utils.get_image_data")
    @patch("autogen.agentchat.contrib.img_utils.get_image_data")
    def test_with_images(self, mock_get_image_data):
        """
        Test the llava_formater function with a prompt containing images.
@@ -84,7 +84,7 @@ class TestLlavaFormater(unittest.TestCase):
        result = llava_formater(prompt)
        self.assertEqual(result, expected_output)

    @patch("autogen.img_utils.get_image_data")
    @patch("autogen.agentchat.contrib.img_utils.get_image_data")
    def test_with_ordered_images(self, mock_get_image_data):
        """
        Test the llava_formater function with ordered image tokens.
@@ -109,7 +109,7 @@ class TestGpt4vFormatter(unittest.TestCase):
        result = gpt4v_formatter(prompt)
        self.assertEqual(result, expected_output)

    @patch("autogen.img_utils.get_image_data")
    @patch("autogen.agentchat.contrib.img_utils.get_image_data")
    def test_with_images(self, mock_get_image_data):
        """
        Test the gpt4v_formatter function with a prompt containing images.
@@ -126,7 +126,7 @@ class TestGpt4vFormatter(unittest.TestCase):
        result = gpt4v_formatter(prompt)
        self.assertEqual(result, expected_output)

    @patch("autogen.img_utils.get_image_data")
    @patch("autogen.agentchat.contrib.img_utils.get_image_data")
    def test_multiple_images(self, mock_get_image_data):
        """
        Test the gpt4v_formatter function with a prompt containing multiple images.
@@ -79,5 +79,53 @@ class TestMultimodalConversableAgent(unittest.TestCase):
        self.agent._print_received_message.assert_called_with(message_str, sender)


@pytest.mark.skipif(skip, reason="Dependency not installed")
def test_group_chat_with_lmm():
    """
    Tests the group chat functionality with two MultimodalConversable Agents.
    Verifies that the chat is correctly limited by the max_round parameter.
    Each agent is set to describe an image in a unique style, but the chat should not exceed the specified max_rounds.
    """

    # Configuration parameters
    max_round = 5
    max_consecutive_auto_reply = 10
    llm_config = False

    # Creating two MultimodalConversable Agents with different descriptive styles
    agent1 = MultimodalConversableAgent(
        name="image-explainer-1",
        max_consecutive_auto_reply=max_consecutive_auto_reply,
        llm_config=llm_config,
        system_message="Your image description is poetic and engaging.",
    )
    agent2 = MultimodalConversableAgent(
        name="image-explainer-2",
        max_consecutive_auto_reply=max_consecutive_auto_reply,
        llm_config=llm_config,
        system_message="Your image description is factual and to the point.",
    )

    # Creating a user proxy agent for initiating the group chat
    user_proxy = autogen.UserProxyAgent(
        name="User_proxy",
        system_message="Ask both image explainer 1 and 2 for their description.",
        human_input_mode="NEVER",  # Options: 'ALWAYS' or 'NEVER'
        max_consecutive_auto_reply=max_consecutive_auto_reply,
    )

    # Setting up the group chat
    groupchat = autogen.GroupChat(agents=[agent1, agent2, user_proxy], messages=[], max_round=max_round)
    group_chat_manager = autogen.GroupChatManager(groupchat=groupchat, llm_config=llm_config)

    # Initiating the group chat and observing the number of rounds
    user_proxy.initiate_chat(group_chat_manager, message=f"What do you see? <img {base64_encoded_image}>")

    # Assertions to check if the number of rounds does not exceed max_round
    assert all(len(arr) <= max_round for arr in agent1._oai_messages.values()), "Agent 1 exceeded max rounds"
    assert all(len(arr) <= max_round for arr in agent2._oai_messages.values()), "Agent 2 exceeded max rounds"
    assert all(len(arr) <= max_round for arr in user_proxy._oai_messages.values()), "User proxy exceeded max rounds"


if __name__ == "__main__":
    unittest.main()
@@ -403,7 +403,7 @@ class TestContentStr(unittest.TestCase):

    def test_invalid_content(self):
        content = [{"type": "text", "text": "hello"}, {"type": "wrong_type", "url": "http://example.com/image.png"}]
        with self.assertRaises(AssertionError) as context:
        with self.assertRaises(ValueError) as context:
            content_str(content)
        self.assertIn("Wrong content format", str(context.exception))
