mirror of https://github.com/vllm-project/vllm
[AMD][Hardware][Misc][Bugfix] xformer cleanup and light navi logic and CI fixes and refactoring (#4129)
This commit is contained in:
parent a37d815b83
commit 95e5b087cf
@@ -15,10 +15,8 @@ steps:
   commands:
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
-  - VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_basic_correctness.py
   - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
   - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
-  - VLLM_ATTENTION_BACKEND=ROCM_FLASH pytest -v -s basic_correctness/test_chunked_prefill.py
 
 - label: Core Test
   command: pytest -v -s core
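Note on the CI commands above: each pytest invocation pins the attention implementation through the VLLM_ATTENTION_BACKEND environment variable. A minimal sketch of that env-driven selection pattern, assuming a hypothetical resolve_backend() helper (vLLM's real selector is not part of this diff):

import os

# Hypothetical sketch of env-driven backend selection; vLLM's actual
# selector lives elsewhere and is not shown in this commit.
_KNOWN_BACKENDS = ("XFORMERS", "FLASH_ATTN", "ROCM_FLASH")

def resolve_backend(default: str = "XFORMERS") -> str:
    # Read the same variable the CI commands export per test run.
    name = os.environ.get("VLLM_ATTENTION_BACKEND", default).upper()
    if name not in _KNOWN_BACKENDS:
        raise ValueError(f"Unknown attention backend: {name}")
    return name

if __name__ == "__main__":
    print(resolve_backend())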
@@ -14,7 +14,7 @@ RUN echo "Base image is $BASE_IMAGE"
 ARG FA_GFX_ARCHS="gfx90a;gfx942"
 RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
 
-ARG FA_BRANCH="3d2b6f5"
+ARG FA_BRANCH="ae7928c"
 RUN echo "FA_BRANCH is $FA_BRANCH"
 
 # whether to build flash-attention
@@ -92,13 +92,10 @@ RUN if [ "$BUILD_TRITON" = "1" ]; then \
 COPY ./ /app/vllm
 
 RUN python3 -m pip install --upgrade pip numba
-RUN python3 -m pip install xformers==0.0.23 --no-deps
 
 RUN cd /app \
     && cd vllm \
     && pip install -U -r requirements-rocm.txt \
-    && if [ "$BUILD_FA" = "1" ]; then \
-       bash patch_xformers.rocm.sh; fi \
     && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
     && python3 setup.py install \
     && cd ..
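With the xformers==0.0.23 pin and the patching step removed from the image build, a quick sanity check inside the built container can confirm which optional attention packages resolved. A hedged sketch (plain Python, not a script shipped in this diff; module names taken from elsewhere in the commit):

import importlib

# Report which optional attention-related packages are importable in the image.
for mod in ("xformers", "flash_attn", "triton"):
    try:
        m = importlib.import_module(mod)
        print(f"{mod}: {getattr(m, '__version__', 'unknown version')}")
    except ImportError:
        print(f"{mod}: not installed")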
@@ -1,33 +0,0 @@
#!/bin/bash
set -e

XFORMERS_VERSION="0.0.23"

export XFORMERS_INSTALLED_VERSION=$(python -c 'import xformers; print(xformers.__version__)')

if [ "$XFORMERS_INSTALLED_VERSION" != "$XFORMERS_VERSION" ]; then
    echo "ERROR: xformers version must be ${XFORMERS_VERSION}. ${XFORMERS_INSTALLED_VERSION} is installed"
    exit 1
fi

export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)')
export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)')

echo "XFORMERS_FMHA_FLASH_PATH = ${XFORMERS_FMHA_FLASH_PATH}"
echo "XFORMERS_FMHA_COMMON_PATH = ${XFORMERS_FMHA_COMMON_PATH}"

if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
    echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}"
    patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-${XFORMERS_VERSION}.rocm.patch"
    echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}"
else
    echo "${XFORMERS_FMHA_FLASH_PATH} was patched before"
fi

if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"; then
    echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}"
    patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-${XFORMERS_VERSION}.rocm.patch"
    echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}"
else
    echo "${XFORMERS_FMHA_COMMON_PATH} was patched before"
fi
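The deleted script made patching idempotent by first attempting a reverse dry-run (patch -R --dry-run): if the reverse applies cleanly, the patch is already in place. A minimal Python sketch of the same check, assuming patch is on PATH and using hypothetical file paths:

import subprocess

def already_patched(target: str, patch_file: str) -> bool:
    # If reversing the patch would succeed, the patch has already been applied.
    result = subprocess.run(
        ["patch", "-R", "-p0", "-s", "-f", "--dry-run", target, patch_file],
        capture_output=True,
    )
    return result.returncode == 0

def apply_once(target: str, patch_file: str) -> None:
    if already_patched(target, patch_file):
        print(f"{target} was patched before")
        return
    subprocess.run(["patch", "-p0", target, patch_file], check=True)
    print(f"Applied {patch_file} to {target}")

# Example (hypothetical paths):
# apply_once("/path/to/flash.py", "./rocm_patch/flashpy_xformers-0.0.23.rocm.patch")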
@@ -1,13 +0,0 @@
--- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/common.py	2023-11-29 03:17:03.930103539 +0000
+++ common.py	2023-11-28 16:14:19.846233146 +0000
@@ -298,8 +298,8 @@
         dtype = d.query.dtype
         if device_type not in cls.SUPPORTED_DEVICES:
             reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})")
-        if device_type == "cuda" and not _built_with_cuda:
-            reasons.append("xFormers wasn't build with CUDA support")
+        #if device_type == "cuda" and not _built_with_cuda:
+        #    reasons.append("xFormers wasn't build with CUDA support")
         if device_type == "cuda":
             device_capability = torch.cuda.get_device_capability(d.device)
             if device_capability < cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
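The common.py patch above only disabled the _built_with_cuda gate; the device-capability gate right below it stayed intact. A small illustrative sketch of that capability comparison using torch's public API (the (8, 0) minimum is taken from the flash.py context later in this diff and is assumed here, not universal):

import torch

# Illustrative minimum; operators define their own thresholds.
MIN_CAPABILITY = (8, 0)

def capability_reasons(device: torch.device) -> list:
    reasons = []
    if device.type == "cuda":
        capability = torch.cuda.get_device_capability(device)
        if capability < MIN_CAPABILITY:
            reasons.append(
                f"compute capability {capability} < required {MIN_CAPABILITY}")
    return reasons

if __name__ == "__main__" and torch.cuda.is_available():
    print(capability_reasons(torch.device("cuda:0")))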
@@ -1,152 +0,0 @@
--- flash_ori.py	2023-12-13 05:43:31.530752623 +0000
+++ flash_patch.py	2023-12-13 06:00:45.962403104 +0000
@@ -36,44 +36,44 @@
 
 FLASH_VERSION = "0.0.0"
 try:
-    try:
-        from ... import _C_flashattention  # type: ignore[attr-defined]
-        from ..._cpp_lib import _build_metadata
-
-        if _build_metadata is not None:
-            FLASH_VERSION = _build_metadata.flash_version
-    except ImportError:
-        import flash_attn
-        from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
-
-        FLASH_VERSION = flash_attn.__version__
-        flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
-        if (
-            flash_ver_parsed != (2, 3, 6)
-            and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1"
-        ):
-            raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api")
+    #try:
+    #    from ... import _C_flashattention  # type: ignore[attr-defined]
+    #    from ..._cpp_lib import _build_metadata
+
+    #    if _build_metadata is not None:
+    #        FLASH_VERSION = _build_metadata.flash_version
+    #except ImportError:
+    import flash_attn
+    from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
+
+    FLASH_VERSION = flash_attn.__version__
+    # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:3])
+    # if (
+    #     flash_ver_parsed != (2, 3, 6)
+    #     and os.environ.get("XFORMERS_IGNORE_FLASH_VERSION_CHECK", "0") != "1"
+    # ):
+    #     raise ImportError("Requires Flash attention 2.3.6 for varlen_fwd api")
 
     # create library so that flash-attn goes through the PyTorch Dispatcher
-    _flash_lib = torch.library.Library("xformers_flash", "DEF")
-
-    _flash_lib.define(
-        "flash_fwd(Tensor query, Tensor key, Tensor value, "
-        "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, "
-        "int max_seqlen_q, int max_seqlen_k, "
-        "float p, float softmax_scale, "
-        "bool is_causal, int window_left, "
-        "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)"
-    )
+    #_flash_lib = torch.library.Library("xformers_flash", "DEF")
 
-    _flash_lib.define(
-        "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
-        "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
-        "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
-        "int max_seqlen_q, int max_seqlen_k, "
-        "float p, float softmax_scale, bool is_causal, "
-        "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
-    )
+    #_flash_lib.define(
+    #    "flash_fwd(Tensor query, Tensor key, Tensor value, "
+    #    "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, Tensor? seqused_k, "
+    #    "int max_seqlen_q, int max_seqlen_k, "
+    #    "float p, float softmax_scale, "
+    #    "bool is_causal, int window_left, "
+    #    "int window_right, bool return_softmax) -> (Tensor, Tensor, Tensor)"
+    #)
+
+    #_flash_lib.define(
+    #    "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
+    #    "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
+    #    "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
+    #    "int max_seqlen_q, int max_seqlen_k, "
+    #    "float p, float softmax_scale, bool is_causal, "
+    #    "int window_left, int window_right, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
+    #)
 
     def _flash_fwd(
         query,
@@ -111,8 +111,8 @@
             p,
             softmax_scale,
             is_causal,
-            window_left,  # window_size_left
-            window_right,  # window_size_right
+            # window_left,  # window_size_left
+            # window_right,  # window_size_right
             return_softmax,
             None,  # rng
         )
@@ -134,15 +134,15 @@
             out,
             cu_seq_lens_q,
             cu_seq_lens_k,
-            seqused_k,
+            # seqused_k,
             max_seq_len_q,
             max_seq_len_k,
             p,
             softmax_scale,
             False,
             is_causal,
-            window_left,
-            window_right,
+            # window_left,
+            # window_right,
             return_softmax,
             None,
         )
@@ -184,8 +184,8 @@
             p,
             softmax_scale,
             is_causal,
-            window_left,
-            window_right,
+            # window_left,
+            # window_right,
             None,
             rng_state,
         )
@@ -208,15 +208,15 @@
             softmax_scale,
             False,  # zero_tensors
             is_causal,
-            window_left,
-            window_right,
+            # window_left,
+            # window_right,
             None,
             rng_state,
         )
         return dq, dk, dv
 
-    _flash_lib.impl("flash_fwd", _flash_fwd, "CUDA")
-    _flash_lib.impl("flash_bwd", _flash_bwd, "CUDA")
+    #_flash_lib.impl("flash_fwd", _flash_fwd, "CUDA")
+    #_flash_lib.impl("flash_bwd", _flash_bwd, "CUDA")
 except ImportError:
     pass
 
@@ -400,7 +400,7 @@
     implementation.
     """
 
-    OPERATOR = get_operator("xformers_flash", "flash_fwd")
+    OPERATOR = _flash_fwd  # get_operator("xformers_flash", "flash_fwd")
     SUPPORTED_DEVICES: Set[str] = {"cuda"}
     CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0)
     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
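The lines this patch commented out registered flash-attn's forward and backward through the PyTorch dispatcher via torch.library. A self-contained toy sketch of that registration pattern (the op name and schema here are illustrative, not the xformers schema):

import torch

# Toy example of the torch.library pattern the patch disabled:
# define a schema, register an implementation, then call via torch.ops.
_lib = torch.library.Library("demo_flash", "DEF")
_lib.define("scale(Tensor x, float alpha) -> Tensor")

def _scale_impl(x: torch.Tensor, alpha: float) -> torch.Tensor:
    return x * alpha

_lib.impl("scale", _scale_impl, "CPU")
if torch.cuda.is_available():
    _lib.impl("scale", _scale_impl, "CUDA")

if __name__ == "__main__":
    out = torch.ops.demo_flash.scale(torch.ones(3), 2.0)
    print(out)  # tensor([2., 2., 2.])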
@@ -154,25 +154,30 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                 f"Head size {head_size} is not supported by PagedAttention. "
                 f"Supported head sizes are: {suppored_head_sizes}.")
 
-        self.use_naive_attn = torch.cuda.get_device_capability()[0] != 9
+        self.use_naive_attn = False
         # NOTE: Allow for switching between Triton and CK. Defaulting to triton.
         self.use_triton_flash_attn = (os.environ.get(
             "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1"))
-        if self.use_naive_attn:
-            # AMD Radeon 7900 series (gfx1100) currently does not support
-            # xFormers nor FlashAttention. As a temporary workaround, we use
-            # naive PyTorch implementation of attention.
-            self.attn_fuc = _naive_attention
-            logger.debug("Using naive attention in ROCmBackend")
-        elif self.use_triton_flash_attn:
+        if self.use_triton_flash_attn:
             from vllm.attention.ops.triton_flash_attention import (  # noqa: F401
                 triton_attention)
             self.attn_func = triton_attention
             logger.debug("Using Triton FA in ROCmBackend")
         else:
-            from flash_attn import flash_attn_varlen_func  # noqa: F401
-            self.attn_func = flash_attn_varlen_func
-            logger.debug("Using CK FA in ROCmBackend")
+            # if not using triton, navi3x not use flash-attn either
+            if torch.cuda.get_device_capability()[0] == 11:
+                self.use_naive_attn = True
+            else:
+                try:
+                    from flash_attn import flash_attn_varlen_func  # noqa: F401
+                    self.attn_func = flash_attn_varlen_func
+                    logger.debug("Using CK FA in ROCmBackend")
+                except ModuleNotFoundError:
+                    self.use_naive_attn = True
+
+            if self.use_naive_attn:
+                self.attn_func = _naive_attention
+                logger.debug("Using naive attention in ROCmBackend")
 
     def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor:
         """torch.repeat_interleave(x, dim=1, repeats=n_rep)"""
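The new constructor logic above boils down to: prefer Triton FA when VLLM_USE_TRITON_FLASH_ATTN is truthy, otherwise skip CK flash-attn on navi3x (device capability major 11) and try to import it elsewhere, falling back to naive attention if the import fails. A condensed sketch of that decision as a standalone helper (placeholder fallback; imports mirror the diff):

import os
import torch

def _naive_attention(*args, **kwargs):
    # Placeholder standing in for vLLM's naive attention fallback.
    raise NotImplementedError

def select_rocm_attn_func():
    use_triton = os.environ.get(
        "VLLM_USE_TRITON_FLASH_ATTN", "True").lower() in ("true", "1")
    if use_triton:
        from vllm.attention.ops.triton_flash_attention import triton_attention
        return triton_attention
    # navi3x (gfx11xx) reports a major capability of 11 and cannot use CK FA.
    if torch.cuda.get_device_capability()[0] == 11:
        return _naive_attention
    try:
        from flash_attn import flash_attn_varlen_func
        return flash_attn_varlen_func
    except ModuleNotFoundError:
        return _naive_attention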
@@ -247,13 +252,13 @@ class ROCmFlashAttentionImpl(AttentionImpl):
                 # triton attention
                 # When block_tables are not filled, it means q and k are the
                 # prompt, and they have the same length.
-                if self.use_naive_attn or self.use_triton_flash_attn:
+                if self.use_triton_flash_attn or self.use_naive_attn:
                     if self.num_kv_heads != self.num_heads:
                         # Interleave for MQA workaround.
                         key = self.repeat_kv(key, self.num_queries_per_kv)
                         value = self.repeat_kv(value, self.num_queries_per_kv)
                     if self.use_naive_attn:
-                        out = self.attn_fuc(
+                        out = self.attn_func(
                             query,
                             key,
                             value,
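repeat_kv, used above for the MQA/GQA workaround, expands the key/value heads so they match the query head count before the Triton or naive kernels run. A small sketch with illustrative shapes:

import torch

def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    # Equivalent to torch.repeat_interleave(x, dim=1, repeats=n_rep),
    # expanding [num_tokens, num_kv_heads, head_dim] along the head axis.
    return torch.repeat_interleave(x, repeats=n_rep, dim=1)

# Example: 2 KV heads serving 8 query heads -> repeat each KV head 4 times.
key = torch.randn(16, 2, 128)           # [num_tokens, num_kv_heads, head_dim]
num_queries_per_kv = 8 // 2
expanded = repeat_kv(key, num_queries_per_kv)
print(expanded.shape)                    # torch.Size([16, 8, 128])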