[X86] Support Intel avxvnni

This patch mainly makes the following changes:

1. Support AVX-VNNI instructions;
2. Introduce an ExplicitVEXPrefix flag so that the vpdpbusd/vpdpbusds/vpdpwssd/vpdpwssds instructions use VEX encoding only when the user explicitly adds a {vex} prefix (a brief usage sketch follows below).
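As a rough illustration (not part of the patch): the snippet below uses one of the new VEX-encoded VNNI intrinsics that this change exposes through <immintrin.h>. Built with `clang -O2 -mavxvnni`, it should select the VEX form of VPDPBUSD, which the assembly printer emits with the {vex} pseudo-prefix. The helper name dot_accumulate is made up for the example.

#include <immintrin.h>

/* Accumulate u8 x s8 dot products into 32-bit lanes using the new
   AVX-VNNI intrinsic from avxvnniintrin.h (pulled in via immintrin.h). */
__m256i dot_accumulate(__m256i acc, __m256i u8s, __m256i s8s)
{
    /* acc.dword[j] += sum of four unsigned(A) * signed(B) byte products */
    return _mm256_dpbusd_avx_epi32(acc, u8s, s8s);
}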

Differential Revision: https://reviews.llvm.org/D89105
Liu, Chen3 2020-10-30 12:58:05 +08:00
parent d11710dae6
commit 756f597841
41 changed files with 2603 additions and 74 deletions

View File

@ -3253,6 +3253,8 @@ X86
.. option:: -mavx512vpopcntdq, -mno-avx512vpopcntdq
.. option:: -mavxvnni, -mno-avxvnni
.. option:: -mbmi, -mno-bmi
.. option:: -mbmi2, -mno-bmi2

View File

@ -205,6 +205,8 @@ X86 Support in Clang
- Support for ``UINTR`` instructions has been added.
- Support for ``AVXVNNI`` instructions has been added.
Internal API Changes
--------------------

View File

@ -960,17 +960,17 @@ TARGET_BUILTIN(__builtin_ia32_alignq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl
TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "ncV:512:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "ncV:512:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")

View File

@ -3235,6 +3235,8 @@ def mavx512vpopcntdq : Flag<["-"], "mavx512vpopcntdq">, Group<m_x86_Features_Gro
def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>;
def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group<m_x86_Features_Group>;
def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group<m_x86_Features_Group>;
def mavxvnni : Flag<["-"], "mavxvnni">, Group<m_x86_Features_Group>;
def mno_avxvnni : Flag<["-"], "mno-avxvnni">, Group<m_x86_Features_Group>;
def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
def mno_adx : Flag<["-"], "mno-adx">, Group<m_x86_Features_Group>;
def maes : Flag<["-"], "maes">, Group<m_x86_Features_Group>;

View File

@ -306,6 +306,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXINT8 = true;
} else if (Feature == "+amx-tile") {
HasAMXTILE = true;
} else if (Feature == "+avxvnni") {
HasAVXVNNI = true;
} else if (Feature == "+serialize") {
HasSERIALIZE = true;
} else if (Feature == "+tsxldtrk") {
@ -728,6 +730,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMXINT8__");
if (HasAMXBF16)
Builder.defineMacro("__AMXBF16__");
if (HasAVXVNNI)
Builder.defineMacro("__AVXVNNI__");
if (HasSERIALIZE)
Builder.defineMacro("__SERIALIZE__");
if (HasTSXLDTRK)
@ -846,6 +850,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avx512vbmi2", true)
.Case("avx512ifma", true)
.Case("avx512vp2intersect", true)
.Case("avxvnni", true)
.Case("bmi", true)
.Case("bmi2", true)
.Case("cldemote", true)
@ -918,6 +923,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("amx-bf16", HasAMXBF16)
.Case("amx-int8", HasAMXINT8)
.Case("amx-tile", HasAMXTILE)
.Case("avxvnni", HasAVXVNNI)
.Case("avx", SSELevel >= AVX)
.Case("avx2", SSELevel >= AVX2)
.Case("avx512f", SSELevel >= AVX512F)

View File

@ -130,6 +130,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasKL = false; // For key locker
bool HasWIDEKL = false; // For wide key locker
bool HasHRESET = false;
bool HasAVXVNNI = false;
bool HasAMXTILE = false;
bool HasAMXINT8 = false;
bool HasAMXBF16 = false;

View File

@ -35,6 +35,7 @@ set(files
avx512vnniintrin.h
avx512vlvnniintrin.h
avxintrin.h
avxvnniintrin.h
bmi2intrin.h
bmiintrin.h
__clang_cuda_builtin_vars.h

View File

@ -18,13 +18,157 @@
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpbusd_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpbusds_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpwssd_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpwssds_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpbusd_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpbusds_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpwssd_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpwssds_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
@ -42,13 +186,6 @@ _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A,
(__v8si)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
@ -65,13 +202,6 @@ _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A,
(__v8si)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
@ -88,13 +218,6 @@ _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A,
(__v8si)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
@ -111,13 +234,6 @@ _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A,
(__v4si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
@ -134,13 +250,6 @@ _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A,
(__v4si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
@ -157,13 +266,6 @@ _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A,
(__v4si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
@ -180,13 +282,6 @@ _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A,
(__v4si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{

View File

@ -0,0 +1,225 @@
/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVXVNNIINTRIN_H
#define __AVXVNNIINTRIN_H
/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */
/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
/* Intrinsics with _avx_ prefix are for compatibility with msvc. */
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXVNNIINTRIN_H
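For reference, here is a scalar sketch of the \operation pseudocode for the non-saturating 128-bit VPDPBUSD form documented above. It is illustrative only and not part of the header; the function name is made up.

#include <stdint.h>

/* Scalar model of _mm_dpbusd_avx_epi32: for each 32-bit lane, four
   unsigned(A) x signed(B) byte products are summed into S (wrapping
   32-bit arithmetic; the "s" variants would saturate instead).       */
static void dpbusd_ref_128(int32_t dst[4], const int32_t s[4],
                           const uint8_t a[16], const int8_t b[16])
{
    for (int j = 0; j < 4; ++j) {
        int32_t sum = s[j];
        for (int k = 0; k < 4; ++k)
            sum += (int32_t)a[4 * j + k] * (int32_t)b[4 * j + k];
        dst[j] = sum;
    }
}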

View File

@ -196,6 +196,7 @@
#define bit_AMXINT8 0x02000000
/* Features in %eax for leaf 7 sub-leaf 1 */
#define bit_AVXVNNI 0x00000010
#define bit_AVX512BF16 0x00000020
#define bit_HRESET 0x00400000

View File

@ -145,6 +145,11 @@
#include <avx512vlvnniintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVXVNNI__)
#include <avxvnniintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVX512DQ__)
#include <avx512dqintrin.h>

View File

@ -0,0 +1,99 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror | FileCheck %s
#include <immintrin.h>
__m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpbusd_epi32
// CHECK: @llvm.x86.avx512.vpdpbusd.256
return _mm256_dpbusd_epi32(__S, __A, __B);
}
__m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpbusds_epi32
// CHECK: @llvm.x86.avx512.vpdpbusds.256
return _mm256_dpbusds_epi32(__S, __A, __B);
}
__m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpwssd_epi32
// CHECK: @llvm.x86.avx512.vpdpwssd.256
return _mm256_dpwssd_epi32(__S, __A, __B);
}
__m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpwssds_epi32
// CHECK: @llvm.x86.avx512.vpdpwssds.256
return _mm256_dpwssds_epi32(__S, __A, __B);
}
__m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpbusd_epi32
// CHECK: @llvm.x86.avx512.vpdpbusd.128
return _mm_dpbusd_epi32(__S, __A, __B);
}
__m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpbusds_epi32
// CHECK: @llvm.x86.avx512.vpdpbusds.128
return _mm_dpbusds_epi32(__S, __A, __B);
}
__m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpwssd_epi32
// CHECK: @llvm.x86.avx512.vpdpwssd.128
return _mm_dpwssd_epi32(__S, __A, __B);
}
__m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpwssds_epi32
// CHECK: @llvm.x86.avx512.vpdpwssds.128
return _mm_dpwssds_epi32(__S, __A, __B);
}
__m256i test_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpbusd_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpbusd.256
return _mm256_dpbusd_avx_epi32(__S, __A, __B);
}
__m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpbusds_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpbusds.256
return _mm256_dpbusds_avx_epi32(__S, __A, __B);
}
__m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpwssd_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpwssd.256
return _mm256_dpwssd_avx_epi32(__S, __A, __B);
}
__m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpwssds_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpwssds.256
return _mm256_dpwssds_avx_epi32(__S, __A, __B);
}
__m128i test_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpbusd_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpbusd.128
return _mm_dpbusd_avx_epi32(__S, __A, __B);
}
__m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpbusds_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpbusds.128
return _mm_dpbusds_avx_epi32(__S, __A, __B);
}
__m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpwssd_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpwssd.128
return _mm_dpwssd_avx_epi32(__S, __A, __B);
}
__m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpwssds_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpwssds.128
return _mm_dpwssds_avx_epi32(__S, __A, __B);
}

View File

@ -54,9 +54,9 @@ void __attribute__((target("arch=x86-64-v4"))) x86_64_v4() {}
// CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686"
// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK-NOT: tune-cpu
// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
// CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
// CHECK-NOT: tune-cpu
// CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx"

View File

@ -288,3 +288,8 @@
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-uintr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-UINTR %s
// UINTR: "-target-feature" "+uintr"
// NO-UINTR: "-target-feature" "-uintr"
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavxvnni %s -### -o %t.o 2>&1 | FileCheck --check-prefix=AVX-VNNI %s
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avxvnni %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-AVX-VNNI %s
// AVX-VNNI: "-target-feature" "+avxvnni"
// NO-AVX-VNNI: "-target-feature" "-avxvnni"

View File

@ -1654,6 +1654,7 @@
// CHECK_SPR_M32: #define __AVX512VL__ 1
// CHECK_SPR_M32: #define __AVX512VNNI__ 1
// CHECK_SPR_M32: #define __AVX512VPOPCNTDQ__ 1
// CHECK_SPR_M32: #define __AVXVNNI__ 1
// CHECK_SPR_M32: #define __AVX__ 1
// CHECK_SPR_M32: #define __BMI2__ 1
// CHECK_SPR_M32: #define __BMI__ 1
@ -1724,6 +1725,7 @@
// CHECK_SPR_M64: #define __AVX512VL__ 1
// CHECK_SPR_M64: #define __AVX512VNNI__ 1
// CHECK_SPR_M64: #define __AVX512VPOPCNTDQ__ 1
// CHECK_SPR_M64: #define __AVXVNNI__ 1
// CHECK_SPR_M64: #define __AVX__ 1
// CHECK_SPR_M64: #define __BMI2__ 1
// CHECK_SPR_M64: #define __BMI__ 1
@ -1782,6 +1784,7 @@
// CHECK_ADL_M32: #define __AES__ 1
// CHECK_ADL_M32: #define __AVX2__ 1
// CHECK_ADL_M32-NOT: AVX512
// CHECK_ADL_M32: #define __AVXVNNI__ 1
// CHECK_ADL_M32: #define __AVX__ 1
// CHECK_ADL_M32: #define __BMI2__ 1
// CHECK_ADL_M32: #define __BMI__ 1
@ -1822,6 +1825,7 @@
// CHECK_ADL_M64: #define __AES__ 1
// CHECK_ADL_M64: #define __AVX2__ 1
// CHECK_ADL_M64-NOT: AVX512
// CHECK_ADL_M64: #define __AVXVNNI__ 1
// CHECK_ADL_M64: #define __AVX__ 1
// CHECK_ADL_M64: #define __BMI2__ 1
// CHECK_ADL_M64: #define __BMI__ 1

View File

@ -544,3 +544,17 @@
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-uintr -x c -E -dM -o - %s | FileCheck -check-prefix=NOUINTR %s
// NOUINTR-NOT: #define __UINTR__ 1
// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s
// AVXVNNI: #define __AVX2__ 1
// AVXVNNI: #define __AVXVNNI__ 1
// RUN: %clang -target i386-unknown-unknown -march=atom -mno-avxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOAVXVNNI %s
// NOAVXVNNI-NOT: #define __AVXVNNI__ 1
// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -mno-avx2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNINOAVX2 %s
// AVXVNNINOAVX2-NOT: #define __AVX2__ 1
// AVXVNNINOAVX2-NOT: #define __AVXVNNI__ 1

View File

@ -121,6 +121,7 @@ During this release ...
the target CPU.
* Support for ``HRESET`` instructions has been added.
* Support for ``UINTR`` instructions has been added.
* Support for ``AVXVNNI`` instructions has been added.
Changes to the AMDGPU Target
-----------------------------

View File

@ -190,6 +190,7 @@ X86_FEATURE (XSAVEC, "xsavec")
X86_FEATURE (XSAVEOPT, "xsaveopt")
X86_FEATURE (XSAVES, "xsaves")
X86_FEATURE (HRESET, "hreset")
X86_FEATURE (AVXVNNI, "avxvnni")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")

View File

@ -1497,6 +1497,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["amx-int8"] = HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave;
bool HasLeaf7Subleaf1 =
MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX);
Features["avxvnni"] = HasLeaf7Subleaf1 && ((EAX >> 4) & 1) && HasAVXSave;
Features["avx512bf16"] = HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save;
Features["hreset"] = HasLeaf7Subleaf1 && ((EAX >> 22) & 1);

View File

@ -205,10 +205,10 @@ constexpr FeatureBitset FeaturesSapphireRapids =
FeatureAVX512BF16 | FeatureAVX512VP2INTERSECT | FeatureCLDEMOTE |
FeatureENQCMD | FeatureMOVDIR64B | FeatureMOVDIRI | FeaturePTWRITE |
FeatureSERIALIZE | FeatureSHSTK | FeatureTSXLDTRK | FeatureUINTR |
FeatureWAITPKG;
FeatureWAITPKG | FeatureAVXVNNI;
constexpr FeatureBitset FeaturesAlderlake =
FeaturesSkylakeClient | FeatureCLDEMOTE | FeatureHRESET | FeaturePTWRITE |
FeatureSERIALIZE | FeatureWAITPKG;
FeatureSERIALIZE | FeatureWAITPKG | FeatureAVXVNNI;
// Intel Atom processors.
// Bonnell has feature parity with Core2 and adds MOVBE.
@ -575,6 +575,9 @@ constexpr FeatureBitset ImpliedFeaturesHRESET = {};
constexpr FeatureBitset ImpliedFeaturesKL = FeatureSSE2;
constexpr FeatureBitset ImpliedFeaturesWIDEKL = FeatureKL;
// AVXVNNI Features
constexpr FeatureBitset ImpliedFeaturesAVXVNNI = FeatureAVX2;
constexpr FeatureInfo FeatureInfos[X86::CPU_FEATURE_MAX] = {
#define X86_FEATURE(ENUM, STR) {{STR}, ImpliedFeatures##ENUM},
#include "llvm/Support/X86TargetParser.def"

View File

@ -3845,6 +3845,13 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
(MCID.TSFlags & X86II::EncodingMask) != X86II::VEX)
return Match_Unsupported;
// These instructions are only available with {vex}, {vex2} or {vex3} prefix
if (MCID.TSFlags & X86II::ExplicitVEXPrefix &&
(ForcedVEXEncoding != VEXEncoding_VEX &&
ForcedVEXEncoding != VEXEncoding_VEX2 &&
ForcedVEXEncoding != VEXEncoding_VEX3))
return Match_Unsupported;
// These instructions match ambiguously with their VEX encoded counterparts
// and appear first in the matching table. Reject them unless we're forcing
// EVEX encoding.

View File

@ -952,7 +952,11 @@ namespace X86II {
// NOTRACK prefix
NoTrackShift = EVEX_RCShift + 1,
NOTRACK = 1ULL << NoTrackShift
NOTRACK = 1ULL << NoTrackShift,
// Force VEX encoding
ExplicitVEXShift = NoTrackShift + 1,
ExplicitVEXPrefix = 1ULL << ExplicitVEXShift
};
/// \returns true if the instruction with given opcode is a prefix.

View File

@ -348,7 +348,7 @@ void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
O << "\trep\t";
// These all require a pseudo prefix
if (Flags & X86::IP_USE_VEX)
if ((Flags & X86::IP_USE_VEX) || (TSFlags & X86II::ExplicitVEXPrefix))
O << "\t{vex}";
else if (Flags & X86::IP_USE_VEX2)
O << "\t{vex2}";

View File

@ -171,6 +171,9 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
"Enable AVX-512 Vector Neural Network Instructions",
[FeatureAVX512]>;
def FeatureAVXVNNI : SubtargetFeature<"avxvnni", "HasAVXVNNI", "true",
"Support AVX_VNNI encoding",
[FeatureAVX2]>;
def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true",
"Support bfloat16 floating point",
[FeatureBWI]>;
@ -769,6 +772,7 @@ def ProcessorFeatures {
FeatureCLDEMOTE,
FeatureWAITPKG,
FeaturePTWRITE,
FeatureAVXVNNI,
FeatureTSXLDTRK,
FeatureENQCMD,
FeatureSHSTK,
@ -781,7 +785,8 @@ def ProcessorFeatures {
!listconcat(ICXFeatures, SPRAdditionalFeatures);
// Alderlake
list<SubtargetFeature> ADLAdditionalFeatures = [FeatureCLDEMOTE,
list<SubtargetFeature> ADLAdditionalFeatures = [FeatureAVXVNNI,
FeatureCLDEMOTE,
FeatureHRESET,
FeaturePTWRITE,
FeatureSERIALIZE,

View File

@ -85,6 +85,8 @@ public:
private:
/// Machine instruction info used throughout the class.
const X86InstrInfo *TII = nullptr;
const X86Subtarget *ST = nullptr;
};
} // end anonymous namespace
@ -94,8 +96,8 @@ char EvexToVexInstPass::ID = 0;
bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
if (!ST.hasAVX512())
ST = &MF.getSubtarget<X86Subtarget>();
if (!ST->hasAVX512())
return false;
bool Changed = false;
@ -144,10 +146,29 @@ static bool usesExtendedRegister(const MachineInstr &MI) {
}
// Do any custom cleanup needed to finalize the conversion.
static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc,
const X86Subtarget *ST) {
(void)NewOpc;
unsigned Opc = MI.getOpcode();
switch (Opc) {
case X86::VPDPBUSDSZ256m:
case X86::VPDPBUSDSZ256r:
case X86::VPDPBUSDSZ128m:
case X86::VPDPBUSDSZ128r:
case X86::VPDPBUSDZ256m:
case X86::VPDPBUSDZ256r:
case X86::VPDPBUSDZ128m:
case X86::VPDPBUSDZ128r:
case X86::VPDPWSSDSZ256m:
case X86::VPDPWSSDSZ256r:
case X86::VPDPWSSDSZ128m:
case X86::VPDPWSSDSZ128r:
case X86::VPDPWSSDZ256m:
case X86::VPDPWSSDZ256r:
case X86::VPDPWSSDZ128m:
case X86::VPDPWSSDZ128r:
// These can only VEX convert if AVXVNNI is enabled.
return ST->hasAVXVNNI();
case X86::VALIGNDZ128rri:
case X86::VALIGNDZ128rmi:
case X86::VALIGNQZ128rri:
@ -259,7 +280,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
if (usesExtendedRegister(MI))
return false;
if (!performCustomAdjustments(MI, NewOpc))
if (!performCustomAdjustments(MI, NewOpc, ST))
return false;
MI.setDesc(TII->get(NewOpc));

View File

@ -3748,18 +3748,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
{ X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
{ X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
{ X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 },
{ X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 },
{ X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 },
{ X86::VPDPBUSDSZr, X86::VPDPBUSDSZm, 0 },
{ X86::VPDPBUSDSrr, X86::VPDPBUSDSrm, 0 },
{ X86::VPDPBUSDYrr, X86::VPDPBUSDYrm, 0 },
{ X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128m, 0 },
{ X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 },
{ X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 },
{ X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 },
{ X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 },
{ X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 },
{ X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 },
{ X86::VPDPWSSDSZr, X86::VPDPWSSDSZm, 0 },
{ X86::VPDPWSSDSrr, X86::VPDPWSSDSrm, 0 },
{ X86::VPDPWSSDYrr, X86::VPDPWSSDYrm, 0 },
{ X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128m, 0 },
{ X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0 },
{ X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0 },
{ X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0 },
{ X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
{ X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
{ X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },

View File

@ -264,6 +264,9 @@ class NotMemoryFoldable { bit isMemoryFoldable = 0; }
// Prevent EVEX->VEX conversion from considering this instruction.
class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; }
// Force the instruction to use VEX encoding.
class ExplicitVEXPrefix { bit ExplicitVEXPrefix = 1; }
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
string AsmStr, Domain d = GenericDomain>
: Instruction {
@ -348,6 +351,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction?
bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.
bit ExplicitVEXPrefix = 0; // Force the instruction to use VEX encoding.
// TSFlags layout should be kept in sync with X86BaseInfo.h.
let TSFlags{6-0} = FormBits;
@ -376,6 +380,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{51-45} = CD8_Scale;
let TSFlags{52} = hasEVEX_RC;
let TSFlags{53} = hasNoTrackPrefix;
let TSFlags{54} = ExplicitVEXPrefix;
}
class PseudoI<dag oops, dag iops, list<dag> pattern>

View File

@ -2568,6 +2568,10 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case X86::VPTERNLOGQZ256rmbikz:
case X86::VPTERNLOGQZrmbikz:
return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
case X86::VPDPWSSDYrr:
case X86::VPDPWSSDrr:
case X86::VPDPWSSDSYrr:
case X86::VPDPWSSDSrr:
case X86::VPDPWSSDZ128r:
case X86::VPDPWSSDZ128rk:
case X86::VPDPWSSDZ128rkz:

View File

@ -910,6 +910,8 @@ def PKU : Predicate<"Subtarget->hasPKU()">;
def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
def HasBF16 : Predicate<"Subtarget->hasBF16()">;
def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">;
def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;

View File

@ -7164,6 +7164,48 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
int_x86_avx_maskstore_pd_256,
WriteFMaskMove64, WriteFMaskMove64Y>;
//===----------------------------------------------------------------------===//
// AVX_VNNI
//===----------------------------------------------------------------------===//
let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in
multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
bit IsCommutable> {
let isCommutable = IsCommutable in
def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
VR128:$src2, VR128:$src3)))]>,
VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
(loadv4i32 addr:$src3))))]>,
VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
let isCommutable = IsCommutable in
def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
VR256:$src2, VR256:$src3)))]>,
VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
(loadv8i32 addr:$src3))))]>,
VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
}
defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix;
defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix;
defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>, ExplicitVEXPrefix;
defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix;
//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//

View File

@ -355,6 +355,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// Processor has AVX-512 Vector Neural Network Instructions
bool HasVNNI = false;
/// Processor has AVX Vector Neural Network Instructions
bool HasAVXVNNI = false;
/// Processor has AVX-512 bfloat16 floating-point extensions
bool HasBF16 = false;
@ -750,6 +753,7 @@ public:
bool useRetpolineIndirectBranches() const {
return UseRetpolineIndirectBranches;
}
bool hasAVXVNNI() const { return HasAVXVNNI; }
bool hasAMXTILE() const { return HasAMXTILE; }
bool hasAMXBF16() const { return HasAMXBF16; }
bool hasAMXINT8() const { return HasAMXINT8; }

View File

@ -0,0 +1,133 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI
declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x50,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusd_256:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x50,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
ret <8 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x50,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusd_128:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x50,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
ret <4 x i32> %res
}
declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x51,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusds_256:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x51,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
ret <8 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x51,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusds_128:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x51,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
ret <4 x i32> %res
}
declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssd_256:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x52,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
ret <8 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssd_128:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x52,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
ret <4 x i32> %res
}
declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssds_256:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x53,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
ret <8 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssds_128:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x53,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
ret <4 x i32> %res
}

View File

@ -0,0 +1,242 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnni < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
ret <8 x i32> %2
}
define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
ret <8 x i32> %2
}
define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
ret <8 x i32> %2
}
define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
ret <8 x i32> %2
}
define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusd_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; CHECK-NEXT: {vex} vpdpbusd %xmm1, %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusd_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
ret <8 x i32> %2
}
define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusd_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; CHECK-NEXT: {vex} vpdpbusd %ymm1, %ymm2, %ymm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
ret <8 x i32> %2
}
define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusds:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusds_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; CHECK-NEXT: {vex} vpdpbusds %xmm1, %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
ret <8 x i32> %2
}
define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusds_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; CHECK-NEXT: {vex} vpdpbusds %ymm1, %ymm2, %ymm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
ret <8 x i32> %2
}
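
One pattern worth noting in the checks above (an observation about the expected output, not an addition to the test): vpdpwssd and vpdpwssds multiply signed words by signed words, so their two multiplicand operands can be swapped and the spilled operand still folds straight into the instruction, whereas vpdpbusd and vpdpbusds multiply unsigned bytes from one source by signed bytes from the other, so the commuted variants must first reload the spill into a register:

# commutable: the memory operand folds even in the *_commuted tests
{vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
# not commutable: the commuted vpdpbusd tests reload, then use the register form
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
{vex} vpdpbusd %xmm1, %xmm2, %xmm0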

View File

@ -0,0 +1,170 @@
# RUN: llvm-mc --disassemble %s -triple=i686-apple-darwin9 | FileCheck %s
# CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xf4
# CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xf4
# CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd (%eax), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0x30
# CHECK: {vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusd 4064(%ecx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusd -4096(%edx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd (%eax), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0x30
# CHECK: {vex} vpdpbusd -512(,%ebp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusd 2032(%ecx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusd -2048(%edx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xf4
# CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xf4
# CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds (%eax), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0x30
# CHECK: {vex} vpdpbusds -1024(,%ebp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusds 4064(%ecx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusds -4096(%edx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds (%eax), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0x30
# CHECK: {vex} vpdpbusds -512(,%ebp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusds 2032(%ecx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusds -2048(%edx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xf4
# CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xf4
# CHECK: {vex} vpdpwssd 268435456(%esp,%esi,8), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd 291(%edi,%eax,4), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd (%eax), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0x30
# CHECK: {vex} vpdpwssd -1024(,%ebp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssd 4064(%ecx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssd -4096(%edx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssd 268435456(%esp,%esi,8), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd 291(%edi,%eax,4), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd (%eax), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0x30
# CHECK: {vex} vpdpwssd -512(,%ebp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssd 2032(%ecx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssd -2048(%edx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xf4
# CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xf4
# CHECK: {vex} vpdpwssds 268435456(%esp,%esi,8), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds 291(%edi,%eax,4), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds (%eax), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0x30
# CHECK: {vex} vpdpwssds -1024(,%ebp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssds 4064(%ecx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssds -4096(%edx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssds 268435456(%esp,%esi,8), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds 291(%edi,%eax,4), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds (%eax), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0x30
# CHECK: {vex} vpdpwssds -512(,%ebp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssds 2032(%ecx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssds -2048(%edx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
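
The memory forms in this file exercise the full ModRM/SIB/displacement machinery. As a rough worked decode (standard x86 encoding rules, not part of the test), the index-only addressing case from above breaks down as:

0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
# 0xc4,0xe2,0x55   VEX prefix: 0F38 map, vvvv = %ymm5, L=1 (256-bit), pp = 66
# 0x50             opcode byte for vpdpbusd
# 0x34             ModRM 00/110/100: reg = %ymm6, r/m = 100 -> SIB byte follows
# 0x6d             SIB 01/101/101: scale 2, index %ebp, no base (mod = 00, base = 101)
# 00 fc ff ff      disp32 = -1024
# => {vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6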

View File

@ -0,0 +1,170 @@
# RUN: llvm-mc --disassemble %s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x50,0xf4
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x50,0xf4
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax]
0xc4,0xe2,0x55,0x50,0x30
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064]
0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096]
0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax]
0xc4,0xe2,0x51,0x50,0x30
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512]
0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032]
0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048]
0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x51,0xf4
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x51,0xf4
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax]
0xc4,0xe2,0x55,0x51,0x30
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064]
0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096]
0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax]
0xc4,0xe2,0x51,0x51,0x30
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512]
0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032]
0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048]
0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x52,0xf4
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x52,0xf4
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax]
0xc4,0xe2,0x55,0x52,0x30
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064]
0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096]
0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax]
0xc4,0xe2,0x51,0x52,0x30
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512]
0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032]
0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048]
0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x53,0xf4
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x53,0xf4
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax]
0xc4,0xe2,0x55,0x53,0x30
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064]
0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096]
0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax]
0xc4,0xe2,0x51,0x53,0x30
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512]
0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032]
0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048]
0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
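
This file decodes the same byte patterns as the AT&T-syntax test above; only the printed operand order changes. For the first register form, for instance, the two RUN lines produce (a side-by-side illustration, not extra test input):

0xc4,0xe2,0x55,0x50,0xf4
# AT&T syntax (sources first, destination last):
#   {vex} vpdpbusd %ymm4, %ymm5, %ymm6
# Intel syntax (destination first):
#   {vex} vpdpbusd ymm6, ymm5, ymm4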

View File

@ -0,0 +1,170 @@
# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x50,0xf4
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x50,0xf4
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip]
0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064]
0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096]
0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip]
0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512]
0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032]
0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048]
0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x51,0xf4
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x51,0xf4
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip]
0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064]
0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096]
0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip]
0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512]
0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032]
0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048]
0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x52,0xf4
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x52,0xf4
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip]
0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064]
0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096]
0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip]
0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512]
0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032]
0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048]
0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x53,0xf4
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x53,0xf4
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip]
0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064]
0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096]
0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip]
0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512]
0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032]
0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048]
0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
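
In the 64-bit encodings above, the second VEX byte also carries the inverted R/X/B extension bits, which is why it changes from 0xe2 to 0xa2 or 0xc2 whenever an r8-r15 register appears in the address. A hedged decode of the three variants of that byte used in this file:

# byte 2 of the 3-byte VEX prefix holds ~R, ~X, ~B and mmmmm (= 00010, the 0F38 map)
# 0xe2 = 1110 0010   no extension needed, e.g. [rcx + 4064]
# 0xa2 = 1010 0010   X = 1 extends the index, selecting r14 in [rbp + 8*r14 + 268435456]
# 0xc2 = 1100 0010   B = 1 extends the base, selecting r8 in [r8 + 4*rax + 291]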

View File

@ -0,0 +1,170 @@
# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 | FileCheck %s
# CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xf4
# CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xf4
# CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %ymm5, %ymm6
0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %ymm5, %ymm6
0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd (%rip), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusd -1024(,%rbp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusd 4064(%rcx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusd -4096(%rdx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %xmm5, %xmm6
0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %xmm5, %xmm6
0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd (%rip), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusd -512(,%rbp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusd 2032(%rcx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusd -2048(%rdx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xf4
# CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xf4
# CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %ymm5, %ymm6
0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %ymm5, %ymm6
0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds (%rip), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusds -1024(,%rbp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusds 4064(%rcx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusds -4096(%rdx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %xmm5, %xmm6
0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %xmm5, %xmm6
0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds (%rip), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusds -512(,%rbp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusds 2032(%rcx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusds -2048(%rdx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xf4
# CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xf4
# CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %ymm5, %ymm6
0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %ymm5, %ymm6
0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd (%rip), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssd -1024(,%rbp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssd 4064(%rcx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssd -4096(%rdx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %xmm5, %xmm6
0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %xmm5, %xmm6
0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd (%rip), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssd -512(,%rbp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssd 2032(%rcx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssd -2048(%rdx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xf4
# CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xf4
# CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %ymm5, %ymm6
0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %ymm5, %ymm6
0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds (%rip), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssds -1024(,%rbp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssds 4064(%rcx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssds -4096(%rdx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %xmm5, %xmm6
0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %xmm5, %xmm6
0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds (%rip), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssds -512(,%rbp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssds 2032(%rcx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssds -2048(%rdx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
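
One 64-bit-only case in this file is the RIP-relative form. A rough decode (standard encoding rules, not part of the test):

0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00
# 0xc4,0xe2,0x55   VEX prefix: 0F38 map, vvvv = %ymm5, L=1 (256-bit), pp = 66
# 0x50             opcode byte for vpdpbusd
# 0x35             ModRM 00/110/101: reg = %ymm6; mod = 00 with r/m = 101 is RIP-relative in 64-bit mode
# 00 00 00 00      disp32 = 0
# => {vex} vpdpbusd (%rip), %ymm5, %ymm6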

View File

@ -0,0 +1,226 @@
// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnni --show-encoding < %s | FileCheck %s
// CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
{vex} vpdpbusd %ymm4, %ymm5, %ymm6
// CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
{vex} vpdpbusd %xmm4, %xmm5, %xmm6
// CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusd 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusd 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd (%eax), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x30]
{vex} vpdpbusd (%eax), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd 4064(%ecx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusd 4064(%ecx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd -4096(%edx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusd -4096(%edx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusd 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusd 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd (%eax), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x30]
{vex} vpdpbusd (%eax), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd -512(,%ebp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusd -512(,%ebp,2), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd 2032(%ecx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusd 2032(%ecx), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd -2048(%edx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusd -2048(%edx), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
{vex} vpdpbusds %ymm4, %ymm5, %ymm6
// CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
{vex} vpdpbusds %xmm4, %xmm5, %xmm6
// CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusds 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusds 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds (%eax), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x30]
{vex} vpdpbusds (%eax), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusds -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds 4064(%ecx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusds 4064(%ecx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds -4096(%edx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusds -4096(%edx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusds 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusds 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds (%eax), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x30]
{vex} vpdpbusds (%eax), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds -512(,%ebp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusds -512(,%ebp,2), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds 2032(%ecx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusds 2032(%ecx), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds -2048(%edx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusds -2048(%edx), %xmm5, %xmm6
// CHECK: vpdpwssd %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
{vex} vpdpwssd %ymm4, %ymm5, %ymm6
// CHECK: vpdpwssd %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
{vex} vpdpwssd %xmm4, %xmm5, %xmm6
// CHECK: vpdpwssd 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssd 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: vpdpwssd 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssd 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: vpdpwssd (%eax), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x30]
{vex} vpdpwssd (%eax), %ymm5, %ymm6
// CHECK: vpdpwssd -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssd -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: vpdpwssd 4064(%ecx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssd 4064(%ecx), %ymm5, %ymm6
// CHECK: vpdpwssd -4096(%edx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssd -4096(%edx), %ymm5, %ymm6
// CHECK: vpdpwssd 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssd 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: vpdpwssd 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssd 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: vpdpwssd (%eax), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x30]
{vex} vpdpwssd (%eax), %xmm5, %xmm6
// CHECK: vpdpwssd -512(,%ebp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssd -512(,%ebp,2), %xmm5, %xmm6
// CHECK: vpdpwssd 2032(%ecx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssd 2032(%ecx), %xmm5, %xmm6
// CHECK: vpdpwssd -2048(%edx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssd -2048(%edx), %xmm5, %xmm6
// CHECK: vpdpwssds %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
{vex} vpdpwssds %ymm4, %ymm5, %ymm6
// CHECK: vpdpwssds %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
{vex} vpdpwssds %xmm4, %xmm5, %xmm6
// CHECK: vpdpwssds 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssds 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: vpdpwssds 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssds 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: vpdpwssds (%eax), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x30]
{vex} vpdpwssds (%eax), %ymm5, %ymm6
// CHECK: vpdpwssds -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssds -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: vpdpwssds 4064(%ecx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssds 4064(%ecx), %ymm5, %ymm6
// CHECK: vpdpwssds -4096(%edx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssds -4096(%edx), %ymm5, %ymm6
// CHECK: vpdpwssds 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssds 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: vpdpwssds 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssds 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: vpdpwssds (%eax), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x30]
{vex} vpdpwssds (%eax), %xmm5, %xmm6
// CHECK: vpdpwssds -512(,%ebp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssds -512(,%ebp,2), %xmm5, %xmm6
// CHECK: vpdpwssds 2032(%ecx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssds 2032(%ecx), %xmm5, %xmm6
// CHECK: vpdpwssds -2048(%edx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssds -2048(%edx), %xmm5, %xmm6
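
A small reading note on the vpdpwssd and vpdpwssds checks above: unlike the vpdpbusd/vpdpbusds lines earlier in this file, their CHECK patterns omit the {vex} marker. FileCheck patterns only have to match a substring of the output line, so they still match the printed {vex} form, e.g. (assuming the assembler emits the same text as the disassembly tests above):

// CHECK: vpdpwssd %ymm4, %ymm5, %ymm6
// still matches an output line of the form:
//   {vex} vpdpwssd %ymm4, %ymm5, %ymm6 # encoding: [0xc4,0xe2,0x55,0x52,0xf4]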

View File

@ -0,0 +1,226 @@
// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnni -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
{vex} vpdpbusd ymm6, ymm5, ymm4
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
{vex} vpdpbusd xmm6, xmm5, xmm4
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x30]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x30]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
{vex} vpdpbusds ymm6, ymm5, ymm4
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
{vex} vpdpbusds xmm6, xmm5, xmm4
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x30]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x30]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
{vex} vpdpwssd ymm6, ymm5, ymm4
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
{vex} vpdpwssd xmm6, xmm5, xmm4
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x30]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x30]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
{vex} vpdpwssds ymm6, ymm5, ymm4
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
{vex} vpdpwssds xmm6, xmm5, xmm4
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x30]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x30]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048]

View File

@ -0,0 +1,226 @@
// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnni -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
{vex} vpdpbusd ymm6, ymm5, ymm4
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
{vex} vpdpbusd xmm6, xmm5, xmm4
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
{vex} vpdpbusds ymm6, ymm5, ymm4
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
{vex} vpdpbusds xmm6, xmm5, xmm4
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
{vex} vpdpwssd ymm6, ymm5, ymm4
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
{vex} vpdpwssd xmm6, xmm5, xmm4
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
{vex} vpdpwssds ymm6, ymm5, ymm4
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
{vex} vpdpwssds xmm6, xmm5, xmm4
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048]


@ -0,0 +1,226 @@
// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnni --show-encoding < %s | FileCheck %s
// CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
{vex} vpdpbusd %ymm4, %ymm5, %ymm6
// CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
{vex} vpdpbusd %xmm4, %xmm5, %xmm6
// CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusd 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusd 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd (%rip), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusd (%rip), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusd -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd 4064(%rcx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusd 4064(%rcx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd -4096(%rdx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusd -4096(%rdx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusd 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusd 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd (%rip), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusd (%rip), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd -512(,%rbp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusd -512(,%rbp,2), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd 2032(%rcx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusd 2032(%rcx), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd -2048(%rdx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusd -2048(%rdx), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
{vex} vpdpbusds %ymm4, %ymm5, %ymm6
// CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
{vex} vpdpbusds %xmm4, %xmm5, %xmm6
// CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusds 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusds 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds (%rip), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusds (%rip), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusds -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds 4064(%rcx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusds 4064(%rcx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds -4096(%rdx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusds -4096(%rdx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusds 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusds 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds (%rip), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusds (%rip), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds -512(,%rbp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusds -512(,%rbp,2), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds 2032(%rcx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusds 2032(%rcx), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds -2048(%rdx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusds -2048(%rdx), %xmm5, %xmm6
// CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
{vex} vpdpwssd %ymm4, %ymm5, %ymm6
// CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
{vex} vpdpwssd %xmm4, %xmm5, %xmm6
// CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssd 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssd 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: {vex} vpdpwssd (%rip), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssd (%rip), %ymm5, %ymm6
// CHECK: {vex} vpdpwssd -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssd -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: {vex} vpdpwssd 4064(%rcx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssd 4064(%rcx), %ymm5, %ymm6
// CHECK: {vex} vpdpwssd -4096(%rdx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssd -4096(%rdx), %ymm5, %ymm6
// CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssd 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssd 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: {vex} vpdpwssd (%rip), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssd (%rip), %xmm5, %xmm6
// CHECK: {vex} vpdpwssd -512(,%rbp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssd -512(,%rbp,2), %xmm5, %xmm6
// CHECK: {vex} vpdpwssd 2032(%rcx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssd 2032(%rcx), %xmm5, %xmm6
// CHECK: {vex} vpdpwssd -2048(%rdx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssd -2048(%rdx), %xmm5, %xmm6
// CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
{vex} vpdpwssds %ymm4, %ymm5, %ymm6
// CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
{vex} vpdpwssds %xmm4, %xmm5, %xmm6
// CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssds 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssds 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: {vex} vpdpwssds (%rip), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssds (%rip), %ymm5, %ymm6
// CHECK: {vex} vpdpwssds -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssds -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: {vex} vpdpwssds 4064(%rcx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssds 4064(%rcx), %ymm5, %ymm6
// CHECK: {vex} vpdpwssds -4096(%rdx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssds -4096(%rdx), %ymm5, %ymm6
// CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssds 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssds 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: {vex} vpdpwssds (%rip), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssds (%rip), %xmm5, %xmm6
// CHECK: {vex} vpdpwssds -512(,%rbp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssds -512(,%rbp,2), %xmm5, %xmm6
// CHECK: {vex} vpdpwssds 2032(%rcx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssds 2032(%rcx), %xmm5, %xmm6
// CHECK: {vex} vpdpwssds -2048(%rdx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssds -2048(%rdx), %xmm5, %xmm6