[X86] Support Intel avxvnni

This patch mainly makes the following changes:

1. Support AVX-VNNI instructions;
2. Introduce an ExplicitVEXPrefix flag so that the vpdpbusd/vpdpbusds/vpdpwssd/vpdpwssds instructions use VEX encoding only when the user explicitly adds a {vex} prefix (a brief usage sketch follows below).
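As a rough illustration (not part of the patch): the snippet below uses one of the new VEX-encoded VNNI intrinsics that this change exposes through <immintrin.h>. Built with `clang -O2 -mavxvnni`, it should select the VEX form of VPDPBUSD, which the assembly printer emits with the {vex} pseudo-prefix. The helper name dot_accumulate is made up for the example.

#include <immintrin.h>

/* Accumulate u8 x s8 dot products into 32-bit lanes using the new
   AVX-VNNI intrinsic from avxvnniintrin.h (pulled in via immintrin.h). */
__m256i dot_accumulate(__m256i acc, __m256i u8s, __m256i s8s)
{
    /* acc.dword[j] += sum of four unsigned(A) * signed(B) byte products */
    return _mm256_dpbusd_avx_epi32(acc, u8s, s8s);
}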

Differential Revision: https://reviews.llvm.org/D89105
Liu, Chen3 2020-10-30 12:58:05 +08:00
parent d11710dae6
commit 756f597841
41 changed files with 2603 additions and 74 deletions

View File

@ -3253,6 +3253,8 @@ X86
.. option:: -mavx512vpopcntdq, -mno-avx512vpopcntdq
.. option:: -mavxvnni, -mno-avxvnni
.. option:: -mbmi, -mno-bmi
.. option:: -mbmi2, -mno-bmi2

View File

@ -205,6 +205,8 @@ X86 Support in Clang
- Support for ``UINTR`` instructions has been added.
- Support for ``AVXVNNI`` instructions has been added.
Internal API Changes
--------------------

View File

@ -960,17 +960,17 @@ TARGET_BUILTIN(__builtin_ia32_alignq256, "V4OiV4OiV4OiIi", "ncV:256:", "avx512vl
TARGET_BUILTIN(__builtin_ia32_extractf64x4_mask, "V4dV8dIiV4dUc", "ncV:512:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_extractf32x4_mask, "V4fV16fIiV4fUc", "ncV:512:", "avx512f")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpbusds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssd128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssd256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssd512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssds128, "V4iV4iV4iV4i", "ncV:128:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssds256, "V8iV8iV8iV8i", "ncV:256:", "avx512vl,avx512vnni|avxvnni")
TARGET_BUILTIN(__builtin_ia32_vpdpwssds512, "V16iV16iV16iV16i", "ncV:512:", "avx512vnni")
TARGET_BUILTIN(__builtin_ia32_gather3div2df, "V2dV2dvC*V2OiUcIi", "nV:128:", "avx512vl")

View File

@ -3235,6 +3235,8 @@ def mavx512vpopcntdq : Flag<["-"], "mavx512vpopcntdq">, Group<m_x86_Features_Gro
def mno_avx512vpopcntdq : Flag<["-"], "mno-avx512vpopcntdq">, Group<m_x86_Features_Group>;
def mavx512vp2intersect : Flag<["-"], "mavx512vp2intersect">, Group<m_x86_Features_Group>;
def mno_avx512vp2intersect : Flag<["-"], "mno-avx512vp2intersect">, Group<m_x86_Features_Group>;
def mavxvnni : Flag<["-"], "mavxvnni">, Group<m_x86_Features_Group>;
def mno_avxvnni : Flag<["-"], "mno-avxvnni">, Group<m_x86_Features_Group>;
def madx : Flag<["-"], "madx">, Group<m_x86_Features_Group>;
def mno_adx : Flag<["-"], "mno-adx">, Group<m_x86_Features_Group>;
def maes : Flag<["-"], "maes">, Group<m_x86_Features_Group>;

View File

@ -306,6 +306,8 @@ bool X86TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
HasAMXINT8 = true;
} else if (Feature == "+amx-tile") {
HasAMXTILE = true;
} else if (Feature == "+avxvnni") {
HasAVXVNNI = true;
} else if (Feature == "+serialize") {
HasSERIALIZE = true;
} else if (Feature == "+tsxldtrk") {
@ -728,6 +730,8 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
Builder.defineMacro("__AMXINT8__");
if (HasAMXBF16)
Builder.defineMacro("__AMXBF16__");
if (HasAVXVNNI)
Builder.defineMacro("__AVXVNNI__");
if (HasSERIALIZE)
Builder.defineMacro("__SERIALIZE__");
if (HasTSXLDTRK)
@ -846,6 +850,7 @@ bool X86TargetInfo::isValidFeatureName(StringRef Name) const {
.Case("avx512vbmi2", true)
.Case("avx512ifma", true)
.Case("avx512vp2intersect", true)
.Case("avxvnni", true)
.Case("bmi", true)
.Case("bmi2", true)
.Case("cldemote", true)
@ -918,6 +923,7 @@ bool X86TargetInfo::hasFeature(StringRef Feature) const {
.Case("amx-bf16", HasAMXBF16)
.Case("amx-int8", HasAMXINT8)
.Case("amx-tile", HasAMXTILE)
.Case("avxvnni", HasAVXVNNI)
.Case("avx", SSELevel >= AVX)
.Case("avx2", SSELevel >= AVX2)
.Case("avx512f", SSELevel >= AVX512F)

View File

@ -130,6 +130,7 @@ class LLVM_LIBRARY_VISIBILITY X86TargetInfo : public TargetInfo {
bool HasKL = false; // For key locker
bool HasWIDEKL = false; // For wide key locker
bool HasHRESET = false;
bool HasAVXVNNI = false;
bool HasAMXTILE = false;
bool HasAMXINT8 = false;
bool HasAMXBF16 = false;

View File

@ -35,6 +35,7 @@ set(files
avx512vnniintrin.h
avx512vlvnniintrin.h
avxintrin.h
avxvnniintrin.h
bmi2intrin.h
bmiintrin.h
__clang_cuda_builtin_vars.h

View File

@ -18,13 +18,157 @@
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128)))
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpbusd_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A,
(__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpbusds_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpwssd_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
#define _mm256_dpwssds_epi32(S, A, B) \
(__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpbusd_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with
/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpbusds_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpwssd_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with
/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j])
/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1])
/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
#define _mm_dpwssds_epi32(S, A, B) \
(__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
@ -42,13 +186,6 @@ _mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A,
(__v8si)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
@ -65,13 +202,6 @@ _mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A,
(__v8si)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
@ -88,13 +218,6 @@ _mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A,
(__v8si)__B);
}
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B)
{
@ -111,13 +234,6 @@ _mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B)
(__v8si)_mm256_setzero_si256());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A,
(__v4si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
@ -134,13 +250,6 @@ _mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A,
(__v4si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
@ -157,13 +266,6 @@ _mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A,
(__v4si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{
@ -180,13 +282,6 @@ _mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B)
(__v4si)_mm_setzero_si128());
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A,
(__v4si)__B);
}
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B)
{

View File

@ -0,0 +1,225 @@
/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------===
*
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*
*===-----------------------------------------------------------------------===
*/
#ifndef __IMMINTRIN_H
#error "Never use <avxvnniintrin.h> directly; include <immintrin.h> instead."
#endif
#ifndef __AVXVNNIINTRIN_H
#define __AVXVNNIINTRIN_H
/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */
/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B)
/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B)
/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B)
/* Intrinsics with _avx_ prefix are for compatibility with msvc. */
/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256)))
#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128)))
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 7
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:256] := 0
/// \endoperation
static __inline__ __m256i __DEFAULT_FN_ATTRS256
_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B)
{
return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with
/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed
/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer
/// in \a __S using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPBUSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j]))
/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1]))
/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2]))
/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3]))
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S,
/// and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSD </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with
/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit
/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S
/// using signed saturation, and store the packed 32-bit results in DST.
///
/// This intrinsic corresponds to the <c> VPDPWSSDS </c> instructions.
///
/// \operation
/// FOR j := 0 to 3
/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j])
/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1])
/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2)
/// ENDFOR
/// DST[MAX:128] := 0
/// \endoperation
static __inline__ __m128i __DEFAULT_FN_ATTRS128
_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B)
{
return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B);
}
#undef __DEFAULT_FN_ATTRS128
#undef __DEFAULT_FN_ATTRS256
#endif // __AVXVNNIINTRIN_H
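For reference, here is a scalar sketch of the \operation pseudocode for the non-saturating 128-bit VPDPBUSD form documented above. It is illustrative only and not part of the header; the function name is made up.

#include <stdint.h>

/* Scalar model of _mm_dpbusd_avx_epi32: for each 32-bit lane, four
   unsigned(A) x signed(B) byte products are summed into S (wrapping
   32-bit arithmetic; the "s" variants would saturate instead).       */
static void dpbusd_ref_128(int32_t dst[4], const int32_t s[4],
                           const uint8_t a[16], const int8_t b[16])
{
    for (int j = 0; j < 4; ++j) {
        int32_t sum = s[j];
        for (int k = 0; k < 4; ++k)
            sum += (int32_t)a[4 * j + k] * (int32_t)b[4 * j + k];
        dst[j] = sum;
    }
}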

View File

@ -196,6 +196,7 @@
#define bit_AMXINT8 0x02000000
/* Features in %eax for leaf 7 sub-leaf 1 */
#define bit_AVXVNNI 0x00000010
#define bit_AVX512BF16 0x00000020
#define bit_HRESET 0x00400000

View File

@ -145,6 +145,11 @@
#include <avx512vlvnniintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVXVNNI__)
#include <avxvnniintrin.h>
#endif
#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \
defined(__AVX512DQ__)
#include <avx512dqintrin.h>

View File

@ -0,0 +1,99 @@
// RUN: %clang_cc1 -ffreestanding %s -triple=x86_64-apple-darwin -target-feature +avxvnni -emit-llvm -o - -Wall -Werror | FileCheck %s
#include <immintrin.h>
__m256i test_mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpbusd_epi32
// CHECK: @llvm.x86.avx512.vpdpbusd.256
return _mm256_dpbusd_epi32(__S, __A, __B);
}
__m256i test_mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpbusds_epi32
// CHECK: @llvm.x86.avx512.vpdpbusds.256
return _mm256_dpbusds_epi32(__S, __A, __B);
}
__m256i test_mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpwssd_epi32
// CHECK: @llvm.x86.avx512.vpdpwssd.256
return _mm256_dpwssd_epi32(__S, __A, __B);
}
__m256i test_mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpwssds_epi32
// CHECK: @llvm.x86.avx512.vpdpwssds.256
return _mm256_dpwssds_epi32(__S, __A, __B);
}
__m128i test_mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpbusd_epi32
// CHECK: @llvm.x86.avx512.vpdpbusd.128
return _mm_dpbusd_epi32(__S, __A, __B);
}
__m128i test_mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpbusds_epi32
// CHECK: @llvm.x86.avx512.vpdpbusds.128
return _mm_dpbusds_epi32(__S, __A, __B);
}
__m128i test_mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpwssd_epi32
// CHECK: @llvm.x86.avx512.vpdpwssd.128
return _mm_dpwssd_epi32(__S, __A, __B);
}
__m128i test_mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpwssds_epi32
// CHECK: @llvm.x86.avx512.vpdpwssds.128
return _mm_dpwssds_epi32(__S, __A, __B);
}
__m256i test_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpbusd_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpbusd.256
return _mm256_dpbusd_avx_epi32(__S, __A, __B);
}
__m256i test_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpbusds_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpbusds.256
return _mm256_dpbusds_avx_epi32(__S, __A, __B);
}
__m256i test_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpwssd_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpwssd.256
return _mm256_dpwssd_avx_epi32(__S, __A, __B);
}
__m256i test_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) {
// CHECK-LABEL: @test_mm256_dpwssds_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpwssds.256
return _mm256_dpwssds_avx_epi32(__S, __A, __B);
}
__m128i test_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpbusd_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpbusd.128
return _mm_dpbusd_avx_epi32(__S, __A, __B);
}
__m128i test_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpbusds_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpbusds.128
return _mm_dpbusds_avx_epi32(__S, __A, __B);
}
__m128i test_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpwssd_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpwssd.128
return _mm_dpwssd_avx_epi32(__S, __A, __B);
}
__m128i test_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) {
// CHECK-LABEL: @test_mm_dpwssds_avx_epi32
// CHECK: @llvm.x86.avx512.vpdpwssds.128
return _mm_dpwssds_avx_epi32(__S, __A, __B);
}

View File

@ -54,9 +54,9 @@ void __attribute__((target("arch=x86-64-v4"))) x86_64_v4() {}
// CHECK: #0 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87" "tune-cpu"="i686"
// CHECK: #1 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt"
// CHECK-NOT: tune-cpu
// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
// CHECK: #2 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-aes,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-gfni,-kl,-pclmul,-sha,-sse2,-sse3,-sse4.1,-sse4.2,-sse4a,-ssse3,-vaes,-vpclmulqdq,-widekl,-xop" "tune-cpu"="i686"
// CHECK: #3 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+mmx,+popcnt,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87" "tune-cpu"="i686"
// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
// CHECK: #4 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-avx,-avx2,-avx512bf16,-avx512bitalg,-avx512bw,-avx512cd,-avx512dq,-avx512er,-avx512f,-avx512ifma,-avx512pf,-avx512vbmi,-avx512vbmi2,-avx512vl,-avx512vnni,-avx512vp2intersect,-avx512vpopcntdq,-avxvnni,-f16c,-fma,-fma4,-sse4.1,-sse4.2,-vaes,-vpclmulqdq,-xop" "tune-cpu"="i686"
// CHECK: #5 = {{.*}}"target-cpu"="ivybridge" "target-features"="+avx,+cx16,+cx8,+f16c,+fsgsbase,+fxsr,+mmx,+pclmul,+popcnt,+rdrnd,+sahf,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt,-aes,-vaes"
// CHECK-NOT: tune-cpu
// CHECK: #6 = {{.*}}"target-cpu"="i686" "target-features"="+cx8,+x87,-3dnow,-3dnowa,-mmx"

View File

@ -288,3 +288,8 @@
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-uintr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-UINTR %s
// UINTR: "-target-feature" "+uintr"
// NO-UINTR: "-target-feature" "-uintr"
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mavxvnni %s -### -o %t.o 2>&1 | FileCheck --check-prefix=AVX-VNNI %s
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-avxvnni %s -### -o %t.o 2>&1 | FileCheck --check-prefix=NO-AVX-VNNI %s
// AVX-VNNI: "-target-feature" "+avxvnni"
// NO-AVX-VNNI: "-target-feature" "-avxvnni"

View File

@ -1654,6 +1654,7 @@
// CHECK_SPR_M32: #define __AVX512VL__ 1
// CHECK_SPR_M32: #define __AVX512VNNI__ 1
// CHECK_SPR_M32: #define __AVX512VPOPCNTDQ__ 1
// CHECK_SPR_M32: #define __AVXVNNI__ 1
// CHECK_SPR_M32: #define __AVX__ 1
// CHECK_SPR_M32: #define __BMI2__ 1
// CHECK_SPR_M32: #define __BMI__ 1
@ -1724,6 +1725,7 @@
// CHECK_SPR_M64: #define __AVX512VL__ 1
// CHECK_SPR_M64: #define __AVX512VNNI__ 1
// CHECK_SPR_M64: #define __AVX512VPOPCNTDQ__ 1
// CHECK_SPR_M64: #define __AVXVNNI__ 1
// CHECK_SPR_M64: #define __AVX__ 1
// CHECK_SPR_M64: #define __BMI2__ 1
// CHECK_SPR_M64: #define __BMI__ 1
@ -1782,6 +1784,7 @@
// CHECK_ADL_M32: #define __AES__ 1
// CHECK_ADL_M32: #define __AVX2__ 1
// CHECK_ADL_M32-NOT: AVX512
// CHECK_ADL_M32: #define __AVXVNNI__ 1
// CHECK_ADL_M32: #define __AVX__ 1
// CHECK_ADL_M32: #define __BMI2__ 1
// CHECK_ADL_M32: #define __BMI__ 1
@ -1822,6 +1825,7 @@
// CHECK_ADL_M64: #define __AES__ 1
// CHECK_ADL_M64: #define __AVX2__ 1
// CHECK_ADL_M64-NOT: AVX512
// CHECK_ADL_M64: #define __AVXVNNI__ 1
// CHECK_ADL_M64: #define __AVX__ 1
// CHECK_ADL_M64: #define __BMI2__ 1
// CHECK_ADL_M64: #define __BMI__ 1

View File

@ -544,3 +544,17 @@
// RUN: %clang -target i386-unknown-linux-gnu -march=i386 -mno-uintr -x c -E -dM -o - %s | FileCheck -check-prefix=NOUINTR %s
// NOUINTR-NOT: #define __UINTR__ 1
// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNI %s
// AVXVNNI: #define __AVX2__ 1
// AVXVNNI: #define __AVXVNNI__ 1
// RUN: %clang -target i386-unknown-unknown -march=atom -mno-avxvnni -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=NOAVXVNNI %s
// NOAVXVNNI-NOT: #define __AVXVNNI__ 1
// RUN: %clang -target i386-unknown-unknown -march=atom -mavxvnni -mno-avx2 -x c -E -dM -o - %s | FileCheck -match-full-lines --check-prefix=AVXVNNINOAVX2 %s
// AVXVNNINOAVX2-NOT: #define __AVX2__ 1
// AVXVNNINOAVX2-NOT: #define __AVXVNNI__ 1

View File

@ -121,6 +121,7 @@ During this release ...
the target CPU.
* Support for ``HRESET`` instructions has been added.
* Support for ``UINTR`` instructions has been added.
* Support for ``AVXVNNI`` instructions has been added.
Changes to the AMDGPU Target
-----------------------------

View File

@ -190,6 +190,7 @@ X86_FEATURE (XSAVEC, "xsavec")
X86_FEATURE (XSAVEOPT, "xsaveopt")
X86_FEATURE (XSAVES, "xsaves")
X86_FEATURE (HRESET, "hreset")
X86_FEATURE (AVXVNNI, "avxvnni")
// These features aren't really CPU features, but the frontend can set them.
X86_FEATURE (RETPOLINE_EXTERNAL_THUNK, "retpoline-external-thunk")
X86_FEATURE (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")

View File

@ -1497,6 +1497,7 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["amx-int8"] = HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave;
bool HasLeaf7Subleaf1 =
MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX);
Features["avxvnni"] = HasLeaf7Subleaf1 && ((EAX >> 4) & 1) && HasAVXSave;
Features["avx512bf16"] = HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save;
Features["hreset"] = HasLeaf7Subleaf1 && ((EAX >> 22) & 1);

View File

@ -205,10 +205,10 @@ constexpr FeatureBitset FeaturesSapphireRapids =
FeatureAVX512BF16 | FeatureAVX512VP2INTERSECT | FeatureCLDEMOTE |
FeatureENQCMD | FeatureMOVDIR64B | FeatureMOVDIRI | FeaturePTWRITE |
FeatureSERIALIZE | FeatureSHSTK | FeatureTSXLDTRK | FeatureUINTR |
FeatureWAITPKG;
FeatureWAITPKG | FeatureAVXVNNI;
constexpr FeatureBitset FeaturesAlderlake =
FeaturesSkylakeClient | FeatureCLDEMOTE | FeatureHRESET | FeaturePTWRITE |
FeatureSERIALIZE | FeatureWAITPKG;
FeatureSERIALIZE | FeatureWAITPKG | FeatureAVXVNNI;
// Intel Atom processors.
// Bonnell has feature parity with Core2 and adds MOVBE.
@ -575,6 +575,9 @@ constexpr FeatureBitset ImpliedFeaturesHRESET = {};
constexpr FeatureBitset ImpliedFeaturesKL = FeatureSSE2;
constexpr FeatureBitset ImpliedFeaturesWIDEKL = FeatureKL;
// AVXVNNI Features
constexpr FeatureBitset ImpliedFeaturesAVXVNNI = FeatureAVX2;
constexpr FeatureInfo FeatureInfos[X86::CPU_FEATURE_MAX] = {
#define X86_FEATURE(ENUM, STR) {{STR}, ImpliedFeatures##ENUM},
#include "llvm/Support/X86TargetParser.def"

View File

@ -3845,6 +3845,13 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
(MCID.TSFlags & X86II::EncodingMask) != X86II::VEX)
return Match_Unsupported;
// These instructions are only available with {vex}, {vex2} or {vex3} prefix
if (MCID.TSFlags & X86II::ExplicitVEXPrefix &&
(ForcedVEXEncoding != VEXEncoding_VEX &&
ForcedVEXEncoding != VEXEncoding_VEX2 &&
ForcedVEXEncoding != VEXEncoding_VEX3))
return Match_Unsupported;
// These instructions match ambiguously with their VEX encoded counterparts
// and appear first in the matching table. Reject them unless we're forcing
// EVEX encoding.

View File

@ -952,7 +952,11 @@ namespace X86II {
// NOTRACK prefix
NoTrackShift = EVEX_RCShift + 1,
NOTRACK = 1ULL << NoTrackShift
NOTRACK = 1ULL << NoTrackShift,
// Force VEX encoding
ExplicitVEXShift = NoTrackShift + 1,
ExplicitVEXPrefix = 1ULL << ExplicitVEXShift
};
/// \returns true if the instruction with given opcode is a prefix.

View File

@ -348,7 +348,7 @@ void X86InstPrinterCommon::printInstFlags(const MCInst *MI, raw_ostream &O) {
O << "\trep\t";
// These all require a pseudo prefix
if (Flags & X86::IP_USE_VEX)
if ((Flags & X86::IP_USE_VEX) || (TSFlags & X86II::ExplicitVEXPrefix))
O << "\t{vex}";
else if (Flags & X86::IP_USE_VEX2)
O << "\t{vex2}";

View File

@ -171,6 +171,9 @@ def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
"Enable AVX-512 Vector Neural Network Instructions",
[FeatureAVX512]>;
def FeatureAVXVNNI : SubtargetFeature<"avxvnni", "HasAVXVNNI", "true",
"Support AVX_VNNI encoding",
[FeatureAVX2]>;
def FeatureBF16 : SubtargetFeature<"avx512bf16", "HasBF16", "true",
"Support bfloat16 floating point",
[FeatureBWI]>;
@ -769,6 +772,7 @@ def ProcessorFeatures {
FeatureCLDEMOTE,
FeatureWAITPKG,
FeaturePTWRITE,
FeatureAVXVNNI,
FeatureTSXLDTRK,
FeatureENQCMD,
FeatureSHSTK,
@ -781,7 +785,8 @@ def ProcessorFeatures {
!listconcat(ICXFeatures, SPRAdditionalFeatures);
// Alderlake
list<SubtargetFeature> ADLAdditionalFeatures = [FeatureCLDEMOTE,
list<SubtargetFeature> ADLAdditionalFeatures = [FeatureAVXVNNI,
FeatureCLDEMOTE,
FeatureHRESET,
FeaturePTWRITE,
FeatureSERIALIZE,

View File

@ -85,6 +85,8 @@ public:
private:
/// Machine instruction info used throughout the class.
const X86InstrInfo *TII = nullptr;
const X86Subtarget *ST = nullptr;
};
} // end anonymous namespace
@ -94,8 +96,8 @@ char EvexToVexInstPass::ID = 0;
bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
if (!ST.hasAVX512())
ST = &MF.getSubtarget<X86Subtarget>();
if (!ST->hasAVX512())
return false;
bool Changed = false;
@ -144,10 +146,29 @@ static bool usesExtendedRegister(const MachineInstr &MI) {
}
// Do any custom cleanup needed to finalize the conversion.
static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc,
const X86Subtarget *ST) {
(void)NewOpc;
unsigned Opc = MI.getOpcode();
switch (Opc) {
case X86::VPDPBUSDSZ256m:
case X86::VPDPBUSDSZ256r:
case X86::VPDPBUSDSZ128m:
case X86::VPDPBUSDSZ128r:
case X86::VPDPBUSDZ256m:
case X86::VPDPBUSDZ256r:
case X86::VPDPBUSDZ128m:
case X86::VPDPBUSDZ128r:
case X86::VPDPWSSDSZ256m:
case X86::VPDPWSSDSZ256r:
case X86::VPDPWSSDSZ128m:
case X86::VPDPWSSDSZ128r:
case X86::VPDPWSSDZ256m:
case X86::VPDPWSSDZ256r:
case X86::VPDPWSSDZ128m:
case X86::VPDPWSSDZ128r:
// These can only VEX convert if AVXVNNI is enabled.
return ST->hasAVXVNNI();
case X86::VALIGNDZ128rri:
case X86::VALIGNDZ128rmi:
case X86::VALIGNQZ128rri:
@ -259,7 +280,7 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
if (usesExtendedRegister(MI))
return false;
if (!performCustomAdjustments(MI, NewOpc))
if (!performCustomAdjustments(MI, NewOpc, ST))
return false;
MI.setDesc(TII->get(NewOpc));

View File

@ -3748,18 +3748,26 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VPCONFLICTQZ128rrk, X86::VPCONFLICTQZ128rmk, 0 },
{ X86::VPCONFLICTQZ256rrk, X86::VPCONFLICTQZ256rmk, 0 },
{ X86::VPCONFLICTQZrrk, X86::VPCONFLICTQZrmk, 0 },
{ X86::VPDPBUSDSYrr, X86::VPDPBUSDSYrm, 0 },
{ X86::VPDPBUSDSZ128r, X86::VPDPBUSDSZ128m, 0 },
{ X86::VPDPBUSDSZ256r, X86::VPDPBUSDSZ256m, 0 },
{ X86::VPDPBUSDSZr, X86::VPDPBUSDSZm, 0 },
{ X86::VPDPBUSDSrr, X86::VPDPBUSDSrm, 0 },
{ X86::VPDPBUSDYrr, X86::VPDPBUSDYrm, 0 },
{ X86::VPDPBUSDZ128r, X86::VPDPBUSDZ128m, 0 },
{ X86::VPDPBUSDZ256r, X86::VPDPBUSDZ256m, 0 },
{ X86::VPDPBUSDZr, X86::VPDPBUSDZm, 0 },
{ X86::VPDPBUSDrr, X86::VPDPBUSDrm, 0 },
{ X86::VPDPWSSDSYrr, X86::VPDPWSSDSYrm, 0 },
{ X86::VPDPWSSDSZ128r, X86::VPDPWSSDSZ128m, 0 },
{ X86::VPDPWSSDSZ256r, X86::VPDPWSSDSZ256m, 0 },
{ X86::VPDPWSSDSZr, X86::VPDPWSSDSZm, 0 },
{ X86::VPDPWSSDSrr, X86::VPDPWSSDSrm, 0 },
{ X86::VPDPWSSDYrr, X86::VPDPWSSDYrm, 0 },
{ X86::VPDPWSSDZ128r, X86::VPDPWSSDZ128m, 0 },
{ X86::VPDPWSSDZ256r, X86::VPDPWSSDZ256m, 0 },
{ X86::VPDPWSSDZr, X86::VPDPWSSDZm, 0 },
{ X86::VPDPWSSDrr, X86::VPDPWSSDrm, 0 },
{ X86::VPERMBZ128rrkz, X86::VPERMBZ128rmkz, 0 },
{ X86::VPERMBZ256rrkz, X86::VPERMBZ256rmkz, 0 },
{ X86::VPERMBZrrkz, X86::VPERMBZrmkz, 0 },

View File

@ -264,6 +264,9 @@ class NotMemoryFoldable { bit isMemoryFoldable = 0; }
// Prevent EVEX->VEX conversion from considering this instruction.
class NotEVEX2VEXConvertible { bit notEVEX2VEXConvertible = 1; }
// Force the instruction to use VEX encoding.
class ExplicitVEXPrefix { bit ExplicitVEXPrefix = 1; }
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
string AsmStr, Domain d = GenericDomain>
: Instruction {
@ -348,6 +351,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction?
bit notEVEX2VEXConvertible = 0; // Prevent EVEX->VEX conversion.
bit ExplicitVEXPrefix = 0; // Force the instruction to use VEX encoding.
// TSFlags layout should be kept in sync with X86BaseInfo.h.
let TSFlags{6-0} = FormBits;
@ -376,6 +380,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
let TSFlags{51-45} = CD8_Scale;
let TSFlags{52} = hasEVEX_RC;
let TSFlags{53} = hasNoTrackPrefix;
let TSFlags{54} = ExplicitVEXPrefix;
}
class PseudoI<dag oops, dag iops, list<dag> pattern>

View File

@ -2568,6 +2568,10 @@ bool X86InstrInfo::findCommutedOpIndices(const MachineInstr &MI,
case X86::VPTERNLOGQZ256rmbikz:
case X86::VPTERNLOGQZrmbikz:
return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
case X86::VPDPWSSDYrr:
case X86::VPDPWSSDrr:
case X86::VPDPWSSDSYrr:
case X86::VPDPWSSDSrr:
case X86::VPDPWSSDZ128r:
case X86::VPDPWSSDZ128rk:
case X86::VPDPWSSDZ128rkz:

View File

@ -910,6 +910,8 @@ def PKU : Predicate<"Subtarget->hasPKU()">;
def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
def HasVP2INTERSECT : Predicate<"Subtarget->hasVP2INTERSECT()">;
def HasBF16 : Predicate<"Subtarget->hasBF16()">;
def HasAVXVNNI : Predicate <"Subtarget->hasAVXVNNI()">;
def NoVLX_Or_NoVNNI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVNNI()">;
def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;

View File

@ -7164,6 +7164,48 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
int_x86_avx_maskstore_pd_256,
WriteFMaskMove64, WriteFMaskMove64Y>;
//===----------------------------------------------------------------------===//
// AVX_VNNI
//===----------------------------------------------------------------------===//
let Predicates = [HasAVXVNNI, NoVLX_Or_NoVNNI], Constraints = "$src1 = $dst" in
multiclass avx_vnni_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
bit IsCommutable> {
let isCommutable = IsCommutable in
def rr : AVX8I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst, (v4i32 (OpNode VR128:$src1,
VR128:$src2, VR128:$src3)))]>,
VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
def rm : AVX8I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR128:$dst, (v4i32 (OpNode VR128:$src1, VR128:$src2,
(loadv4i32 addr:$src3))))]>,
VEX_4V, Sched<[SchedWriteVecIMul.XMM]>;
let isCommutable = IsCommutable in
def Yrr : AVX8I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (v8i32 (OpNode VR256:$src1,
VR256:$src2, VR256:$src3)))]>,
VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
def Yrm : AVX8I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3),
!strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (v8i32 (OpNode VR256:$src1, VR256:$src2,
(loadv8i32 addr:$src3))))]>,
VEX_4V, VEX_L, Sched<[SchedWriteVecIMul.XMM]>;
}
defm VPDPBUSD : avx_vnni_rm<0x50, "vpdpbusd", X86Vpdpbusd, 0>, ExplicitVEXPrefix;
defm VPDPBUSDS : avx_vnni_rm<0x51, "vpdpbusds", X86Vpdpbusds, 0>, ExplicitVEXPrefix;
defm VPDPWSSD : avx_vnni_rm<0x52, "vpdpwssd", X86Vpdpwssd, 1>, ExplicitVEXPrefix;
defm VPDPWSSDS : avx_vnni_rm<0x53, "vpdpwssds", X86Vpdpwssds, 1>, ExplicitVEXPrefix;
//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//

View File

@ -355,6 +355,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
/// Processor has AVX-512 Vector Neural Network Instructions
bool HasVNNI = false;
/// Processor has AVX Vector Neural Network Instructions
bool HasAVXVNNI = false;
/// Processor has AVX-512 bfloat16 floating-point extensions
bool HasBF16 = false;
@ -750,6 +753,7 @@ public:
bool useRetpolineIndirectBranches() const {
return UseRetpolineIndirectBranches;
}
bool hasAVXVNNI() const { return HasAVXVNNI; }
bool hasAMXTILE() const { return HasAMXTILE; }
bool hasAMXBF16() const { return HasAMXBF16; }
bool hasAMXINT8() const { return HasAMXINT8; }

View File

@ -0,0 +1,133 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVXVNNI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+avx512vl,+avxvnni --show-mc-encoding | FileCheck %s --check-prefixes=AVX512VNNI
declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
define <8 x i32>@test_int_x86_avx_vpdpbusd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x50,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusd_256:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpbusd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x50,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
ret <8 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
define <4 x i32>@test_int_x86_avx_vpdpbusd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusd_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x50,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusd_128:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpbusd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x50,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
ret <4 x i32> %res
}
declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
define <8 x i32>@test_int_x86_avx_vpdpbusds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x51,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusds_256:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpbusds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x51,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
ret <8 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
define <4 x i32>@test_int_x86_avx_vpdpbusds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpbusds_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x51,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpbusds_128:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpbusds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x51,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
ret <4 x i32> %res
}
declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
define <8 x i32>@test_int_x86_avx_vpdpwssd_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x52,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssd_256:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssd %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x52,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
ret <8 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
define <4 x i32>@test_int_x86_avx_vpdpwssd_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssd_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x52,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssd_128:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssd %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x52,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
ret <4 x i32> %res
}
declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
define <8 x i32>@test_int_x86_avx_vpdpwssds_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_256:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # encoding: [0xc4,0xe2,0x75,0x53,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssds_256:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssds %ymm2, %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x53,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2)
ret <8 x i32> %res
}
declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
define <4 x i32>@test_int_x86_avx_vpdpwssds_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
; AVXVNNI-LABEL: test_int_x86_avx_vpdpwssds_128:
; AVXVNNI: # %bb.0:
; AVXVNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x53,0xc2]
; AVXVNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
;
; AVX512VNNI-LABEL: test_int_x86_avx_vpdpwssds_128:
; AVX512VNNI: # %bb.0:
; AVX512VNNI-NEXT: {vex} vpdpwssds %xmm2, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x53,0xc2]
; AVX512VNNI-NEXT: ret{{[l|q]}} # encoding: [0xc3]
%res = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2)
ret <4 x i32> %res
}

View File

@ -0,0 +1,242 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+avxvnni < %s | FileCheck %s
target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-unknown"
declare <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32>, <8 x i32>, <8 x i32>)
declare <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32>, <8 x i32>, <8 x i32>)
define <4 x i32> @stack_fold_vpdpwssd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
define <4 x i32> @stack_fold_vpdpwssd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpwssd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_vpdpwssd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
ret <8 x i32> %2
}
define <8 x i32> @stack_fold_vpdpwssd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssd_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpwssd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
ret <8 x i32> %2
}
define <4 x i32> @stack_fold_vpdpwssds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
define <4 x i32> @stack_fold_vpdpwssds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpwssds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_vpdpwssds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
ret <8 x i32> %2
}
define <8 x i32> @stack_fold_vpdpwssds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpwssds_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpwssds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpwssds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
ret <8 x i32> %2
}
define <4 x i32> @stack_fold_vpdpbusd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusd:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
define <4 x i32> @stack_fold_vpdpbusd_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusd_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; CHECK-NEXT: {vex} vpdpbusd %xmm1, %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpbusd.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_vpdpbusd_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusd_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpbusd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
ret <8 x i32> %2
}
define <8 x i32> @stack_fold_vpdpbusd_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusd_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; CHECK-NEXT: {vex} vpdpbusd %ymm1, %ymm2, %ymm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpbusd.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
ret <8 x i32> %2
}
define <4 x i32> @stack_fold_vpdpbusds(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusds:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
ret <4 x i32> %2
}
define <4 x i32> @stack_fold_vpdpbusds_commuted(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusds_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; CHECK-NEXT: {vex} vpdpbusds %xmm1, %xmm2, %xmm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <4 x i32> @llvm.x86.avx512.vpdpbusds.128(<4 x i32> %a0, <4 x i32> %a2, <4 x i32> %a1)
ret <4 x i32> %2
}
define <8 x i32> @stack_fold_vpdpbusds_256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusds_256:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: {vex} vpdpbusds {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm0 # 32-byte Folded Reload
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2)
ret <8 x i32> %2
}
define <8 x i32> @stack_fold_vpdpbusds_256_commuted(<8 x i32> %a0, <8 x i32> %a1, <8 x i32> %a2) {
; CHECK-LABEL: stack_fold_vpdpbusds_256_commuted:
; CHECK: # %bb.0:
; CHECK-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; CHECK-NEXT: #APP
; CHECK-NEXT: nop
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; CHECK-NEXT: {vex} vpdpbusds %ymm1, %ymm2, %ymm0
; CHECK-NEXT: retq
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = call <8 x i32> @llvm.x86.avx512.vpdpbusds.256(<8 x i32> %a0, <8 x i32> %a2, <8 x i32> %a1)
ret <8 x i32> %2
}
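
One pattern worth noting in the checks above (an observation about the expected output, not an addition to the test): vpdpwssd and vpdpwssds multiply signed words by signed words, so their two multiplicand operands can be swapped and the spilled operand still folds straight into the instruction, whereas vpdpbusd and vpdpbusds multiply unsigned bytes from one source by signed bytes from the other, so the commuted variants must first reload the spill into a register:

# commutable: the memory operand folds even in the *_commuted tests
{vex} vpdpwssd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 16-byte Folded Reload
# not commutable: the commuted vpdpbusd tests reload, then use the register form
vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
{vex} vpdpbusd %xmm1, %xmm2, %xmm0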

View File

@ -0,0 +1,170 @@
# RUN: llvm-mc --disassemble %s -triple=i686-apple-darwin9 | FileCheck %s
# CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xf4
# CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xf4
# CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd (%eax), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0x30
# CHECK: {vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusd 4064(%ecx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusd -4096(%edx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd (%eax), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0x30
# CHECK: {vex} vpdpbusd -512(,%ebp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusd 2032(%ecx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusd -2048(%edx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xf4
# CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xf4
# CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds (%eax), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0x30
# CHECK: {vex} vpdpbusds -1024(,%ebp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusds 4064(%ecx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusds -4096(%edx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds (%eax), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0x30
# CHECK: {vex} vpdpbusds -512(,%ebp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusds 2032(%ecx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusds -2048(%edx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xf4
# CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xf4
# CHECK: {vex} vpdpwssd 268435456(%esp,%esi,8), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd 291(%edi,%eax,4), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd (%eax), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0x30
# CHECK: {vex} vpdpwssd -1024(,%ebp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssd 4064(%ecx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssd -4096(%edx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssd 268435456(%esp,%esi,8), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd 291(%edi,%eax,4), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd (%eax), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0x30
# CHECK: {vex} vpdpwssd -512(,%ebp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssd 2032(%ecx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssd -2048(%edx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xf4
# CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xf4
# CHECK: {vex} vpdpwssds 268435456(%esp,%esi,8), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds 291(%edi,%eax,4), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds (%eax), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0x30
# CHECK: {vex} vpdpwssds -1024(,%ebp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssds 4064(%ecx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssds -4096(%edx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssds 268435456(%esp,%esi,8), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds 291(%edi,%eax,4), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds (%eax), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0x30
# CHECK: {vex} vpdpwssds -512(,%ebp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssds 2032(%ecx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssds -2048(%edx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
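
The memory forms in this file exercise the full ModRM/SIB/displacement machinery. As a rough worked decode (standard x86 encoding rules, not part of the test), the index-only addressing case from above breaks down as:

0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
# 0xc4,0xe2,0x55   VEX prefix: 0F38 map, vvvv = %ymm5, L=1 (256-bit), pp = 66
# 0x50             opcode byte for vpdpbusd
# 0x34             ModRM 00/110/100: reg = %ymm6, r/m = 100 -> SIB byte follows
# 0x6d             SIB 01/101/101: scale 2, index %ebp, no base (mod = 00, base = 101)
# 00 fc ff ff      disp32 = -1024
# => {vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6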

View File

@ -0,0 +1,170 @@
# RUN: llvm-mc --disassemble %s -triple=i686 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x50,0xf4
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x50,0xf4
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax]
0xc4,0xe2,0x55,0x50,0x30
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064]
0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096]
0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax]
0xc4,0xe2,0x51,0x50,0x30
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512]
0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032]
0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048]
0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x51,0xf4
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x51,0xf4
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax]
0xc4,0xe2,0x55,0x51,0x30
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064]
0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096]
0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax]
0xc4,0xe2,0x51,0x51,0x30
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512]
0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032]
0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048]
0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x52,0xf4
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x52,0xf4
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax]
0xc4,0xe2,0x55,0x52,0x30
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064]
0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096]
0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax]
0xc4,0xe2,0x51,0x52,0x30
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512]
0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032]
0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048]
0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x53,0xf4
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x53,0xf4
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax]
0xc4,0xe2,0x55,0x53,0x30
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064]
0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096]
0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax]
0xc4,0xe2,0x51,0x53,0x30
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512]
0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032]
0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048]
0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
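
This file decodes the same byte patterns as the AT&T-syntax test above; only the printed operand order changes. For the first register form, for instance, the two RUN lines produce (a side-by-side illustration, not extra test input):

0xc4,0xe2,0x55,0x50,0xf4
# AT&T syntax (sources first, destination last):
#   {vex} vpdpbusd %ymm4, %ymm5, %ymm6
# Intel syntax (destination first):
#   {vex} vpdpbusd ymm6, ymm5, ymm4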

View File

@ -0,0 +1,170 @@
# RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x50,0xf4
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x50,0xf4
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip]
0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064]
0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096]
0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip]
0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512]
0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032]
0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048]
0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x51,0xf4
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x51,0xf4
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip]
0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064]
0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096]
0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip]
0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512]
0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032]
0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048]
0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x52,0xf4
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x52,0xf4
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip]
0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064]
0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096]
0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip]
0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512]
0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032]
0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048]
0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
0xc4,0xe2,0x55,0x53,0xf4
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
0xc4,0xe2,0x51,0x53,0xf4
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip]
0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064]
0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096]
0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip]
0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512]
0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032]
0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048]
0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
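
In the 64-bit encodings above, the second VEX byte also carries the inverted R/X/B extension bits, which is why it changes from 0xe2 to 0xa2 or 0xc2 whenever an r8-r15 register appears in the address. A hedged decode of the three variants of that byte used in this file:

# byte 2 of the 3-byte VEX prefix holds ~R, ~X, ~B and mmmmm (= 00010, the 0F38 map)
# 0xe2 = 1110 0010   no extension needed, e.g. [rcx + 4064]
# 0xa2 = 1010 0010   X = 1 extends the index, selecting r14 in [rbp + 8*r14 + 268435456]
# 0xc2 = 1100 0010   B = 1 extends the base, selecting r8 in [r8 + 4*rax + 291]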

View File

@ -0,0 +1,170 @@
# RUN: llvm-mc --disassemble %s -triple=x86_64-apple-darwin9 | FileCheck %s
# CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xf4
# CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xf4
# CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %ymm5, %ymm6
0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %ymm5, %ymm6
0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd (%rip), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusd -1024(,%rbp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusd 4064(%rcx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusd -4096(%rdx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %xmm5, %xmm6
0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %xmm5, %xmm6
0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusd (%rip), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusd -512(,%rbp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusd 2032(%rcx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusd -2048(%rdx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xf4
# CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xf4
# CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %ymm5, %ymm6
0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %ymm5, %ymm6
0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds (%rip), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusds -1024(,%rbp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpbusds 4064(%rcx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpbusds -4096(%rdx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %xmm5, %xmm6
0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %xmm5, %xmm6
0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpbusds (%rip), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpbusds -512(,%rbp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpbusds 2032(%rcx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpbusds -2048(%rdx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xf4
# CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xf4
# CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %ymm5, %ymm6
0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %ymm5, %ymm6
0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd (%rip), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssd -1024(,%rbp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssd 4064(%rcx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssd -4096(%rdx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %xmm5, %xmm6
0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %xmm5, %xmm6
0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssd (%rip), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssd -512(,%rbp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssd 2032(%rcx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssd -2048(%rdx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff
# CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xf4
# CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xf4
# CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %ymm5, %ymm6
0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %ymm5, %ymm6
0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds (%rip), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssds -1024(,%rbp,2), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff
# CHECK: {vex} vpdpwssds 4064(%rcx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00
# CHECK: {vex} vpdpwssds -4096(%rdx), %ymm5, %ymm6
0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff
# CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %xmm5, %xmm6
0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10
# CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %xmm5, %xmm6
0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00
# CHECK: {vex} vpdpwssds (%rip), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00
# CHECK: {vex} vpdpwssds -512(,%rbp,2), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff
# CHECK: {vex} vpdpwssds 2032(%rcx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00
# CHECK: {vex} vpdpwssds -2048(%rdx), %xmm5, %xmm6
0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff
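
One 64-bit-only case in this file is the RIP-relative form. A rough decode (standard encoding rules, not part of the test):

0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00
# 0xc4,0xe2,0x55   VEX prefix: 0F38 map, vvvv = %ymm5, L=1 (256-bit), pp = 66
# 0x50             opcode byte for vpdpbusd
# 0x35             ModRM 00/110/101: reg = %ymm6; mod = 00 with r/m = 101 is RIP-relative in 64-bit mode
# 00 00 00 00      disp32 = 0
# => {vex} vpdpbusd (%rip), %ymm5, %ymm6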

View File

@ -0,0 +1,226 @@
// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnni --show-encoding < %s | FileCheck %s
// CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
{vex} vpdpbusd %ymm4, %ymm5, %ymm6
// CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
{vex} vpdpbusd %xmm4, %xmm5, %xmm6
// CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusd 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusd 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd (%eax), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x30]
{vex} vpdpbusd (%eax), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusd -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd 4064(%ecx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusd 4064(%ecx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd -4096(%edx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusd -4096(%edx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusd 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusd 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd (%eax), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x30]
{vex} vpdpbusd (%eax), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd -512(,%ebp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusd -512(,%ebp,2), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd 2032(%ecx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusd 2032(%ecx), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd -2048(%edx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusd -2048(%edx), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
{vex} vpdpbusds %ymm4, %ymm5, %ymm6
// CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
{vex} vpdpbusds %xmm4, %xmm5, %xmm6
// CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusds 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusds 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds (%eax), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x30]
{vex} vpdpbusds (%eax), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusds -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds 4064(%ecx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusds 4064(%ecx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds -4096(%edx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusds -4096(%edx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusds 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusds 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds (%eax), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x30]
{vex} vpdpbusds (%eax), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds -512(,%ebp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusds -512(,%ebp,2), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds 2032(%ecx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusds 2032(%ecx), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds -2048(%edx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusds -2048(%edx), %xmm5, %xmm6
// CHECK: vpdpwssd %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
{vex} vpdpwssd %ymm4, %ymm5, %ymm6
// CHECK: vpdpwssd %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
{vex} vpdpwssd %xmm4, %xmm5, %xmm6
// CHECK: vpdpwssd 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssd 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: vpdpwssd 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssd 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: vpdpwssd (%eax), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x30]
{vex} vpdpwssd (%eax), %ymm5, %ymm6
// CHECK: vpdpwssd -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssd -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: vpdpwssd 4064(%ecx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssd 4064(%ecx), %ymm5, %ymm6
// CHECK: vpdpwssd -4096(%edx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssd -4096(%edx), %ymm5, %ymm6
// CHECK: vpdpwssd 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssd 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: vpdpwssd 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssd 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: vpdpwssd (%eax), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x30]
{vex} vpdpwssd (%eax), %xmm5, %xmm6
// CHECK: vpdpwssd -512(,%ebp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssd -512(,%ebp,2), %xmm5, %xmm6
// CHECK: vpdpwssd 2032(%ecx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssd 2032(%ecx), %xmm5, %xmm6
// CHECK: vpdpwssd -2048(%edx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssd -2048(%edx), %xmm5, %xmm6
// CHECK: vpdpwssds %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
{vex} vpdpwssds %ymm4, %ymm5, %ymm6
// CHECK: vpdpwssds %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
{vex} vpdpwssds %xmm4, %xmm5, %xmm6
// CHECK: vpdpwssds 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssds 268435456(%esp,%esi,8), %ymm5, %ymm6
// CHECK: vpdpwssds 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssds 291(%edi,%eax,4), %ymm5, %ymm6
// CHECK: vpdpwssds (%eax), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x30]
{vex} vpdpwssds (%eax), %ymm5, %ymm6
// CHECK: vpdpwssds -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssds -1024(,%ebp,2), %ymm5, %ymm6
// CHECK: vpdpwssds 4064(%ecx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssds 4064(%ecx), %ymm5, %ymm6
// CHECK: vpdpwssds -4096(%edx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssds -4096(%edx), %ymm5, %ymm6
// CHECK: vpdpwssds 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssds 268435456(%esp,%esi,8), %xmm5, %xmm6
// CHECK: vpdpwssds 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssds 291(%edi,%eax,4), %xmm5, %xmm6
// CHECK: vpdpwssds (%eax), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x30]
{vex} vpdpwssds (%eax), %xmm5, %xmm6
// CHECK: vpdpwssds -512(,%ebp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssds -512(,%ebp,2), %xmm5, %xmm6
// CHECK: vpdpwssds 2032(%ecx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssds 2032(%ecx), %xmm5, %xmm6
// CHECK: vpdpwssds -2048(%edx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssds -2048(%edx), %xmm5, %xmm6
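
A small reading note on the vpdpwssd and vpdpwssds checks above: unlike the vpdpbusd/vpdpbusds lines earlier in this file, their CHECK patterns omit the {vex} marker. FileCheck patterns only have to match a substring of the output line, so they still match the printed {vex} form, e.g. (assuming the assembler emits the same text as the disassembly tests above):

// CHECK: vpdpwssd %ymm4, %ymm5, %ymm6
// still matches an output line of the form:
//   {vex} vpdpwssd %ymm4, %ymm5, %ymm6 # encoding: [0xc4,0xe2,0x55,0x52,0xf4]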

View File

@ -0,0 +1,226 @@
// RUN: llvm-mc -triple i686-unknown-unknown -mattr=+avxvnni -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
{vex} vpdpbusd ymm6, ymm5, ymm4
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
{vex} vpdpbusd xmm6, xmm5, xmm4
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x30]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [eax]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x30]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [eax]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
{vex} vpdpbusds ymm6, ymm5, ymm4
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
{vex} vpdpbusds xmm6, xmm5, xmm4
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x30]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [eax]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x30]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [eax]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
{vex} vpdpwssd ymm6, ymm5, ymm4
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
{vex} vpdpwssd xmm6, xmm5, xmm4
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x30]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [eax]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x30]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [eax]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
{vex} vpdpwssds ymm6, ymm5, ymm4
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
{vex} vpdpwssds xmm6, xmm5, xmm4
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x30]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [eax]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*ebp - 1024]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [ecx + 4064]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [edx - 4096]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0xf4,0x00,0x00,0x00,0x10]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [esp + 8*esi + 268435456]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb4,0x87,0x23,0x01,0x00,0x00]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [edi + 4*eax + 291]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x30]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [eax]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*ebp - 512]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [ecx + 2032]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [edx - 2048]

View File

@ -0,0 +1,226 @@
// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnni -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
{vex} vpdpbusd ymm6, ymm5, ymm4
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
{vex} vpdpbusd xmm6, xmm5, xmm4
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [rip]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: {vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusd ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [rip]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: {vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusd xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
{vex} vpdpbusds ymm6, ymm5, ymm4
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
{vex} vpdpbusds xmm6, xmm5, xmm4
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [rip]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: {vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusds ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [rip]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: {vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusds xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
{vex} vpdpwssd ymm6, ymm5, ymm4
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
{vex} vpdpwssd xmm6, xmm5, xmm4
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [rip]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: {vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssd ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [rip]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: {vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssd xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymm4
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
{vex} vpdpwssds ymm6, ymm5, ymm4
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmm4
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
{vex} vpdpwssds xmm6, xmm5, xmm4
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [rip]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [2*rbp - 1024]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [rcx + 4064]
// CHECK: {vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssds ymm6, ymm5, ymmword ptr [rdx - 4096]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: encoding: [0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [rbp + 8*r14 + 268435456]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: encoding: [0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [r8 + 4*rax + 291]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [rip]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [2*rbp - 512]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [rcx + 2032]
// CHECK: {vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048]
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssds xmm6, xmm5, xmmword ptr [rdx - 2048]


@ -0,0 +1,226 @@
// RUN: llvm-mc -triple x86_64-unknown-unknown -mattr=+avxvnni --show-encoding < %s | FileCheck %s
// CHECK: {vex} vpdpbusd %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xf4]
{vex} vpdpbusd %ymm4, %ymm5, %ymm6
// CHECK: {vex} vpdpbusd %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xf4]
{vex} vpdpbusd %xmm4, %xmm5, %xmm6
// CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xa2,0x55,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusd 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xc2,0x55,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusd 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd (%rip), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusd (%rip), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusd -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd 4064(%rcx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusd 4064(%rcx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd -4096(%rdx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x50,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusd -4096(%rdx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusd 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xa2,0x51,0x50,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusd 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xc2,0x51,0x50,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusd 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd (%rip), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusd (%rip), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd -512(,%rbp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusd -512(,%rbp,2), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd 2032(%rcx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusd 2032(%rcx), %xmm5, %xmm6
// CHECK: {vex} vpdpbusd -2048(%rdx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x50,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusd -2048(%rdx), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xf4]
{vex} vpdpbusds %ymm4, %ymm5, %ymm6
// CHECK: {vex} vpdpbusds %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xf4]
{vex} vpdpbusds %xmm4, %xmm5, %xmm6
// CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xa2,0x55,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusds 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xc2,0x55,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusds 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds (%rip), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusds (%rip), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpbusds -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds 4064(%rcx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpbusds 4064(%rcx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds -4096(%rdx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x51,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpbusds -4096(%rdx), %ymm5, %ymm6
// CHECK: {vex} vpdpbusds 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xa2,0x51,0x51,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpbusds 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xc2,0x51,0x51,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpbusds 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds (%rip), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpbusds (%rip), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds -512(,%rbp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpbusds -512(,%rbp,2), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds 2032(%rcx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpbusds 2032(%rcx), %xmm5, %xmm6
// CHECK: {vex} vpdpbusds -2048(%rdx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x51,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpbusds -2048(%rdx), %xmm5, %xmm6
// CHECK: {vex} vpdpwssd %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xf4]
{vex} vpdpwssd %ymm4, %ymm5, %ymm6
// CHECK: {vex} vpdpwssd %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xf4]
{vex} vpdpwssd %xmm4, %xmm5, %xmm6
// CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xa2,0x55,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssd 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xc2,0x55,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssd 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: {vex} vpdpwssd (%rip), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssd (%rip), %ymm5, %ymm6
// CHECK: {vex} vpdpwssd -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssd -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: {vex} vpdpwssd 4064(%rcx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssd 4064(%rcx), %ymm5, %ymm6
// CHECK: {vex} vpdpwssd -4096(%rdx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x52,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssd -4096(%rdx), %ymm5, %ymm6
// CHECK: {vex} vpdpwssd 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xa2,0x51,0x52,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssd 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: {vex} vpdpwssd 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xc2,0x51,0x52,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssd 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: {vex} vpdpwssd (%rip), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssd (%rip), %xmm5, %xmm6
// CHECK: {vex} vpdpwssd -512(,%rbp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssd -512(,%rbp,2), %xmm5, %xmm6
// CHECK: {vex} vpdpwssd 2032(%rcx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssd 2032(%rcx), %xmm5, %xmm6
// CHECK: {vex} vpdpwssd -2048(%rdx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x52,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssd -2048(%rdx), %xmm5, %xmm6
// CHECK: {vex} vpdpwssds %ymm4, %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xf4]
{vex} vpdpwssds %ymm4, %ymm5, %ymm6
// CHECK: {vex} vpdpwssds %xmm4, %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xf4]
{vex} vpdpwssds %xmm4, %xmm5, %xmm6
// CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xa2,0x55,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssds 268435456(%rbp,%r14,8), %ymm5, %ymm6
// CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xc2,0x55,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssds 291(%r8,%rax,4), %ymm5, %ymm6
// CHECK: {vex} vpdpwssds (%rip), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssds (%rip), %ymm5, %ymm6
// CHECK: {vex} vpdpwssds -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0x34,0x6d,0x00,0xfc,0xff,0xff]
{vex} vpdpwssds -1024(,%rbp,2), %ymm5, %ymm6
// CHECK: {vex} vpdpwssds 4064(%rcx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb1,0xe0,0x0f,0x00,0x00]
{vex} vpdpwssds 4064(%rcx), %ymm5, %ymm6
// CHECK: {vex} vpdpwssds -4096(%rdx), %ymm5, %ymm6
// CHECK: encoding: [0xc4,0xe2,0x55,0x53,0xb2,0x00,0xf0,0xff,0xff]
{vex} vpdpwssds -4096(%rdx), %ymm5, %ymm6
// CHECK: {vex} vpdpwssds 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xa2,0x51,0x53,0xb4,0xf5,0x00,0x00,0x00,0x10]
{vex} vpdpwssds 268435456(%rbp,%r14,8), %xmm5, %xmm6
// CHECK: {vex} vpdpwssds 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xc2,0x51,0x53,0xb4,0x80,0x23,0x01,0x00,0x00]
{vex} vpdpwssds 291(%r8,%rax,4), %xmm5, %xmm6
// CHECK: {vex} vpdpwssds (%rip), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x35,0x00,0x00,0x00,0x00]
{vex} vpdpwssds (%rip), %xmm5, %xmm6
// CHECK: {vex} vpdpwssds -512(,%rbp,2), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0x34,0x6d,0x00,0xfe,0xff,0xff]
{vex} vpdpwssds -512(,%rbp,2), %xmm5, %xmm6
// CHECK: {vex} vpdpwssds 2032(%rcx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb1,0xf0,0x07,0x00,0x00]
{vex} vpdpwssds 2032(%rcx), %xmm5, %xmm6
// CHECK: {vex} vpdpwssds -2048(%rdx), %xmm5, %xmm6
// CHECK: encoding: [0xc4,0xe2,0x51,0x53,0xb2,0x00,0xf8,0xff,0xff]
{vex} vpdpwssds -2048(%rdx), %xmm5, %xmm6