From 8e364c680fab1724bd0d4749b8d1177843164aa7 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 15 Apr 2019 17:17:35 +0000
Subject: [PATCH] [X86] Restore the pavg intrinsics.

The pattern we replaced these with may be too hard to match, as demonstrated
by PR41496 and PR41316. This patch restores the intrinsics so we can start
focusing on optimizing them.

I've mostly reverted the original patch that removed them, though I modified
the avx512 intrinsics to not have masking built in.

Differential Revision: https://reviews.llvm.org/D60674

llvm-svn: 358427
---
 clang/include/clang/Basic/BuiltinsX86.def     |   6 +
 clang/lib/Headers/avx2intrin.h                |  12 +-
 clang/lib/Headers/avx512bwintrin.h            |  12 +-
 clang/lib/Headers/emmintrin.h                 |  12 +-
 clang/test/CodeGen/avx2-builtins.c            |  16 +--
 clang/test/CodeGen/avx512bw-builtins.c        |  50 +--------
 clang/test/CodeGen/avx512vlbw-builtins.c      |  68 ++----------
 clang/test/CodeGen/sse2-builtins.c            |  16 +--
 llvm/include/llvm/IR/IntrinsicsX86.td         |  18 +++
 llvm/lib/IR/AutoUpgrade.cpp                   |  36 +++---
 llvm/lib/Target/X86/X86IntrinsicsInfo.h       |   6 +
 .../CodeGen/X86/avx2-intrinsics-fast-isel.ll  |  16 +--
 .../X86/avx2-intrinsics-x86-upgrade.ll        |  20 ----
 llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll  |  32 ++++++
 llvm/test/CodeGen/X86/avx512bw-intrinsics.ll  |  52 +++++++++
 .../test/CodeGen/X86/avx512bwvl-intrinsics.ll | 105 ++++++++++++++++++
 .../CodeGen/X86/sse2-intrinsics-fast-isel.ll  |  38 +------
 .../X86/sse2-intrinsics-x86-upgrade.ll        |  40 -------
 llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll  |  42 +++++++
 19 files changed, 310 insertions(+), 287 deletions(-)

diff --git a/clang/include/clang/Basic/BuiltinsX86.def b/clang/include/clang/Basic/BuiltinsX86.def
index 373ff247735f..5cbab8a3b327 100644
--- a/clang/include/clang/Basic/BuiltinsX86.def
+++ b/clang/include/clang/Basic/BuiltinsX86.def
@@ -263,6 +263,8 @@ TARGET_BUILTIN(__builtin_ia32_paddusw128, "V8sV8sV8s", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_psubusb128, "V16cV16cV16c", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_psubusw128, "V8sV8sV8s", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_pmulhw128, "V8sV8sV8s", "ncV:128:", "sse2")
+TARGET_BUILTIN(__builtin_ia32_pavgb128, "V16cV16cV16c", "ncV:128:", "sse2")
+TARGET_BUILTIN(__builtin_ia32_pavgw128, "V8sV8sV8s", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_pmaxub128, "V16cV16cV16c", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_pmaxsw128, "V8sV8sV8s", "ncV:128:", "sse2")
 TARGET_BUILTIN(__builtin_ia32_pminub128, "V16cV16cV16c", "ncV:128:", "sse2")
@@ -574,6 +576,8 @@ TARGET_BUILTIN(__builtin_ia32_paddusw256, "V16sV16sV16s", "ncV:256:", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psubusb256, "V32cV32cV32c", "ncV:256:", "avx2")
 TARGET_BUILTIN(__builtin_ia32_psubusw256, "V16sV16sV16s", "ncV:256:", "avx2")
 TARGET_BUILTIN(__builtin_ia32_palignr256, "V32cV32cV32cIi", "ncV:256:", "avx2")
+TARGET_BUILTIN(__builtin_ia32_pavgb256, "V32cV32cV32c", "ncV:256:", "avx2")
+TARGET_BUILTIN(__builtin_ia32_pavgw256, "V16sV16sV16s", "ncV:256:", "avx2")
 TARGET_BUILTIN(__builtin_ia32_pblendvb256, "V32cV32cV32cV32c", "ncV:256:", "avx2")
 TARGET_BUILTIN(__builtin_ia32_pblendw256, "V16sV16sV16sIi", "ncV:256:", "avx2")
 TARGET_BUILTIN(__builtin_ia32_phaddw256, "V16sV16sV16s", "ncV:256:", "avx2")
@@ -1053,6 +1057,8 @@ TARGET_BUILTIN(__builtin_ia32_paddsb512, "V64cV64cV64c", "ncV:512:", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_paddsw512, "V32sV32sV32s", "ncV:512:", "avx512bw")
 TARGET_BUILTIN(__builtin_ia32_paddusb512, "V64cV64cV64c", "ncV:512:", "avx512bw")
TARGET_BUILTIN(__builtin_ia32_paddusw512, "V32sV32sV32s", "ncV:512:", "avx512bw") +TARGET_BUILTIN(__builtin_ia32_pavgb512, "V64cV64cV64c", "ncV:512:", "avx512bw") +TARGET_BUILTIN(__builtin_ia32_pavgw512, "V32sV32sV32s", "ncV:512:", "avx512bw") TARGET_BUILTIN(__builtin_ia32_pmaxsb512, "V64cV64cV64c", "ncV:512:", "avx512bw") TARGET_BUILTIN(__builtin_ia32_pmaxsw512, "V32sV32sV32s", "ncV:512:", "avx512bw") TARGET_BUILTIN(__builtin_ia32_pmaxub512, "V64cV64cV64c", "ncV:512:", "avx512bw") diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index f42cfc2e842f..162e83ea2fbc 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -132,21 +132,13 @@ _mm256_andnot_si256(__m256i __a, __m256i __b) static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu8(__m256i __a, __m256i __b) { - typedef unsigned short __v32hu __attribute__((__vector_size__(64))); - return (__m256i)__builtin_convertvector( - ((__builtin_convertvector((__v32qu)__a, __v32hu) + - __builtin_convertvector((__v32qu)__b, __v32hu)) + 1) - >> 1, __v32qu); + return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_avg_epu16(__m256i __a, __m256i __b) { - typedef unsigned int __v16su __attribute__((__vector_size__(64))); - return (__m256i)__builtin_convertvector( - ((__builtin_convertvector((__v16hu)__a, __v16su) + - __builtin_convertvector((__v16hu)__b, __v16su)) + 1) - >> 1, __v16hu); + return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 diff --git a/clang/lib/Headers/avx512bwintrin.h b/clang/lib/Headers/avx512bwintrin.h index 9c752d4f6d43..723829647fae 100644 --- a/clang/lib/Headers/avx512bwintrin.h +++ b/clang/lib/Headers/avx512bwintrin.h @@ -705,11 +705,7 @@ _mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_avg_epu8 (__m512i __A, __m512i __B) { - typedef unsigned short __v64hu __attribute__((__vector_size__(128))); - return (__m512i)__builtin_convertvector( - ((__builtin_convertvector((__v64qu) __A, __v64hu) + - __builtin_convertvector((__v64qu) __B, __v64hu)) + 1) - >> 1, __v64qu); + return (__m512i)__builtin_ia32_pavgb512((__v64qi)__A, (__v64qi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 @@ -732,11 +728,7 @@ _mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) static __inline__ __m512i __DEFAULT_FN_ATTRS512 _mm512_avg_epu16 (__m512i __A, __m512i __B) { - typedef unsigned int __v32su __attribute__((__vector_size__(128))); - return (__m512i)__builtin_convertvector( - ((__builtin_convertvector((__v32hu) __A, __v32su) + - __builtin_convertvector((__v32hu) __B, __v32su)) + 1) - >> 1, __v32hu); + return (__m512i)__builtin_ia32_pavgw512((__v32hi)__A, (__v32hi)__B); } static __inline__ __m512i __DEFAULT_FN_ATTRS512 diff --git a/clang/lib/Headers/emmintrin.h b/clang/lib/Headers/emmintrin.h index b2745d494e6d..3d55f5f2710f 100644 --- a/clang/lib/Headers/emmintrin.h +++ b/clang/lib/Headers/emmintrin.h @@ -2305,11 +2305,7 @@ _mm_adds_epu16(__m128i __a, __m128i __b) static __inline__ __m128i __DEFAULT_FN_ATTRS _mm_avg_epu8(__m128i __a, __m128i __b) { - typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); - return (__m128i)__builtin_convertvector( - ((__builtin_convertvector((__v16qu)__a, __v16hu) + - __builtin_convertvector((__v16qu)__b, __v16hu)) + 1) - >> 1, __v16qu); + return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); } /// 
Computes the rounded averages of corresponding elements of two
@@ -2329,11 +2325,7 @@ _mm_avg_epu8(__m128i __a, __m128i __b)
 static __inline__ __m128i __DEFAULT_FN_ATTRS
 _mm_avg_epu16(__m128i __a, __m128i __b)
 {
-  typedef unsigned int __v8su __attribute__ ((__vector_size__ (32)));
-  return (__m128i)__builtin_convertvector(
-               ((__builtin_convertvector((__v8hu)__a, __v8su) +
-                 __builtin_convertvector((__v8hu)__b, __v8su)) + 1)
-                 >> 1, __v8hu);
+  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
 }
 
 /// Multiplies the corresponding elements of two 128-bit signed [8 x i16]
diff --git a/clang/test/CodeGen/avx2-builtins.c b/clang/test/CodeGen/avx2-builtins.c
index 90c07c1d389d..4ef11478853a 100644
--- a/clang/test/CodeGen/avx2-builtins.c
+++ b/clang/test/CodeGen/avx2-builtins.c
@@ -107,25 +107,13 @@ __m256i test_mm256_andnot_si256(__m256i a, __m256i b) {
 
 __m256i test_mm256_avg_epu8(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_avg_epu8
-  // CHECK-NOT: call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
-  // CHECK: zext <32 x i8> %{{.*}} to <32 x i16>
-  // CHECK: zext <32 x i8> %{{.*}} to <32 x i16>
-  // CHECK: add <32 x i16> %{{.*}}, %{{.*}}
-  // CHECK: add <32 x i16> %{{.*}},
-  // CHECK: lshr <32 x i16> %{{.*}},
-  // CHECK: trunc <32 x i16> %{{.*}} to <32 x i8>
+  // CHECK: call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}})
   return _mm256_avg_epu8(a, b);
 }
 
 __m256i test_mm256_avg_epu16(__m256i a, __m256i b) {
   // CHECK-LABEL: test_mm256_avg_epu16
-  // CHECK-NOT: call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
-  // CHECK: zext <16 x i16> %{{.*}} to <16 x i32>
-  // CHECK: zext <16 x i16> %{{.*}} to <16 x i32>
-  // CHECK: add <16 x i32> %{{.*}}, %{{.*}}
-  // CHECK: add <16 x i32> %{{.*}},
-  // CHECK: lshr <16 x i32> %{{.*}},
-  // CHECK: trunc <16 x i32> %{{.*}} to <16 x i16>
+  // CHECK: call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
   return _mm256_avg_epu16(a, b);
 }
 
diff --git a/clang/test/CodeGen/avx512bw-builtins.c b/clang/test/CodeGen/avx512bw-builtins.c
index 9e75baae77f5..562bc312889e 100644
--- a/clang/test/CodeGen/avx512bw-builtins.c
+++ b/clang/test/CodeGen/avx512bw-builtins.c
@@ -1066,73 +1066,35 @@ __m512i test_mm512_maskz_adds_epu16(__mmask32 __U, __m512i __A, __m512i __B) {
 }
 __m512i test_mm512_avg_epu8(__m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_avg_epu8
-  // CHECK-NOT: @llvm.x86.avx512.mask.pavg.b.512
-  // CHECK: zext <64 x i8> %{{.*}} to <64 x i16>
-  // CHECK: zext <64 x i8> %{{.*}} to <64 x i16>
-  // CHECK: add <64 x i16> %{{.*}}, %{{.*}}
-  // CHECK: add <64 x i16> %{{.*}},
-  // CHECK: lshr <64 x i16> %{{.*}},
-  // CHECK: trunc <64 x i16> %{{.*}} to <64 x i8>
+  // CHECK: @llvm.x86.avx512.pavg.b.512
   return _mm512_avg_epu8(__A,__B);
 }
 __m512i test_mm512_mask_avg_epu8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_mask_avg_epu8
-  // CHECK-NOT: @llvm.x86.avx512.mask.pavg.b.512
-  // CHECK: zext <64 x i8> %{{.*}} to <64 x i16>
-  // CHECK: zext <64 x i8> %{{.*}} to <64 x i16>
-  // CHECK: add <64 x i16> %{{.*}}, %{{.*}}
-  // CHECK: add <64 x i16> %{{.*}},
-  // CHECK: lshr <64 x i16> %{{.*}},
-  // CHECK: trunc <64 x i16> %{{.*}} to <64 x i8>
+  // CHECK: @llvm.x86.avx512.pavg.b.512
   // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}}
   return _mm512_mask_avg_epu8(__W,__U,__A,__B);
 }
 __m512i test_mm512_maskz_avg_epu8(__mmask64 __U, __m512i __A, __m512i __B) {
   // CHECK-LABEL: @test_mm512_maskz_avg_epu8
-  // 
CHECK-NOT: @llvm.x86.avx512.mask.pavg.b.512 - // CHECK: zext <64 x i8> %{{.*}} to <64 x i16> - // CHECK: zext <64 x i8> %{{.*}} to <64 x i16> - // CHECK: add <64 x i16> %{{.*}}, %{{.*}} - // CHECK: add <64 x i16> %{{.*}}, - // CHECK: lshr <64 x i16> %{{.*}}, - // CHECK: trunc <64 x i16> %{{.*}} to <64 x i8> - // CHECK: store <8 x i64> zeroinitializer + // CHECK: @llvm.x86.avx512.pavg.b.512 // CHECK: select <64 x i1> %{{.*}}, <64 x i8> %{{.*}}, <64 x i8> %{{.*}} return _mm512_maskz_avg_epu8(__U,__A,__B); } __m512i test_mm512_avg_epu16(__m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_avg_epu16 - // CHECK-NOT: @llvm.x86.avx512.mask.pavg.w.512 - // CHECK: zext <32 x i16> %{{.*}} to <32 x i32> - // CHECK: zext <32 x i16> %{{.*}} to <32 x i32> - // CHECK: add <32 x i32> %{{.*}}, %{{.*}} - // CHECK: add <32 x i32> %{{.*}}, - // CHECK: lshr <32 x i32> %{{.*}}, - // CHECK: trunc <32 x i32> %{{.*}} to <32 x i16> + // CHECK: @llvm.x86.avx512.pavg.w.512 return _mm512_avg_epu16(__A,__B); } __m512i test_mm512_mask_avg_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_mask_avg_epu16 - // CHECK-NOT: @llvm.x86.avx512.mask.pavg.w.512 - // CHECK: zext <32 x i16> %{{.*}} to <32 x i32> - // CHECK: zext <32 x i16> %{{.*}} to <32 x i32> - // CHECK: add <32 x i32> %{{.*}}, %{{.*}} - // CHECK: add <32 x i32> %{{.*}}, - // CHECK: lshr <32 x i32> %{{.*}}, - // CHECK: trunc <32 x i32> %{{.*}} to <32 x i16> + // CHECK: @llvm.x86.avx512.pavg.w.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_mask_avg_epu16(__W,__U,__A,__B); } __m512i test_mm512_maskz_avg_epu16(__mmask32 __U, __m512i __A, __m512i __B) { // CHECK-LABEL: @test_mm512_maskz_avg_epu16 - // CHECK-NOT: @llvm.x86.avx512.mask.pavg.w.512 - // CHECK: zext <32 x i16> %{{.*}} to <32 x i32> - // CHECK: zext <32 x i16> %{{.*}} to <32 x i32> - // CHECK: add <32 x i32> %{{.*}}, %{{.*}} - // CHECK: add <32 x i32> %{{.*}}, - // CHECK: lshr <32 x i32> %{{.*}}, - // CHECK: trunc <32 x i32> %{{.*}} to <32 x i16> - // CHECK: store <8 x i64> zeroinitializer + // CHECK: @llvm.x86.avx512.pavg.w.512 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} return _mm512_maskz_avg_epu16(__U,__A,__B); } diff --git a/clang/test/CodeGen/avx512vlbw-builtins.c b/clang/test/CodeGen/avx512vlbw-builtins.c index 80efe72787a2..1d853844beff 100644 --- a/clang/test/CodeGen/avx512vlbw-builtins.c +++ b/clang/test/CodeGen/avx512vlbw-builtins.c @@ -1179,101 +1179,49 @@ __m256i test_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) { } __m128i test_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_avg_epu8 - // CHECK-NOT: @llvm.x86.sse2.pavg.b - // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> - // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> - // CHECK: add <16 x i16> %{{.*}}, %{{.*}} - // CHECK: add <16 x i16> %{{.*}}, - // CHECK: lshr <16 x i16> %{{.*}}, - // CHECK: trunc <16 x i16> %{{.*}} to <16 x i8> + // CHECK: @llvm.x86.sse2.pavg.b // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_mask_avg_epu8(__W,__U,__A,__B); } __m128i test_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_avg_epu8 - // CHECK-NOT: @llvm.x86.sse2.pavg.b - // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> - // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> - // CHECK: add <16 x i16> %{{.*}}, %{{.*}} - // CHECK: add <16 x i16> %{{.*}}, - // CHECK: lshr <16 x i16> %{{.*}}, - // CHECK: 
trunc <16 x i16> %{{.*}} to <16 x i8> - // CHECK: store <2 x i64> zeroinitializer + // CHECK: @llvm.x86.sse2.pavg.b // CHECK: select <16 x i1> %{{.*}}, <16 x i8> %{{.*}}, <16 x i8> %{{.*}} return _mm_maskz_avg_epu8(__U,__A,__B); } __m256i test_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_avg_epu8 - // CHECK-NOT: @llvm.x86.avx2.pavg.b - // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> - // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> - // CHECK: add <32 x i16> %{{.*}}, %{{.*}} - // CHECK: add <32 x i16> %{{.*}}, - // CHECK: lshr <32 x i16> %{{.*}}, - // CHECK: trunc <32 x i16> %{{.*}} to <32 x i8> + // CHECK: @llvm.x86.avx2.pavg.b // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_mask_avg_epu8(__W,__U,__A,__B); } __m256i test_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_avg_epu8 - // CHECK-NOT: @llvm.x86.avx2.pavg.b - // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> - // CHECK: zext <32 x i8> %{{.*}} to <32 x i16> - // CHECK: add <32 x i16> %{{.*}}, %{{.*}} - // CHECK: add <32 x i16> %{{.*}}, - // CHECK: lshr <32 x i16> %{{.*}}, - // CHECK: trunc <32 x i16> %{{.*}} to <32 x i8> - // CHECK: store <4 x i64> zeroinitializer + // CHECK: @llvm.x86.avx2.pavg.b // CHECK: select <32 x i1> %{{.*}}, <32 x i8> %{{.*}}, <32 x i8> %{{.*}} return _mm256_maskz_avg_epu8(__U,__A,__B); } __m128i test_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_mask_avg_epu16 - // CHECK-NOT: @llvm.x86.sse2.pavg.w - // CHECK: zext <8 x i16> %{{.*}} to <8 x i32> - // CHECK: zext <8 x i16> %{{.*}} to <8 x i32> - // CHECK: add <8 x i32> %{{.*}}, %{{.*}} - // CHECK: add <8 x i32> %{{.*}}, - // CHECK: lshr <8 x i32> %{{.*}}, - // CHECK: trunc <8 x i32> %{{.*}} to <8 x i16> + // CHECK: @llvm.x86.sse2.pavg.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_mask_avg_epu16(__W,__U,__A,__B); } __m128i test_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) { // CHECK-LABEL: @test_mm_maskz_avg_epu16 - // CHECK-NOT: @llvm.x86.sse2.pavg.w - // CHECK: zext <8 x i16> %{{.*}} to <8 x i32> - // CHECK: zext <8 x i16> %{{.*}} to <8 x i32> - // CHECK: add <8 x i32> %{{.*}}, %{{.*}} - // CHECK: add <8 x i32> %{{.*}}, - // CHECK: lshr <8 x i32> %{{.*}}, - // CHECK: trunc <8 x i32> %{{.*}} to <8 x i16> - // CHECK: store <2 x i64> zeroinitializer + // CHECK: @llvm.x86.sse2.pavg.w // CHECK: select <8 x i1> %{{.*}}, <8 x i16> %{{.*}}, <8 x i16> %{{.*}} return _mm_maskz_avg_epu16(__U,__A,__B); } __m256i test_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_mask_avg_epu16 - // CHECK-NOT: @llvm.x86.avx2.pavg.w - // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> - // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> - // CHECK: add <16 x i32> %{{.*}}, %{{.*}} - // CHECK: add <16 x i32> %{{.*}}, - // CHECK: lshr <16 x i32> %{{.*}}, - // CHECK: trunc <16 x i32> %{{.*}} to <16 x i16> + // CHECK: @llvm.x86.avx2.pavg.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_mask_avg_epu16(__W,__U,__A,__B); } __m256i test_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) { // CHECK-LABEL: @test_mm256_maskz_avg_epu16 - // CHECK-NOT: @llvm.x86.avx2.pavg.w - // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> - // CHECK: zext <16 x i16> %{{.*}} to <16 x i32> - // CHECK: add <16 x i32> %{{.*}}, %{{.*}} - // CHECK: add <16 x i32> 
%{{.*}}, - // CHECK: lshr <16 x i32> %{{.*}}, - // CHECK: trunc <16 x i32> %{{.*}} to <16 x i16> - // CHECK: store <4 x i64> zeroinitializer + // CHECK: @llvm.x86.avx2.pavg.w // CHECK: select <16 x i1> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}} return _mm256_maskz_avg_epu16(__U,__A,__B); } diff --git a/clang/test/CodeGen/sse2-builtins.c b/clang/test/CodeGen/sse2-builtins.c index 6e3a0a707288..acf4b20dd147 100644 --- a/clang/test/CodeGen/sse2-builtins.c +++ b/clang/test/CodeGen/sse2-builtins.c @@ -100,25 +100,13 @@ __m128i test_mm_andnot_si128(__m128i A, __m128i B) { __m128i test_mm_avg_epu8(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_avg_epu8 - // CHECK-NOT: call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) - // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> - // CHECK: zext <16 x i8> %{{.*}} to <16 x i16> - // CHECK: add <16 x i16> %{{.*}}, %{{.*}} - // CHECK: add <16 x i16> %{{.*}}, - // CHECK: lshr <16 x i16> %{{.*}}, - // CHECK:trunc <16 x i16> %{{.*}} to <16 x i8> + // CHECK: call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_avg_epu8(A, B); } __m128i test_mm_avg_epu16(__m128i A, __m128i B) { // CHECK-LABEL: test_mm_avg_epu16 - // CHECK-NOT: call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) - // CHECK: zext <8 x i16> %{{.*}} to <8 x i32> - // CHECK: zext <8 x i16> %{{.*}} to <8 x i32> - // CHECK: add <8 x i32> %{{.*}}, %{{.*}} - // CHECK: add <8 x i32> %{{.*}}, - // CHECK: lshr <8 x i32> %{{.*}}, - // CHECK: trunc <8 x i32> %{{.*}} to <8 x i16> + // CHECK: call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_avg_epu16(A, B); } diff --git a/llvm/include/llvm/IR/IntrinsicsX86.td b/llvm/include/llvm/IR/IntrinsicsX86.td index e4f2acdfa993..06b603a788ba 100644 --- a/llvm/include/llvm/IR/IntrinsicsX86.td +++ b/llvm/include/llvm/IR/IntrinsicsX86.td @@ -366,6 +366,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". def int_x86_sse2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd128">, Intrinsic<[llvm_v4i32_ty], [llvm_v8i16_ty, llvm_v8i16_ty], [IntrNoMem, Commutative]>; + def int_x86_sse2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb128">, + Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, + llvm_v16i8_ty], [IntrNoMem, Commutative]>; + def int_x86_sse2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw128">, + Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty, + llvm_v8i16_ty], [IntrNoMem, Commutative]>; def int_x86_sse2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw128">, Intrinsic<[llvm_v2i64_ty], [llvm_v16i8_ty, llvm_v16i8_ty], [IntrNoMem, Commutative]>; @@ -1330,6 +1336,12 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.". 
   def int_x86_avx2_pmadd_wd : GCCBuiltin<"__builtin_ia32_pmaddwd256">,
              Intrinsic<[llvm_v8i32_ty], [llvm_v16i16_ty,
                         llvm_v16i16_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx2_pavg_b : GCCBuiltin<"__builtin_ia32_pavgb256">,
+              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
+                         llvm_v32i8_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx2_pavg_w : GCCBuiltin<"__builtin_ia32_pavgw256">,
+              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
+                         llvm_v16i16_ty], [IntrNoMem, Commutative]>;
   def int_x86_avx2_psad_bw : GCCBuiltin<"__builtin_ia32_psadbw256">,
              Intrinsic<[llvm_v4i64_ty], [llvm_v32i8_ty,
                         llvm_v32i8_ty], [IntrNoMem, Commutative]>;
@@ -3533,6 +3545,12 @@ let TargetPrefix = "x86" in {
   def int_x86_avx512_pmulh_w_512 : GCCBuiltin<"__builtin_ia32_pmulhw512">,
           Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty,
                      llvm_v32i16_ty], [IntrNoMem, Commutative]>;
+  def int_x86_avx512_pavg_b_512 : GCCBuiltin<"__builtin_ia32_pavgb512">,
+          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
+                    [IntrNoMem]>;
+  def int_x86_avx512_pavg_w_512 : GCCBuiltin<"__builtin_ia32_pavgw512">,
+          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty],
+                    [IntrNoMem]>;
   def int_x86_avx512_pmaddw_d_512 : GCCBuiltin<"__builtin_ia32_pmaddwd512">,
           Intrinsic<[llvm_v16i32_ty], [llvm_v32i16_ty,
                      llvm_v32i16_ty], [IntrNoMem, Commutative]>;
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 4f0a32efcae3..84f250afa9a7 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -386,8 +386,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
       Name.startswith("avx512.mask.prol.") || // Added in 8.0
       Name.startswith("avx512.ptestm") || //Added in 6.0
       Name.startswith("avx512.ptestnm") || //Added in 6.0
-      Name.startswith("sse2.pavg") || // Added in 6.0
-      Name.startswith("avx2.pavg") || // Added in 6.0
       Name.startswith("avx512.mask.pavg")) // Added in 6.0
     return true;
 
@@ -1547,6 +1545,21 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder,
       IID = Intrinsic::x86_avx512_conflict_q_512;
     else
       llvm_unreachable("Unexpected intrinsic");
+  } else if (Name.startswith("pavg.")) {
+    if (Name[5] == 'b' && VecWidth == 128)
+      IID = Intrinsic::x86_sse2_pavg_b;
+    else if (Name[5] == 'b' && VecWidth == 256)
+      IID = Intrinsic::x86_avx2_pavg_b;
+    else if (Name[5] == 'b' && VecWidth == 512)
+      IID = Intrinsic::x86_avx512_pavg_b_512;
+    else if (Name[5] == 'w' && VecWidth == 128)
+      IID = Intrinsic::x86_sse2_pavg_w;
+    else if (Name[5] == 'w' && VecWidth == 256)
+      IID = Intrinsic::x86_avx2_pavg_w;
+    else if (Name[5] == 'w' && VecWidth == 512)
+      IID = Intrinsic::x86_avx512_pavg_w_512;
+    else
+      llvm_unreachable("Unexpected intrinsic");
   } else
     return false;
 
@@ -2974,25 +2987,6 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     LoadInst *LI = Builder.CreateAlignedLoad(VTy, BC, VTy->getBitWidth() / 8);
     LI->setMetadata(M->getMDKindID("nontemporal"), Node);
     Rep = LI;
-  } else if (IsX86 &&
-             (Name.startswith("sse2.pavg") || Name.startswith("avx2.pavg") ||
-              Name.startswith("avx512.mask.pavg"))) {
-    // llvm.x86.sse2.pavg.b/w, llvm.x86.avx2.pavg.b/w,
-    // llvm.x86.avx512.mask.pavg.b/w
-    Value *A = CI->getArgOperand(0);
-    Value *B = CI->getArgOperand(1);
-    VectorType *ZextType = VectorType::getExtendedElementVectorType(
-        cast<VectorType>(A->getType()));
-    Value *ExtendedA = Builder.CreateZExt(A, ZextType);
-    Value *ExtendedB = Builder.CreateZExt(B, ZextType);
-    Value *Sum = Builder.CreateAdd(ExtendedA, ExtendedB);
-    Value *AddOne = Builder.CreateAdd(Sum, ConstantInt::get(ZextType, 1));
-    Value 
*ShiftR = Builder.CreateLShr(AddOne, ConstantInt::get(ZextType, 1)); - Rep = Builder.CreateTrunc(ShiftR, A->getType()); - if (CI->getNumArgOperands() > 2) { - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, - CI->getArgOperand(2)); - } } else if (IsX86 && (Name.startswith("fma.vfmadd.") || Name.startswith("fma.vfmsub.") || Name.startswith("fma.vfnmadd.") || diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h index 8243c27b2ecd..54168762c7a4 100644 --- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h +++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h @@ -370,6 +370,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx2_pblendvb, BLENDV, X86ISD::BLENDV, 0), X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0), @@ -819,6 +821,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(avx512_pavg_b_512, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(avx512_pavg_w_512, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0), @@ -1030,6 +1034,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), + X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll index 8720c86edfff..f29891e6f8a3 100644 --- a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll @@ -195,15 +195,11 @@ define <4 x i64> @test_mm256_avg_epu8(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <32 x i8> %arg1 = bitcast <4 x i64> %a1 to <32 x i8> - %zext0 = zext <32 x i8> %arg0 to <32 x i16> - %zext1 = zext <32 x i8> %arg1 to <32 x i16> - %add = add <32 x i16> %zext0, %zext1 - %add1 = add <32 x i16> %add, - %lshr = lshr <32 x i16> %add1, - %res = trunc <32 x i16> %lshr to <32 x i8> + %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %arg0, <32 x i8> %arg1) %bc = bitcast <32 x i8> %res to <4 x i64> ret <4 x i64> %bc } +declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-LABEL: 
test_mm256_avg_epu16: @@ -212,15 +208,11 @@ define <4 x i64> @test_mm256_avg_epu16(<4 x i64> %a0, <4 x i64> %a1) nounwind { ; CHECK-NEXT: ret{{[l|q]}} %arg0 = bitcast <4 x i64> %a0 to <16 x i16> %arg1 = bitcast <4 x i64> %a1 to <16 x i16> - %zext0 = zext <16 x i16> %arg0 to <16 x i32> - %zext1 = zext <16 x i16> %arg1 to <16 x i32> - %add = add <16 x i32> %zext0, %zext1 - %add1 = add <16 x i32> %add, - %lshr = lshr <16 x i32> %add1, - %res = trunc <16 x i32> %lshr to <16 x i16> + %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %arg0, <16 x i16> %arg1) %bc = bitcast <16 x i16> %res to <4 x i64> ret <4 x i64> %bc } +declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone define <4 x i64> @test_mm256_blend_epi16(<4 x i64> %a0, <4 x i64> %a1) { ; CHECK-LABEL: test_mm256_blend_epi16: diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll index b340a66def96..fc6929ce205d 100644 --- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll @@ -529,26 +529,6 @@ define <8 x i32> @mm256_min_epu32(<8 x i32> %a0, <8 x i32> %a1) { } declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone -define <32 x i8> @mm256_avg_epu8(<32 x i8> %a0, <32 x i8> %a1) { -; CHECK-LABEL: mm256_avg_epu8: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpavgb %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} - %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] - ret <32 x i8> %res -} -declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone - -define <16 x i16> @mm256_avg_epu16(<16 x i16> %a0, <16 x i16> %a1) { -; CHECK-LABEL: mm256_avg_epu16: -; CHECK: ## %bb.0: -; CHECK-NEXT: vpavgw %ymm1, %ymm0, %ymm0 -; CHECK-NEXT: ret{{[l|q]}} - %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] - ret <16 x i16> %res -} -declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone - define <32 x i8> @test_x86_avx2_pabs_b(<32 x i8> %a0) { ; CHECK-LABEL: test_x86_avx2_pabs_b: ; CHECK: ## %bb.0: diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll index de20c07d4139..aa68cfd70a88 100644 --- a/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/avx2-intrinsics-x86.ll @@ -151,6 +151,38 @@ define <32 x i8> @test_x86_avx2_packuswb_fold() { } +define <32 x i8> @test_x86_avx2_pavg_b(<32 x i8> %a0, <32 x i8> %a1) { +; AVX2-LABEL: test_x86_avx2_pavg_b: +; AVX2: # %bb.0: +; AVX2-NEXT: vpavgb %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xe0,0xc1] +; AVX2-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pavg_b: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpavgb %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1] + ret <32 x i8> %res +} +declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone + + +define <16 x i16> @test_x86_avx2_pavg_w(<16 x i16> %a0, <16 x i16> %a1) { +; AVX2-LABEL: test_x86_avx2_pavg_w: +; AVX2: # %bb.0: +; AVX2-NEXT: vpavgw %ymm1, %ymm0, %ymm0 # encoding: [0xc5,0xfd,0xe3,0xc1] +; AVX2-NEXT: ret{{[l|q]}} # encoding: [0xc3] +; +; AVX512VL-LABEL: test_x86_avx2_pavg_w: +; AVX512VL: # %bb.0: +; AVX512VL-NEXT: vpavgw %ymm1, %ymm0, %ymm0 # EVEX 
TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1] +; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3] + %res = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1] + ret <16 x i16> %res +} +declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone + + define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) { ; AVX2-LABEL: test_x86_avx2_pmadd_wd: ; AVX2: # %bb.0: diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll index 71cfd891bb18..613bbd6633d6 100644 --- a/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics.ll @@ -763,6 +763,58 @@ define <32 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_512(<32 x i16> %x0, <32 ret <32 x i16> %res2 } +declare <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8>, <64 x i8>) + +define <64 x i8> @test_int_x86_avx512_mask_pavg_b_512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_pavg_b_512: +; X86: # %bb.0: +; X86-NEXT: vpavgb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe0,0xd9] +; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe0,0xd1] +; X86-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_pavg_b_512: +; X64: # %bb.0: +; X64-NEXT: vpavgb %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe0,0xd9] +; X64-NEXT: kmovq %rdi, %k1 # encoding: [0xc4,0xe1,0xfb,0x92,0xcf] +; X64-NEXT: vpavgb %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe0,0xd1] +; X64-NEXT: vpaddb %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1) + %2 = bitcast i64 %x3 to <64 x i1> + %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %x2 + %4 = call <64 x i8> @llvm.x86.avx512.pavg.b.512(<64 x i8> %x0, <64 x i8> %x1) + %res2 = add <64 x i8> %3, %4 + ret <64 x i8> %res2 +} + +declare <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16>, <32 x i16>) + +define <32 x i16> @test_int_x86_avx512_mask_pavg_w_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_pavg_w_512: +; X86: # %bb.0: +; X86-NEXT: vpavgw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe3,0xd9] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe3,0xd1] +; X86-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_pavg_w_512: +; X64: # %bb.0: +; X64-NEXT: vpavgw %zmm1, %zmm0, %zmm3 # encoding: [0x62,0xf1,0x7d,0x48,0xe3,0xd9] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpavgw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe3,0xd1] +; X64-NEXT: vpaddw %zmm3, %zmm2, %zmm0 # encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %x2 + %4 = call <32 x i16> @llvm.x86.avx512.pavg.w.512(<32 x i16> %x0, <32 x i16> %x1) + %res2 = add <32 x i16> 
%3, %4 + ret <32 x i16> %res2 +} + declare <64 x i8> @llvm.x86.avx512.pshuf.b.512(<64 x i8>, <64 x i8>) define <64 x i8>@test_int_x86_avx512_pshuf_b_512(<64 x i8> %x0, <64 x i8> %x1) { diff --git a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll index b9937603f167..bec461abed8b 100644 --- a/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -1290,6 +1290,111 @@ define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 ret <16 x i16> %res2 } +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_pavg_b_128: +; X86: # %bb.0: +; X86-NEXT: vpavgb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xd9] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1] +; X86-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_pavg_b_128: +; X64: # %bb.0: +; X64-NEXT: vpavgb %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xd9] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpavgb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1] +; X64-NEXT: vpaddb %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %x0, <16 x i8> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %x2 + %4 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %x0, <16 x i8> %x1) + %res2 = add <16 x i8> %3, %4 + ret <16 x i8> %res2 +} + +declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) + +define <32 x i8> @test_int_x86_avx512_mask_pavg_b_256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_pavg_b_256: +; X86: # %bb.0: +; X86-NEXT: vpavgb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xd9] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1] +; X86-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_pavg_b_256: +; X64: # %bb.0: +; X64-NEXT: vpavgb %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe0,0xd9] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpavgb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe0,0xd1] +; X64-NEXT: vpaddb %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %x0, <32 x i8> %x1) + %2 = bitcast i32 %x3 to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %x2 + %4 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %x0, <32 x i8> %x1) + %res2 = add <32 x i8> %3, %4 + ret <32 x i8> %res2 +} + +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 
%x3) { +; X86-LABEL: test_int_x86_avx512_mask_pavg_w_128: +; X86: # %bb.0: +; X86-NEXT: vpavgw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xd9] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1] +; X86-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_pavg_w_128: +; X64: # %bb.0: +; X64-NEXT: vpavgw %xmm1, %xmm0, %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xd9] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpavgw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1] +; X64-NEXT: vpaddw %xmm3, %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %x0, <8 x i16> %x1) + %2 = bitcast i8 %x3 to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %x2 + %4 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %x0, <8 x i16> %x1) + %res2 = add <8 x i16> %3, %4 + ret <8 x i16> %res2 +} + +declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) + +define <16 x i16> @test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) { +; X86-LABEL: test_int_x86_avx512_mask_pavg_w_256: +; X86: # %bb.0: +; X86-NEXT: vpavgw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xd9] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1] +; X86-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_int_x86_avx512_mask_pavg_w_256: +; X64: # %bb.0: +; X64-NEXT: vpavgw %ymm1, %ymm0, %ymm3 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xd9] +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpavgw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1] +; X64-NEXT: vpaddw %ymm3, %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %x0, <16 x i16> %x1) + %2 = bitcast i16 %x3 to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %x2 + %4 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %x0, <16 x i16> %x1) + %res2 = add <16 x i16> %3, %4 + ret <16 x i16> %res2 +} + declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) define <8 x i16> @test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index beb895537ee9..92516b78760e 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -333,28 +333,15 @@ define <2 x i64> @test_mm_avg_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; ; AVX512-LABEL: test_mm_avg_epu8: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxbw %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xc0] -; AVX512-NEXT: # ymm0 = 
xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero -; AVX512-NEXT: vpmovzxbw %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xc9] -; AVX512-NEXT: # ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero -; AVX512-NEXT: vpaddw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1] -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0x76,0xc9] -; AVX512-NEXT: vpsubw %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf9,0xc1] -; AVX512-NEXT: vpsrlw $1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x71,0xd0,0x01] -; AVX512-NEXT: vpmovwb %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0] -; AVX512-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %zext0 = zext <16 x i8> %arg0 to <16 x i16> - %zext1 = zext <16 x i8> %arg1 to <16 x i16> - %add = add <16 x i16> %zext0, %zext1 - %add1 = add <16 x i16> %add, - %lshr = lshr <16 x i16> %add1, - %res = trunc <16 x i16> %lshr to <16 x i8> + %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) %bc = bitcast <16 x i8> %res to <2 x i64> ret <2 x i64> %bc } +declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %arg0, <16 x i8> %arg1) nounwind readnone define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; SSE-LABEL: test_mm_avg_epu16: @@ -369,28 +356,15 @@ define <2 x i64> @test_mm_avg_epu16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; ; AVX512-LABEL: test_mm_avg_epu16: ; AVX512: # %bb.0: -; AVX512-NEXT: vpmovzxwd %xmm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xc0] -; AVX512-NEXT: # ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX512-NEXT: vpmovzxwd %xmm1, %ymm1 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x33,0xc9] -; AVX512-NEXT: # ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero -; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 # encoding: [0xc5,0xf5,0x76,0xc9] -; AVX512-NEXT: vpsubd %ymm1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfa,0xc1] -; AVX512-NEXT: vpsrld $1, %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x72,0xd0,0x01] -; AVX512-NEXT: vpmovdw %ymm0, %xmm0 # encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0] -; AVX512-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1] ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %zext0 = zext <8 x i16> %arg0 to <8 x i32> - %zext1 = zext <8 x i16> %arg1 to <8 x i32> - %add = add <8 x i32> %zext0, %zext1 - %add1 = add <8 x i32> %add, - %lshr = lshr <8 x i32> %add1, - %res = trunc <8 x i32> %lshr to <8 x i16> + %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x 
i16> %arg0, <8 x i16> %arg1) %bc = bitcast <8 x i16> %res to <2 x i64> ret <2 x i64> %bc } +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_bslli_si128(<2 x i64> %a0) nounwind { ; SSE-LABEL: test_mm_bslli_si128: diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll index 7e4703f6957d..d57778507e87 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -630,46 +630,6 @@ define <2 x double> @test_x86_sse2_div_sd(<2 x double> %a0, <2 x double> %a1) { } declare <2 x double> @llvm.x86.sse2.div.sd(<2 x double>, <2 x double>) nounwind readnone -define <16 x i8> @mm_avg_epu8(<16 x i8> %a0, <16 x i8> %a1) { -; SSE-LABEL: mm_avg_epu8: -; SSE: ## %bb.0: -; SSE-NEXT: pavgb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe0,0xc1] -; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: mm_avg_epu8: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe0,0xc1] -; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: mm_avg_epu8: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1] -; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} -declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone - -define <8 x i16> @mm_avg_epu16(<8 x i16> %a0, <8 x i16> %a1) { -; SSE-LABEL: mm_avg_epu16: -; SSE: ## %bb.0: -; SSE-NEXT: pavgw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe3,0xc1] -; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: mm_avg_epu16: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe3,0xc1] -; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: mm_avg_epu16: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1] -; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone - define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) { ; SSE-LABEL: test_x86_sse2_pmulu_dq: diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll index 44585b77ce0d..0ae2c9e3ccfe 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -946,6 +946,48 @@ define <16 x i8> @test_x86_sse2_packuswb_128_fold() { } +define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_pavg_b: +; SSE: ## %bb.0: +; SSE-NEXT: pavgb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe0,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse2_pavg_b: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe0,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse2_pavg_b: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} 
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_pavg_w: +; SSE: ## %bb.0: +; SSE-NEXT: pavgw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe3,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse2_pavg_w: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe3,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse2_pavg_w: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone + + define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) { ; SSE-LABEL: test_x86_sse2_pmadd_wd: ; SSE: ## %bb.0:
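
For reference, the semantics these intrinsics expose: PAVGB/PAVGW compute the rounded unsigned average avg(a, b) = (a + b + 1) >> 1, with the sum formed in a type wide enough that it cannot wrap. A minimal C sketch of one lane and of the intrinsic in use follows; the helper names are illustrative only and are not part of this patch.

#include <stdint.h>
#include <emmintrin.h>

/* One lane of PAVGB: widen to 16 bits so a + b + 1 cannot overflow,
   then halve, rounding up. Illustrative helper, not in the patch. */
static inline uint8_t pavgb_scalar(uint8_t a, uint8_t b) {
  return (uint8_t)(((uint16_t)a + (uint16_t)b + 1u) >> 1);
}

/* With the intrinsics restored, clang lowers _mm_avg_epu8 to a single
   llvm.x86.sse2.pavg.b call (one pavgb instruction) instead of the
   zext/add/lshr/trunc pattern that PR41496 and PR41316 showed was hard
   to re-match in the backend. */
__m128i avg_bytes(__m128i a, __m128i b) {
  return _mm_avg_epu8(a, b);
}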