From 1bcf9847a4fa99520da32b98495749b2a81a5bf6 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 25 May 2016 10:56:23 +0000
Subject: [PATCH] [X86][AVX2] Added more fast-isel tests to match clang/test/CodeGen/avx2-builtins.c

llvm-svn: 270685
---
 .../CodeGen/X86/avx2-intrinsics-fast-isel.ll  | 697 +++++++++++++++++-
 1 file changed, 660 insertions(+), 37 deletions(-)

diff --git a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
index c2fc1861f53d..1f93c7a9d8e3 100644
--- a/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -120,11 +120,111 @@ define <4 x i64> @test_mm256_add_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
   ret <4 x i64> %res
 }
 
-; TODO test_mm256_adds_epi16 (__m256i a, __m256i b)
-; TODO test_mm256_adds_epi8 (__m256i a, __m256i b)
-; TODO test_mm256_adds_epu16 (__m256i a, __m256i b)
-; TODO test_mm256_adds_epu8 (__m256i a, __m256i b)
-; TODO test_mm256_alignr_epi8 (__m256i a, __m256i b, const int count)
+define <4 x i64> @test_mm256_adds_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_adds_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_adds_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpaddsb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %bc = bitcast <32 x i8> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_adds_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_adds_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpaddsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %bc = bitcast <16 x i16> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_adds_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_adds_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpaddusb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+  %res = call <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %bc = bitcast <32 x i8> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.paddus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_adds_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_adds_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_adds_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpaddusw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+  %res = call <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %bc = bitcast <16 x i16> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_alignr_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_alignr_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0,1],ymm0[18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16,17]
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49>
+  %res = bitcast <32 x i8> %shuf to <4 x i64>
+  ret <4 x i64> %res
+}
+
+define <4 x i64> @test2_mm256_alignr_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test2_mm256_alignr_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
+; X32-NEXT: retl
+;
+; X64-LABEL: test2_mm256_alignr_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],ymm1[0],ymm0[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31],ymm1[16]
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+  %shuf = shufflevector <32 x i8> %arg0, <32 x i8> %arg1, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
+  %res = bitcast <32 x i8> %shuf to <4 x i64>
+  ret <4 x i64> %res
+}
 
 define <4 x i64> @test_mm256_and_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm256_and_si256:
@@ -319,19 +419,220 @@ define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind
   ret <4 x i64> %res
 }
 
-; TODO test_mm256_cvtepi16_epi32 (__m128i a)
-; TODO test_mm256_cvtepi16_epi64 (__m128i a)
-; TODO test_mm256_cvtepi32_epi64 (__m128i a)
-; TODO test_mm256_cvtepi8_epi16 (__m128i a)
-; TODO test_mm256_cvtepi8_epi32 (__m128i a)
-; TODO test_mm256_cvtepi8_epi64 (__m128i a)
-; TODO test_mm256_cvtepu16_epi32 (__m128i a)
-; TODO test_mm256_cvtepu16_epi64 (__m128i a)
-; TODO test_mm256_cvtepu32_epi64 (__m128i a)
-; TODO test_mm256_cvtepu8_epi16 (__m128i a)
-; TODO test_mm256_cvtepu8_epi32 (__m128i a)
-; TODO test_mm256_cvtepu8_epi64 (__m128i a)
-; TODO test_mm256_extracti128_si256 (__m256i a, const int imm8)
+define <4 x i64> @test_mm256_cvtepi8_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi8_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxbw %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi8_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxbw %xmm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %call = call <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8> %arg0)
+  %res = bitcast <16 x i16> %call to <4 x i64>
+  ret <4 x i64> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmovsxbw(<16 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtepi8_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi8_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxbd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi8_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxbd %xmm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %call = call <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8> %arg0)
+  %res = bitcast <8 x i32> %call to <4 x i64>
+  ret <4 x i64> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovsxbd(<16 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtepi8_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi8_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxbq %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi8_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxbq %xmm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %call = call <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8> %arg0)
+  ret <4 x i64> %call
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxbq(<16 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtepi16_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi16_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxwd %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi16_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxwd %xmm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %call = call <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16> %arg0)
+  %res = bitcast <8 x i32> %call to <4 x i64>
+  ret <4 x i64> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovsxwd(<8 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtepi16_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi16_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxwq %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi16_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxwq %xmm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %call = call <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16> %arg0)
+  ret <4 x i64> %call
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxwq(<8 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtepi32_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepi32_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovsxdq %xmm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepi32_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovsxdq %xmm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %res = call <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32> %arg0)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovsxdq(<4 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtepu8_epi16(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu8_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu8_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
+; X64-NEXT: retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %call = call <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8> %arg0)
+  %res = bitcast <16 x i16> %call to <4 x i64>
+  ret <4 x i64> %res
+}
+declare <16 x i16> @llvm.x86.avx2.pmovzxbw(<16 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtepu8_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu8_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu8_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxbd {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; X64-NEXT: retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %call = call <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8> %arg0)
+  %res = bitcast <8 x i32> %call to <4 x i64>
+  ret <4 x i64> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovzxbd(<16 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtepu8_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu8_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu8_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxbq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero,xmm0[2],zero,zero,zero,zero,zero,zero,zero,xmm0[3],zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT: retq
+  %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
+  %call = call <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8> %arg0)
+  ret <4 x i64> %call
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxbq(<16 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtepu16_epi32(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu16_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu16_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; X64-NEXT: retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %call = call <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16> %arg0)
+  %res = bitcast <8 x i32> %call to <4 x i64>
+  ret <4 x i64> %res
+}
+declare <8 x i32> @llvm.x86.avx2.pmovzxwd(<8 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtepu16_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu16_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu16_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; X64-NEXT: retq
+  %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
+  %call = call <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16> %arg0)
+  ret <4 x i64> %call
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxwq(<8 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_cvtepu32_epi64(<2 x i64> %a0) {
+; X32-LABEL: test_mm256_cvtepu32_epi64:
+; X32: # BB#0:
+; X32-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_cvtepu32_epi64:
+; X64: # BB#0:
+; X64-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; X64-NEXT: retq
+  %arg0 = bitcast <2 x i64> %a0 to <4 x i32>
+  %res = call <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32> %arg0)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx2.pmovzxdq(<4 x i32>) nounwind readnone
+
+define <2 x i64> @test_mm256_extracti128_si256(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_extracti128_si256:
+; X32: # BB#0:
+; X32-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_extracti128_si256:
+; X64: # BB#0:
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+  %res = shufflevector <4 x i64> %a0, <4 x i64> %a0, <2 x i32> <i32 2, i32 3>
+  ret <2 x i64> %res
+}
+
 ; TODO test_mm256_hadd_epi16 (__m256i a, __m256i b)
 ; TODO test_mm256_hadd_epi32 (__m256i a, __m256i b)
 ; TODO test_mm256_hadds_epi16 (__m256i a, __m256i b)
@@ -381,27 +682,281 @@ define <4 x i64> @test_mm256_cmpgt_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind
 ; TODO test_mm256_maskstore_epi32 (int* mem_addr, __m256i mask, __m256i a)
 ; TODO test_mm_maskstore_epi64 (__int64* mem_addr, __m128i mask, __m128i a)
 ; TODO test_mm256_maskstore_epi64 (__int64* mem_addr, __m256i mask, __m256i a)
-; TODO test_mm256_max_epi16 (__m256i a, __m256i b)
-; TODO test_mm256_max_epi32 (__m256i a, __m256i b)
-; TODO test_mm256_max_epi8 (__m256i a, __m256i b)
-; TODO test_mm256_max_epu16 (__m256i a, __m256i b)
-; TODO test_mm256_max_epu32 (__m256i a, __m256i b)
-; TODO test_mm256_max_epu8 (__m256i a, __m256i b)
-; TODO test_mm256_min_epi16 (__m256i a, __m256i b)
-; TODO test_mm256_min_epi32 (__m256i a, __m256i b)
-; TODO test_mm256_min_epi8 (__m256i a, __m256i b)
-; TODO test_mm256_min_epu16 (__m256i a, __m256i b)
-; TODO test_mm256_min_epu32 (__m256i a, __m256i b)
-; TODO test_mm256_min_epu8 (__m256i a, __m256i b)
-; TODO test_mm256_movemask_epi8 (__m256i a)
+
+define <4 x i64> @test_mm256_max_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpmaxsb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+  %res = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %bc = bitcast <32 x i8> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_max_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmaxsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+  %res = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %bc = bitcast <16 x i16> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_max_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmaxsd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+  %res = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %arg0, <8 x i32> %arg1)
+  %bc = bitcast <8 x i32> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_max_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpmaxub %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+  %res = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %bc = bitcast <32 x i8> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_max_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpmaxuw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+  %res = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %bc = bitcast <16 x i16> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_max_epu32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_max_epu32:
+; X32: # BB#0:
+; X32-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_max_epu32:
+; X64: # BB#0:
+; X64-NEXT: vpmaxud %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+  %res = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %arg0, <8 x i32> %arg1)
+  %bc = bitcast <8 x i32> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_min_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpminsb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+  %res = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %bc = bitcast <32 x i8> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_min_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpminsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+  %res = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %bc = bitcast <16 x i16> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_min_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpminsd %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+  %res = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %arg0, <8 x i32> %arg1)
+  %bc = bitcast <8 x i32> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <4 x i64> @test_mm256_min_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpminub %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+  %res = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %bc = bitcast <32 x i8> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_min_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpminuw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+  %res = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %bc = bitcast <16 x i16> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_min_epu32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_min_epu32:
+; X32: # BB#0:
+; X32-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_min_epu32:
+; X64: # BB#0:
+; X64-NEXT: vpminud %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+  %res = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %arg0, <8 x i32> %arg1)
+  %bc = bitcast <8 x i32> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define i32 @test_mm256_movemask_epi8(<4 x i64> %a0) nounwind {
+; X32-LABEL: test_mm256_movemask_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpmovmskb %ymm0, %eax
+; X32-NEXT: vzeroupper
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_movemask_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpmovmskb %ymm0, %eax
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+  %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %arg0)
+  ret i32 %res
+}
+declare i32 @llvm.x86.avx2.pmovmskb(<32 x i8>) nounwind readnone
+
 ; TODO test_mm256_mpsadbw_epu8 (__m256i a, __m256i b, const int imm8)
 ; TODO test_mm256_mul_epi32 (__m256i a, __m256i b)
 ; TODO test_mm256_mul_epu32 (__m256i a, __m256i b)
 ; TODO test_mm256_mulhi_epi16 (__m256i a, __m256i b)
 ; TODO test_mm256_mulhi_epu16 (__m256i a, __m256i b)
 ; TODO test_mm256_mulhrs_epi16 (__m256i a, __m256i b)
-; TODO test_mm256_mullo_epi16 (__m256i a, __m256i b)
-; TODO test_mm256_mullo_epi32 (__m256i a, __m256i b)
+
+define <4 x i64> @test_mm256_mullo_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mullo_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mullo_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpmullw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+  %res = mul <16 x i16> %arg0, %arg1
+  %bc = bitcast <16 x i16> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+
+define <4 x i64> @test_mm256_mullo_epi32(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_mullo_epi32:
+; X32: # BB#0:
+; X32-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mullo_epi32:
+; X64: # BB#0:
+; X64-NEXT: vpmulld %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <8 x i32>
+  %arg1 = bitcast <4 x i64> %a1 to <8 x i32>
+  %res = mul <8 x i32> %arg0, %arg1
+  %bc = bitcast <8 x i32> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
 
 define <4 x i64> @test_mm256_or_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm256_or_si256:
@@ -529,10 +1084,78 @@ define <4 x i64> @test_mm256_sub_epi64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
   ret <4 x i64> %res
 }
 
-; TODO test_mm256_subs_epi16 (__m256i a, __m256i b)
-; TODO test_mm256_subs_epi8 (__m256i a, __m256i b)
-; TODO test_mm256_subs_epu16 (__m256i a, __m256i b)
-; TODO test_mm256_subs_epu8 (__m256i a, __m256i b)
+define <4 x i64> @test_mm256_subs_epi8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_subs_epi8:
+; X32: # BB#0:
+; X32-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_subs_epi8:
+; X64: # BB#0:
+; X64-NEXT: vpsubsb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %bc = bitcast <32 x i8> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_subs_epi16:
+; X32: # BB#0:
+; X32-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_subs_epi16:
+; X64: # BB#0:
+; X64-NEXT: vpsubsw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %bc = bitcast <16 x i16> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_subs_epu8:
+; X32: # BB#0:
+; X32-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_subs_epu8:
+; X64: # BB#0:
+; X64-NEXT: vpsubusb %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
+  %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
+  %res = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %bc = bitcast <32 x i8> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i64> @test_mm256_subs_epu16(<4 x i64> %a0, <4 x i64> %a1) {
+; X32-LABEL: test_mm256_subs_epu16:
+; X32: # BB#0:
+; X32-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_subs_epu16:
+; X64: # BB#0:
+; X64-NEXT: vpsubusw %ymm1, %ymm0, %ymm0
+; X64-NEXT: retq
+  %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
+  %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
+  %res = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %bc = bitcast <16 x i16> %res to <4 x i64>
+  ret <4 x i64> %bc
+}
+declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
+
 ; TODO test_mm256_unpackhi_epi16 (__m256i a, __m256i b)
 ; TODO test_mm256_unpackhi_epi32 (__m256i a, __m256i b)
 ; TODO test_mm256_unpackhi_epi64 (__m256i a, __m256i b)