diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
index 1d9ee0fa95cb..7b73a55c8895 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -5666,10 +5666,276 @@ entry:
 ret <2 x double> %0
 }
 
+define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm512_mask_expandloadu_epi64:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_expandloadu_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to i64*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W)
+ ret <8 x i64> %2
+}
+
+define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm512_maskz_expandloadu_epi64:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_expandloadu_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to i64*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer)
+ ret <8 x i64> %2
+}
+
+define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm512_mask_expandloadu_pd:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_expandloadu_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to double*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W)
+ ret <8 x double> %2
+}
+
+define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm512_maskz_expandloadu_pd:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} {z}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_expandloadu_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to double*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> zeroinitializer)
+ ret <8 x double> %2
+}
+
+define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm512_mask_expandloadu_epi32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_expandloadu_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__W to <16 x i32>
+ %1 = bitcast i8* %__P to i32*
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11
+ %4 = bitcast <16 x i32> %3 to <8 x i64>
+ ret <8 x i64> %4
+}
+
+define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} {z}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_expandloadu_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to i32*
+ %1 = bitcast i16 %__U to <16 x i1>
+ %2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer)
+ %3 = bitcast <16 x i32> %2 to <8 x i64>
+ ret <8 x i64> %3
+}
+
+define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm512_mask_expandloadu_ps:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: vexpandps (%eax), %zmm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_expandloadu_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to float*
+ %1 = bitcast i16 %__U to <16 x i1>
+ %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11
+ ret <16 x float> %2
+}
+
+define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm512_maskz_expandloadu_ps:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} {z}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm512_maskz_expandloadu_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to float*
+ %1 = bitcast i16 %__U to <16 x i1>
+ %2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer)
+ ret <16 x float> %2
+}
+
+define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) {
+; X86-LABEL: test_mm512_mask_compressstoreu_pd:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: vcompresspd %zmm0, (%ecx) {%k1}
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_compressstoreu_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %esi, %k1
+; X64-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to double*
+ %1 = bitcast i8 %__U to <8 x i1>
+ tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1)
+ ret void
+}
+
+define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) {
+; X86-LABEL: test_mm512_mask_compressstoreu_epi64:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: vpcompressq %zmm0, (%ecx) {%k1}
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_compressstoreu_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %esi, %k1
+; X64-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to i64*
+ %1 = bitcast i8 %__U to <8 x i1>
+ tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1)
+ ret void
+}
+
+define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) {
+; X86-LABEL: test_mm512_mask_compressstoreu_ps:
+; X86: # %bb.0: # %entry
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vcompressps %zmm0, (%eax) {%k1}
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_compressstoreu_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %esi, %k1
+; X64-NEXT: vcompressps %zmm0, (%rdi) {%k1}
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to float*
+ %1 = bitcast i16 %__U to <16 x i1>
+ tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1)
+ ret void
+}
+
+define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) {
+; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: vpcompressd %zmm0, (%eax) {%k1}
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm512_mask_compressstoreu_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %esi, %k1
+; X64-NEXT: vpcompressd %zmm0, (%rdi) {%k1}
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %0 = bitcast <8 x i64> %__A to <16 x i32>
+ %1 = bitcast i8* %__P to i32*
+ %2 = bitcast i16 %__U to <16 x i1>
+ tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2)
+ ret void
+}
+
 declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
 declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
 declare float @llvm.fma.f32(float, float, float) #9
 declare double @llvm.fma.f64(double, double, double) #9
+declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)
+declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
+declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10
+declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
+declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)
+declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>)
+declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>)
+declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>)
 
 !0 = !{i32 1}
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-fast-isel.ll
index 031afa4470d7..f0f9af5aa130 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2-intrinsics-fast-isel.ll
@@ -103,7 +103,9 @@ define void @test_mm512_mask_compressstoreu_epi16(i8* %__P, i32 %__U, <8 x i64>
 ; X64-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__D to <32 x i16>
- tail call void @llvm.x86.avx512.mask.compress.store.w.512(i8* %__P, <32 x i16> %0, i32 %__U)
+ %1 = bitcast i8* %__P to i16*
+ %2 = bitcast i32 %__U to <32 x i1>
+ tail call void @llvm.masked.compressstore.v32i16(<32 x i16> %0, i16* %1, <32 x i1> %2)
 ret void
 }
 
@@ -126,7 +128,8 @@ define void @test_mm512_mask_compressstoreu_epi8(i8* %__P, i64 %__U, <8 x i64> %
 ; X64-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__D to <64 x i8>
- tail call void @llvm.x86.avx512.mask.compress.store.b.512(i8* %__P, <64 x i8> %0, i64 %__U)
+ %1 = bitcast i64 %__U to <64 x i1>
+ tail call void @llvm.masked.compressstore.v64i8(<64 x i8> %0, i8* %__P, <64 x i1> %1)
 ret void
 }
 
@@ -227,9 +230,11 @@ define <8 x i64> @test_mm512_mask_expandloadu_epi16(<8 x i64> %__S, i32 %__U, i8
 ; X64-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__S to <32 x i16>
- %1 = tail call <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8* %__P, <32 x i16> %0, i32 %__U)
- %2 = bitcast <32 x i16> %1 to <8 x i64>
- ret <8 x i64> %2
+ %1 = bitcast i8* %__P to i16*
+ %2 = bitcast i32 %__U to <32 x i1>
+ %3 = tail call <32 x i16> @llvm.masked.expandload.v32i16(i16* %1, <32 x i1> %2, <32 x i16> %0)
+ %4 = bitcast <32 x i16> %3 to <8 x i64>
+ ret <8 x i64> %4
 }
 
 define <8 x i64> @test_mm512_maskz_expandloadu_epi16(i32 %__U, i8* readonly %__P) {
@@ -246,9 +251,11 @@ define <8 x i64> @test_mm512_maskz_expandloadu_epi16(i32 %__U, i8* readonly %__P
 ; X64-NEXT: vpexpandw (%rsi), %zmm0 {%k1} {z}
 ; X64-NEXT: retq
 entry:
- %0 = tail call <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8* %__P, <32 x i16> zeroinitializer, i32 %__U)
- %1 = bitcast <32 x i16> %0 to <8 x i64>
- ret <8 x i64> %1
+ %0 = bitcast i8* %__P to i16*
+ %1 = bitcast i32 %__U to <32 x i1>
+ %2 = tail call <32 x i16> @llvm.masked.expandload.v32i16(i16* %0, <32 x i1> %1, <32 x i16> zeroinitializer)
+ %3 = bitcast <32 x i16> %2 to <8 x i64>
+ ret <8 x i64> %3
 }
 
 define <8 x i64> @test_mm512_mask_expandloadu_epi8(<8 x i64> %__S, i64 %__U, i8* readonly %__P) {
@@ -268,9 +275,10 @@ define <8 x i64> @test_mm512_mask_expandloadu_epi8(<8 x i64> %__S, i64 %__U, i8*
 ; X64-NEXT: retq
 entry:
 %0 = bitcast <8 x i64> %__S to <64 x i8>
- %1 = tail call <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8* %__P, <64 x i8> %0, i64 %__U)
- %2 = bitcast <64 x i8> %1 to <8 x i64>
- ret <8 x i64> %2
+ %1 = bitcast i64 %__U to <64 x i1>
+ %2 = tail call <64 x i8> @llvm.masked.expandload.v64i8(i8* %__P, <64 x i1> %1, <64 x i8> %0)
+ %3 = bitcast <64 x i8> %2 to <8 x i64>
+ ret <8 x i64> %3
 }
 
 define <8 x i64> @test_mm512_maskz_expandloadu_epi8(i64 %__U, i8* readonly %__P) {
@@ -289,9 +297,10 @@ define <8 x i64> @test_mm512_maskz_expandloadu_epi8(i64 %__U, i8* readonly %__P)
 ; X64-NEXT: vpexpandb (%rsi), %zmm0 {%k1} {z}
 ; X64-NEXT: retq
 entry:
- %0 = tail call <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8* %__P, <64 x i8> zeroinitializer, i64 %__U)
- %1 = bitcast <64 x i8> %0 to <8 x i64>
- ret <8 x i64> %1
+ %0 = bitcast i64 %__U to <64 x i1>
+ %1 = tail call <64 x i8> @llvm.masked.expandload.v64i8(i8* %__P, <64 x i1> %0, <64 x i8> zeroinitializer)
+ %2 = bitcast <64 x i8> %1 to <8 x i64>
+ ret <8 x i64> %2
 }
 
 define <8 x i64> @test_mm512_mask_shldi_epi64(<8 x i64> %__S, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
@@ -932,12 +941,12 @@ entry:
 
 declare <32 x i16> @llvm.x86.avx512.mask.compress.w.512(<32 x i16>, <32 x i16>, i32)
 declare <64 x i8> @llvm.x86.avx512.mask.compress.b.512(<64 x i8>, <64 x i8>, i64)
-declare void @llvm.x86.avx512.mask.compress.store.w.512(i8*, <32 x i16>, i32)
-declare void @llvm.x86.avx512.mask.compress.store.b.512(i8*, <64 x i8>, i64)
+declare void @llvm.masked.compressstore.v32i16(<32 x i16>, i16*, <32 x i1>)
+declare void @llvm.masked.compressstore.v64i8(<64 x i8>, i8*, <64 x i1>)
 declare <32 x i16> @llvm.x86.avx512.mask.expand.w.512(<32 x i16>, <32 x i16>, i32)
 declare <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8>, <64 x i8>, i64)
-declare <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8*, <32 x i16>, i32)
-declare <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8*, <64 x i8>, i64)
+declare <32 x i16> @llvm.masked.expandload.v32i16(i16*, <32 x i1>, <32 x i16>)
+declare <64 x i8> @llvm.masked.expandload.v64i8(i8*, <64 x i1>, <64 x i8>)
 declare <8 x i64> @llvm.x86.avx512.mask.vpshldv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 declare <8 x i64> @llvm.x86.avx512.maskz.vpshldv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 declare <16 x i32> @llvm.x86.avx512.mask.vpshldv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
diff --git a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-fast-isel.ll
index 4036d656c685..963609542213 100644
--- a/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512vbmi2vl-intrinsics-fast-isel.ll
@@ -100,7 +100,9 @@ define void @test_mm_mask_compressstoreu_epi16(i8* %__P, i8 zeroext %__U, <2 x i
 ; X64-NEXT: retq
 entry:
 %0 = bitcast <2 x i64> %__D to <8 x i16>
- tail call void @llvm.x86.avx512.mask.compress.store.w.128(i8* %__P, <8 x i16> %0, i8 %__U)
+ %1 = bitcast i8* %__P to i16*
+ %2 = bitcast i8 %__U to <8 x i1>
+ tail call void @llvm.masked.compressstore.v8i16(<8 x i16> %0, i16* %1, <8 x i1> %2)
 ret void
 }
 
@@ -119,7 +121,8 @@ define void @test_mm_mask_compressstoreu_epi8(i8* %__P, i16 zeroext %__U, <2 x i
 ; X64-NEXT: retq
 entry:
 %0 = bitcast <2 x i64> %__D to <16 x i8>
- tail call void @llvm.x86.avx512.mask.compress.store.b.128(i8* %__P, <16 x i8> %0, i16 %__U)
+ %1 = bitcast i16 %__U to <16 x i1>
+ tail call void @llvm.masked.compressstore.v16i8(<16 x i8> %0, i8* %__P, <16 x i1> %1)
 ret void
 }
 
@@ -219,9 +222,11 @@ define <2 x i64> @test_mm_mask_expandloadu_epi16(<2 x i64> %__S, i8 zeroext %__U
 ; X64-NEXT: retq
 entry:
 %0 = bitcast <2 x i64> %__S to <8 x i16>
- %1 = tail call <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8* %__P, <8 x i16> %0, i8 %__U)
- %2 = bitcast <8 x i16> %1 to <2 x i64>
- ret <2 x i64> %2
+ %1 = bitcast i8* %__P to i16*
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = tail call <8 x i16> @llvm.masked.expandload.v8i16(i16* %1, <8 x i1> %2, <8 x i16> %0)
+ %4 = bitcast <8 x i16> %3 to <2 x i64>
+ ret <2 x i64> %4
 }
 
 define <2 x i64> @test_mm_maskz_expandloadu_epi16(i8 zeroext %__U, i8* readonly %__P) {
@@ -239,9 +244,11 @@ define <2 x i64> @test_mm_maskz_expandloadu_epi16(i8 zeroext %__U, i8* readonly
 ; X64-NEXT: vpexpandw (%rsi), %xmm0 {%k1} {z}
 ; X64-NEXT: retq
 entry:
- %0 = tail call <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8* %__P, <8 x i16> zeroinitializer, i8 %__U)
- %1 = bitcast <8 x i16> %0 to <2 x i64>
- ret <2 x i64> %1
+ %0 = bitcast i8* %__P to i16*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = tail call <8 x i16> @llvm.masked.expandload.v8i16(i16* %0, <8 x i1> %1, <8 x i16> zeroinitializer)
+ %3 = bitcast <8 x i16> %2 to <2 x i64>
+ ret <2 x i64> %3
 }
 
 define <2 x i64> @test_mm_mask_expandloadu_epi8(<2 x i64> %__S, i16 zeroext %__U, i8* readonly %__P) {
@@ -259,9 +266,10 @@ define <2 x i64> @test_mm_mask_expandloadu_epi8(<2 x i64> %__S, i16 zeroext %__U
 ; X64-NEXT: retq
 entry:
 %0 = bitcast <2 x i64> %__S to <16 x i8>
- %1 = tail call <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8* %__P, <16 x i8> %0, i16 %__U)
- %2 = bitcast <16 x i8> %1 to <2 x i64>
- ret <2 x i64> %2
+ %1 = bitcast i16 %__U to <16 x i1>
+ %2 = tail call <16 x i8> @llvm.masked.expandload.v16i8(i8* %__P, <16 x i1> %1, <16 x i8> %0)
+ %3 = bitcast <16 x i8> %2 to <2 x i64>
+ ret <2 x i64> %3
 }
 
 define <2 x i64> @test_mm_maskz_expandloadu_epi8(i16 zeroext %__U, i8* readonly %__P) {
@@ -278,9 +286,10 @@ define <2 x i64> @test_mm_maskz_expandloadu_epi8(i16 zeroext %__U, i8* readonly
 ; X64-NEXT: vpexpandb (%rsi), %xmm0 {%k1} {z}
 ; X64-NEXT: retq
 entry:
- %0 = tail call <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8* %__P, <16 x i8> zeroinitializer, i16 %__U)
- %1 = bitcast <16 x i8> %0 to <2 x i64>
- ret <2 x i64> %1
+ %0 = bitcast i16 %__U to <16 x i1>
+ %1 = tail call <16 x i8> @llvm.masked.expandload.v16i8(i8* %__P, <16 x i1> %0, <16 x i8> zeroinitializer)
+ %2 = bitcast <16 x i8> %1 to <2 x i64>
+ ret <2 x i64> %2
 }
 
 define <4 x i64> @test_mm256_mask_compress_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__D) {
@@ -378,7 +387,9 @@ define void @test_mm256_mask_compressstoreu_epi16(i8* %__P, i16 zeroext %__U, <4
 ; X64-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__D to <16 x i16>
- tail call void @llvm.x86.avx512.mask.compress.store.w.256(i8* %__P, <16 x i16> %0, i16 %__U)
+ %1 = bitcast i8* %__P to i16*
+ %2 = bitcast i16 %__U to <16 x i1>
+ tail call void @llvm.masked.compressstore.v16i16(<16 x i16> %0, i16* %1, <16 x i1> %2)
 ret void
 }
 
@@ -399,7 +410,8 @@ define void @test_mm256_mask_compressstoreu_epi8(i8* %__P, i32 %__U, <4 x i64> %
 ; X64-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__D to <32 x i8>
- tail call void @llvm.x86.avx512.mask.compress.store.b.256(i8* %__P, <32 x i8> %0, i32 %__U)
+ %1 = bitcast i32 %__U to <32 x i1>
+ tail call void @llvm.masked.compressstore.v32i8(<32 x i8> %0, i8* %__P, <32 x i1> %1)
 ret void
 }
 
@@ -496,9 +508,11 @@ define <4 x i64> @test_mm256_mask_expandloadu_epi16(<4 x i64> %__S, i16 zeroext
 ; X64-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__S to <16 x i16>
- %1 = tail call <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8* %__P, <16 x i16> %0, i16 %__U)
- %2 = bitcast <16 x i16> %1 to <4 x i64>
- ret <4 x i64> %2
+ %1 = bitcast i8* %__P to i16*
+ %2 = bitcast i16 %__U to <16 x i1>
+ %3 = tail call <16 x i16> @llvm.masked.expandload.v16i16(i16* %1, <16 x i1> %2, <16 x i16> %0)
+ %4 = bitcast <16 x i16> %3 to <4 x i64>
+ ret <4 x i64> %4
 }
 
 define <4 x i64> @test_mm256_maskz_expandloadu_epi16(i16 zeroext %__U, i8* readonly %__P) {
@@ -515,9 +529,11 @@ define <4 x i64> @test_mm256_maskz_expandloadu_epi16(i16 zeroext %__U, i8* reado
 ; X64-NEXT: vpexpandw (%rsi), %ymm0 {%k1} {z}
 ; X64-NEXT: retq
 entry:
- %0 = tail call <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8* %__P, <16 x i16> zeroinitializer, i16 %__U)
- %1 = bitcast <16 x i16> %0 to <4 x i64>
- ret <4 x i64> %1
+ %0 = bitcast i8* %__P to i16*
+ %1 = bitcast i16 %__U to <16 x i1>
+ %2 = tail call <16 x i16> @llvm.masked.expandload.v16i16(i16* %0, <16 x i1> %1, <16 x i16> zeroinitializer)
+ %3 = bitcast <16 x i16> %2 to <4 x i64>
+ ret <4 x i64> %3
 }
 
 define <4 x i64> @test_mm256_mask_expandloadu_epi8(<4 x i64> %__S, i32 %__U, i8* readonly %__P) {
@@ -535,9 +551,10 @@ define <4 x i64> @test_mm256_mask_expandloadu_epi8(<4 x i64> %__S, i32 %__U, i8*
 ; X64-NEXT: retq
 entry:
 %0 = bitcast <4 x i64> %__S to <32 x i8>
- %1 = tail call <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8* %__P, <32 x i8> %0, i32 %__U)
- %2 = bitcast <32 x i8> %1 to <4 x i64>
- ret <4 x i64> %2
+ %1 = bitcast i32 %__U to <32 x i1>
+ %2 = tail call <32 x i8> @llvm.masked.expandload.v32i8(i8* %__P, <32 x i1> %1, <32 x i8> %0)
+ %3 = bitcast <32 x i8> %2 to <4 x i64>
+ ret <4 x i64> %3
 }
 
 define <4 x i64> @test_mm256_maskz_expandloadu_epi8(i32 %__U, i8* readonly %__P) {
@@ -554,9 +571,10 @@ define <4 x i64> @test_mm256_maskz_expandloadu_epi8(i32 %__U, i8* readonly %__P)
 ; X64-NEXT: vpexpandb (%rsi), %ymm0 {%k1} {z}
 ; X64-NEXT: retq
 entry:
- %0 = tail call <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8* %__P, <32 x i8> zeroinitializer, i32 %__U)
- %1 = bitcast <32 x i8> %0 to <4 x i64>
- ret <4 x i64> %1
+ %0 = bitcast i32 %__U to <32 x i1>
+ %1 = tail call <32 x i8> @llvm.masked.expandload.v32i8(i8* %__P, <32 x i1> %0, <32 x i8> zeroinitializer)
+ %2 = bitcast <32 x i8> %1 to <4 x i64>
+ ret <4 x i64> %2
 }
 
 define <4 x i64> @test_mm256_mask_shldi_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
@@ -1857,20 +1875,20 @@ entry:
 
 declare <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16>, <8 x i16>, i8)
 declare <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8>, <16 x i8>, i16)
-declare void @llvm.x86.avx512.mask.compress.store.w.128(i8*, <8 x i16>, i8)
-declare void @llvm.x86.avx512.mask.compress.store.b.128(i8*, <16 x i8>, i16)
+declare void @llvm.masked.compressstore.v8i16(<8 x i16>, i16*, <8 x i1>)
+declare void @llvm.masked.compressstore.v16i8(<16 x i8>, i8*, <16 x i1>)
 declare <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16>, <8 x i16>, i8)
 declare <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8>, <16 x i8>, i16)
-declare <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8*, <8 x i16>, i8)
-declare <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8*, <16 x i8>, i16)
+declare <8 x i16> @llvm.masked.expandload.v8i16(i16*, <8 x i1>, <8 x i16>)
+declare <16 x i8> @llvm.masked.expandload.v16i8(i8*, <16 x i1>, <16 x i8>)
 declare <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16>, <16 x i16>, i16)
 declare <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8>, <32 x i8>, i32)
-declare void @llvm.x86.avx512.mask.compress.store.w.256(i8*, <16 x i16>, i16)
-declare void @llvm.x86.avx512.mask.compress.store.b.256(i8*, <32 x i8>, i32)
+declare void @llvm.masked.compressstore.v16i16(<16 x i16>, i16*, <16 x i1>)
+declare void @llvm.masked.compressstore.v32i8(<32 x i8>, i8*, <32 x i1>)
 declare <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16>, <16 x i16>, i16)
 declare <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8>, <32 x i8>, i32)
-declare <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8*, <16 x i16>, i16)
-declare <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8*, <32 x i8>, i32)
+declare <16 x i16> @llvm.masked.expandload.v16i16(i16*, <16 x i1>, <16 x i16>)
+declare <32 x i8> @llvm.masked.expandload.v32i8(i8*, <32 x i1>, <32 x i8>)
 declare <4 x i64> @llvm.x86.avx512.mask.vpshldv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
 declare <4 x i64> @llvm.x86.avx512.maskz.vpshldv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
 declare <2 x i64> @llvm.x86.avx512.mask.vpshldv.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
index 3a797b094d3a..b119dbcdc9c4 100644
--- a/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -5831,6 +5831,545 @@ entry:
 ret <8 x float> %2
 }
 
+define <2 x double> @test_mm_mask_expandloadu_pd(<2 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm_mask_expandloadu_pd:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_expandloadu_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to double*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> %__W)
+ ret <2 x double> %2
+}
+
+define <2 x double> @test_mm_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm_maskz_expandloadu_pd:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1} {z}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_expandloadu_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to double*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> zeroinitializer)
+ ret <2 x double> %2
+}
+
+define <4 x double> @test_mm256_mask_expandloadu_pd(<4 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm256_mask_expandloadu_pd:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_expandloadu_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to double*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> %__W)
+ ret <4 x double> %2
+}
+
+define <4 x double> @test_mm256_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm256_maskz_expandloadu_pd:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1} {z}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_expandloadu_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to double*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> zeroinitializer)
+ ret <4 x double> %2
+}
+
+define <2 x i64> @test_mm_mask_expandloadu_epi64(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm_mask_expandloadu_epi64:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_expandloadu_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to i64*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> %__W) #10
+ ret <2 x i64> %2
+}
+
+define <2 x i64> @test_mm_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm_maskz_expandloadu_epi64:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1} {z}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_expandloadu_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to i64*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ %2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> zeroinitializer)
+ ret <2 x i64> %2
+}
+
+define <4 x i64> @test_mm256_mask_expandloadu_epi64(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm256_mask_expandloadu_epi64:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_expandloadu_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to i64*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> %__W) #10
+ ret <4 x i64> %2
+}
+
+define <4 x i64> @test_mm256_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm256_maskz_expandloadu_epi64:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1} {z}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_expandloadu_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to i64*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> zeroinitializer)
+ ret <4 x i64> %2
+}
+
+define <4 x float> @test_mm_mask_expandloadu_ps(<4 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm_mask_expandloadu_ps:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vexpandps (%eax), %xmm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_expandloadu_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to float*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> %__W)
+ ret <4 x float> %2
+}
+
+define <4 x float> @test_mm_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm_maskz_expandloadu_ps:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vexpandps (%eax), %xmm0 {%k1} {z}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_expandloadu_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to float*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> zeroinitializer)
+ ret <4 x float> %2
+}
+
+define <8 x float> @test_mm256_mask_expandloadu_ps(<8 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm256_mask_expandloadu_ps:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vexpandps (%eax), %ymm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_expandloadu_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to float*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> %__W)
+ ret <8 x float> %2
+}
+
+define <8 x float> @test_mm256_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm256_maskz_expandloadu_ps:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vexpandps (%eax), %ymm0 {%k1} {z}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_expandloadu_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to float*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> zeroinitializer)
+ ret <8 x float> %2
+}
+
+define <2 x i64> @test_mm_mask_expandloadu_epi32(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm_mask_expandloadu_epi32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_expandloadu_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__W to <4 x i32>
+ %1 = bitcast i8* %__P to i32*
+ %2 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %3 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %1, <4 x i1> %extract.i, <4 x i32> %0)
+ %4 = bitcast <4 x i32> %3 to <2 x i64>
+ ret <2 x i64> %4
+}
+
+define <2 x i64> @test_mm_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm_maskz_expandloadu_epi32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1} {z}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_expandloadu_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to i32*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ %2 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %0, <4 x i1> %extract.i, <4 x i32> zeroinitializer)
+ %3 = bitcast <4 x i32> %2 to <2 x i64>
+ ret <2 x i64> %3
+}
+
+define <4 x i64> @test_mm256_mask_expandloadu_epi32(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm256_mask_expandloadu_epi32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_expandloadu_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__W to <8 x i32>
+ %1 = bitcast i8* %__P to i32*
+ %2 = bitcast i8 %__U to <8 x i1>
+ %3 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %1, <8 x i1> %2, <8 x i32> %0)
+ %4 = bitcast <8 x i32> %3 to <4 x i64>
+ ret <4 x i64> %4
+}
+
+define <4 x i64> @test_mm256_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
+; X86-LABEL: test_mm256_maskz_expandloadu_epi32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
+; X86-NEXT: kmovw %ecx, %k1
+; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1} {z}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_expandloadu_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to i32*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %2 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %0, <8 x i1> %1, <8 x i32> zeroinitializer)
+ %3 = bitcast <8 x i32> %2 to <4 x i64>
+ ret <4 x i64> %3
+}
+
+define void @test_mm_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <2 x double> %__A) {
+; X86-LABEL: test_mm_mask_compressstoreu_pd:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: vcompresspd %xmm0, (%ecx) {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_compressstoreu_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %esi, %k1
+; X64-NEXT: vcompresspd %xmm0, (%rdi) {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to double*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ tail call void @llvm.masked.compressstore.v2f64(<2 x double> %__A, double* %0, <2 x i1> %extract.i)
+ ret void
+}
+
+define void @test_mm256_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <4 x double> %__A) {
+; X86-LABEL: test_mm256_mask_compressstoreu_pd:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: vcompresspd %ymm0, (%ecx) {%k1}
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_compressstoreu_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %esi, %k1
+; X64-NEXT: vcompresspd %ymm0, (%rdi) {%k1}
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to double*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ tail call void @llvm.masked.compressstore.v4f64(<4 x double> %__A, double* %0, <4 x i1> %extract.i)
+ ret void
+}
+
+define void @test_mm_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
+; X86-LABEL: test_mm_mask_compressstoreu_epi64:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: vpcompressq %xmm0, (%ecx) {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_compressstoreu_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %esi, %k1
+; X64-NEXT: vpcompressq %xmm0, (%rdi) {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to i64*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+ tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %__A, i64* %0, <2 x i1> %extract.i)
+ ret void
+}
+
+define void @test_mm256_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
+; X86-LABEL: test_mm256_mask_compressstoreu_epi64:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: vpcompressq %ymm0, (%ecx) {%k1}
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_compressstoreu_epi64:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %esi, %k1
+; X64-NEXT: vpcompressq %ymm0, (%rdi) {%k1}
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to i64*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %__A, i64* %0, <4 x i1> %extract.i)
+ ret void
+}
+
+define void @test_mm_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <4 x float> %__A) {
+; X86-LABEL: test_mm_mask_compressstoreu_ps:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: vcompressps %xmm0, (%ecx) {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_compressstoreu_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %esi, %k1
+; X64-NEXT: vcompressps %xmm0, (%rdi) {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to float*
+ %1 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ tail call void @llvm.masked.compressstore.v4f32(<4 x float> %__A, float* %0, <4 x i1> %extract.i)
+ ret void
+}
+
+define void @test_mm256_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <8 x float> %__A) {
+; X86-LABEL: test_mm256_mask_compressstoreu_ps:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: vcompressps %ymm0, (%ecx) {%k1}
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_compressstoreu_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %esi, %k1
+; X64-NEXT: vcompressps %ymm0, (%rdi) {%k1}
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %0 = bitcast i8* %__P to float*
+ %1 = bitcast i8 %__U to <8 x i1>
+ tail call void @llvm.masked.compressstore.v8f32(<8 x float> %__A, float* %0, <8 x i1> %1)
+ ret void
+}
+
+define void @test_mm_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
+; X86-LABEL: test_mm_mask_compressstoreu_epi32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: vpcompressd %xmm0, (%ecx) {%k1}
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_compressstoreu_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %esi, %k1
+; X64-NEXT: vpcompressd %xmm0, (%rdi) {%k1}
+; X64-NEXT: retq
+entry:
+ %0 = bitcast <2 x i64> %__A to <4 x i32>
+ %1 = bitcast i8* %__P to i32*
+ %2 = bitcast i8 %__U to <8 x i1>
+ %extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %0, i32* %1, <4 x i1> %extract.i)
+ ret void
+}
+
+define void @test_mm256_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
+; X86-LABEL: test_mm256_mask_compressstoreu_epi32:
+; X86: # %bb.0: # %entry
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: kmovw %eax, %k1
+; X86-NEXT: vpcompressd %ymm0, (%ecx) {%k1}
+; X86-NEXT: vzeroupper
+; X86-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_compressstoreu_epi32:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %esi, %k1
+; X64-NEXT: vpcompressd %ymm0, (%rdi) {%k1}
+; X64-NEXT: vzeroupper
+; X64-NEXT: retq
+entry:
+ %0 = bitcast <4 x i64> %__A to <8 x i32>
+ %1 = bitcast i8* %__P to i32*
+ %2 = bitcast i8 %__U to <8 x i1>
+ tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %0, i32* %1, <8 x i1> %2) #10
+ ret void
+}
+
+
 declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8
 declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8
 declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
@@ -5863,5 +6402,21 @@ declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <
 declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
 declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
 declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
+declare <2 x double> @llvm.masked.expandload.v2f64(double*, <2 x i1>, <2 x double>)
+declare <4 x double> @llvm.masked.expandload.v4f64(double*, <4 x i1>, <4 x double>)
+declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
+declare <4 x i64> @llvm.masked.expandload.v4i64(i64*, <4 x i1>, <4 x i64>)
+declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
+declare <8 x float> @llvm.masked.expandload.v8f32(float*, <8 x i1>, <8 x float>)
+declare <4 x i32> @llvm.masked.expandload.v4i32(i32*, <4 x i1>, <4 x i32>)
+declare <8 x i32> @llvm.masked.expandload.v8i32(i32*, <8 x i1>, <8 x i32>)
+declare void @llvm.masked.compressstore.v2f64(<2 x double>, double*, <2 x i1>)
+declare void @llvm.masked.compressstore.v4f64(<4 x double>, double*, <4 x i1>)
+declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>)
+declare void @llvm.masked.compressstore.v4i64(<4 x i64>, i64*, <4 x i1>)
+declare void @llvm.masked.compressstore.v4f32(<4 x float>, float*, <4 x i1>)
+declare void @llvm.masked.compressstore.v8f32(<8 x float>, float*, <8 x i1>)
+declare void @llvm.masked.compressstore.v4i32(<4 x i32>, i32*, <4 x i1>)
+declare void @llvm.masked.compressstore.v8i32(<8 x i32>, i32*, <8 x i1>)
 
 !0 = !{i32 1}