[X86] Add expandload and compresstore fast-isel tests for avx512f and avx512vl. Update existing tests for avx512vbmi2 to use target-independent intrinsics.

llvm-svn: 334368
Craig Topper 2018-06-10 18:55:37 +00:00
parent 15bee8f1c0
commit 304bd747af
4 changed files with 902 additions and 54 deletions
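
As an orientation aid only (not part of the commit; the function name below is made up), this is the target-independent expand-load pattern the new avx512f tests exercise: the i8* pointer is bitcast to the element type, the scalar mask is bitcast to a vector of i1, and the generic llvm.masked.expandload intrinsic is called. With AVX512F this selects vpexpandq, as the CHECK lines below show.

; Minimal sketch, assuming a hypothetical function name.
define <8 x i64> @sketch_mask_expandloadu_epi64(<8 x i64> %passthru, i8 %mask, i8* %ptr) {
entry:
  ; Reinterpret the byte pointer as a pointer to the element type.
  %p = bitcast i8* %ptr to i64*
  ; Turn the 8-bit mask into an <8 x i1> lane mask.
  %m = bitcast i8 %mask to <8 x i1>
  ; Target-independent masked expand load; masked-off lanes keep %passthru.
  %r = call <8 x i64> @llvm.masked.expandload.v8i64(i64* %p, <8 x i1> %m, <8 x i64> %passthru)
  ret <8 x i64> %r
}
declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)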

@@ -5666,10 +5666,276 @@ entry:
ret <2 x double> %0
}
define <8 x i64> @test_mm512_mask_expandloadu_epi64(<8 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_expandloadu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> %__W)
ret <8 x i64> %2
}
define <8 x i64> @test_mm512_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandq (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq (%rsi), %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x i64> @llvm.masked.expandload.v8i64(i64* %0, <8 x i1> %1, <8 x i64> zeroinitializer)
ret <8 x i64> %2
}
define <8 x double> @test_mm512_mask_expandloadu_pd(<8 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_expandloadu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> %__W)
ret <8 x double> %2
}
define <8 x double> @test_mm512_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandpd (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd (%rsi), %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x double> @llvm.masked.expandload.v8f64(double* %0, <8 x i1> %1, <8 x double> zeroinitializer)
ret <8 x double> %2
}
define <8 x i64> @test_mm512_mask_expandloadu_epi32(<8 x i64> %__W, i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_expandloadu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__W to <16 x i32>
%1 = bitcast i8* %__P to i32*
%2 = bitcast i16 %__U to <16 x i1>
%3 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %1, <16 x i1> %2, <16 x i32> %0) #11
%4 = bitcast <16 x i32> %3 to <8 x i64>
ret <8 x i64> %4
}
define <8 x i64> @test_mm512_maskz_expandloadu_epi32(i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vpexpandd (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd (%rsi), %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i32*
%1 = bitcast i16 %__U to <16 x i1>
%2 = tail call <16 x i32> @llvm.masked.expandload.v16i32(i32* %0, <16 x i1> %1, <16 x i32> zeroinitializer)
%3 = bitcast <16 x i32> %2 to <8 x i64>
ret <8 x i64> %3
}
define <16 x float> @test_mm512_mask_expandloadu_ps(<16 x float> %__W, i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_mask_expandloadu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vexpandps (%eax), %zmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_expandloadu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i16 %__U to <16 x i1>
%2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> %__W) #11
ret <16 x float> %2
}
define <16 x float> @test_mm512_maskz_expandloadu_ps(i16 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm512_maskz_expandloadu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: vexpandps (%eax), %zmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_maskz_expandloadu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps (%rsi), %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i16 %__U to <16 x i1>
%2 = tail call <16 x float> @llvm.masked.expandload.v16f32(float* %0, <16 x i1> %1, <16 x float> zeroinitializer)
ret <16 x float> %2
}
define void @test_mm512_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <8 x double> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompresspd %zmm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
tail call void @llvm.masked.compressstore.v8f64(<8 x double> %__A, double* %0, <8 x i1> %1)
ret void
}
define void @test_mm512_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressq %zmm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
tail call void @llvm.masked.compressstore.v8i64(<8 x i64> %__A, i64* %0, <8 x i1> %1)
ret void
}
define void @test_mm512_mask_compressstoreu_ps(i8* %__P, i16 zeroext %__U, <16 x float> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vcompressps %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vcompressps %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i16 %__U to <16 x i1>
tail call void @llvm.masked.compressstore.v16f32(<16 x float> %__A, float* %0, <16 x i1> %1)
ret void
}
define void @test_mm512_mask_compressstoreu_epi32(i8* %__P, i16 zeroext %__U, <8 x i64> %__A) {
; X86-LABEL: test_mm512_mask_compressstoreu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: vpcompressd %zmm0, (%eax) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm512_mask_compressstoreu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpcompressd %zmm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__A to <16 x i32>
%1 = bitcast i8* %__P to i32*
%2 = bitcast i16 %__U to <16 x i1>
tail call void @llvm.masked.compressstore.v16i32(<16 x i32> %0, i32* %1, <16 x i1> %2)
ret void
}
declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>) #9
declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>) #9
declare float @llvm.fma.f32(float, float, float) #9
declare double @llvm.fma.f64(double, double, double) #9
declare <8 x i64> @llvm.masked.expandload.v8i64(i64*, <8 x i1>, <8 x i64>)
declare <8 x double> @llvm.masked.expandload.v8f64(double*, <8 x i1>, <8 x double>)
declare <16 x i32> @llvm.masked.expandload.v16i32(i32*, <16 x i1>, <16 x i32>) #10
declare <16 x float> @llvm.masked.expandload.v16f32(float*, <16 x i1>, <16 x float>)
declare void @llvm.masked.compressstore.v8f64(<8 x double>, double*, <8 x i1>)
declare void @llvm.masked.compressstore.v8i64(<8 x i64>, i64*, <8 x i1>)
declare void @llvm.masked.compressstore.v16f32(<16 x float>, float*, <16 x i1>)
declare void @llvm.masked.compressstore.v16i32(<16 x i32>, i32*, <16 x i1>)
!0 = !{i32 1}
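
The avx512vbmi2 updates in the next file replace the x86-specific compress-store/expand-load intrinsics with the target-independent ones. An illustrative before/after for the 512-bit word compress store (the function name is hypothetical; the old call shown in the comment is the form being removed below):

define void @sketch_compressstoreu_epi16(i8* %ptr, i32 %mask, <32 x i16> %val) {
entry:
  ; Old form: call void @llvm.x86.avx512.mask.compress.store.w.512(i8* %ptr, <32 x i16> %val, i32 %mask)
  ; New form: typed pointer plus an <N x i1> mask, hence the two extra bitcasts.
  %p = bitcast i8* %ptr to i16*
  %m = bitcast i32 %mask to <32 x i1>
  call void @llvm.masked.compressstore.v32i16(<32 x i16> %val, i16* %p, <32 x i1> %m)
  ret void
}
declare void @llvm.masked.compressstore.v32i16(<32 x i16>, i16*, <32 x i1>)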

@@ -103,7 +103,9 @@ define void @test_mm512_mask_compressstoreu_epi16(i8* %__P, i32 %__U, <8 x i64>
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__D to <32 x i16>
tail call void @llvm.x86.avx512.mask.compress.store.w.512(i8* %__P, <32 x i16> %0, i32 %__U)
%1 = bitcast i8* %__P to i16*
%2 = bitcast i32 %__U to <32 x i1>
tail call void @llvm.masked.compressstore.v32i16(<32 x i16> %0, i16* %1, <32 x i1> %2)
ret void
}
@@ -126,7 +128,8 @@ define void @test_mm512_mask_compressstoreu_epi8(i8* %__P, i64 %__U, <8 x i64> %
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__D to <64 x i8>
tail call void @llvm.x86.avx512.mask.compress.store.b.512(i8* %__P, <64 x i8> %0, i64 %__U)
%1 = bitcast i64 %__U to <64 x i1>
tail call void @llvm.masked.compressstore.v64i8(<64 x i8> %0, i8* %__P, <64 x i1> %1)
ret void
}
@@ -227,9 +230,11 @@ define <8 x i64> @test_mm512_mask_expandloadu_epi16(<8 x i64> %__S, i32 %__U, i8
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__S to <32 x i16>
%1 = tail call <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8* %__P, <32 x i16> %0, i32 %__U)
%2 = bitcast <32 x i16> %1 to <8 x i64>
ret <8 x i64> %2
%1 = bitcast i8* %__P to i16*
%2 = bitcast i32 %__U to <32 x i1>
%3 = tail call <32 x i16> @llvm.masked.expandload.v32i16(i16* %1, <32 x i1> %2, <32 x i16> %0)
%4 = bitcast <32 x i16> %3 to <8 x i64>
ret <8 x i64> %4
}
define <8 x i64> @test_mm512_maskz_expandloadu_epi16(i32 %__U, i8* readonly %__P) {
@@ -246,9 +251,11 @@ define <8 x i64> @test_mm512_maskz_expandloadu_epi16(i32 %__U, i8* readonly %__P
; X64-NEXT: vpexpandw (%rsi), %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8* %__P, <32 x i16> zeroinitializer, i32 %__U)
%1 = bitcast <32 x i16> %0 to <8 x i64>
ret <8 x i64> %1
%0 = bitcast i8* %__P to i16*
%1 = bitcast i32 %__U to <32 x i1>
%2 = tail call <32 x i16> @llvm.masked.expandload.v32i16(i16* %0, <32 x i1> %1, <32 x i16> zeroinitializer)
%3 = bitcast <32 x i16> %2 to <8 x i64>
ret <8 x i64> %3
}
define <8 x i64> @test_mm512_mask_expandloadu_epi8(<8 x i64> %__S, i64 %__U, i8* readonly %__P) {
@@ -268,9 +275,10 @@ define <8 x i64> @test_mm512_mask_expandloadu_epi8(<8 x i64> %__S, i64 %__U, i8*
; X64-NEXT: retq
entry:
%0 = bitcast <8 x i64> %__S to <64 x i8>
%1 = tail call <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8* %__P, <64 x i8> %0, i64 %__U)
%2 = bitcast <64 x i8> %1 to <8 x i64>
ret <8 x i64> %2
%1 = bitcast i64 %__U to <64 x i1>
%2 = tail call <64 x i8> @llvm.masked.expandload.v64i8(i8* %__P, <64 x i1> %1, <64 x i8> %0)
%3 = bitcast <64 x i8> %2 to <8 x i64>
ret <8 x i64> %3
}
define <8 x i64> @test_mm512_maskz_expandloadu_epi8(i64 %__U, i8* readonly %__P) {
@@ -289,9 +297,10 @@ define <8 x i64> @test_mm512_maskz_expandloadu_epi8(i64 %__U, i8* readonly %__P)
; X64-NEXT: vpexpandb (%rsi), %zmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8* %__P, <64 x i8> zeroinitializer, i64 %__U)
%1 = bitcast <64 x i8> %0 to <8 x i64>
ret <8 x i64> %1
%0 = bitcast i64 %__U to <64 x i1>
%1 = tail call <64 x i8> @llvm.masked.expandload.v64i8(i8* %__P, <64 x i1> %0, <64 x i8> zeroinitializer)
%2 = bitcast <64 x i8> %1 to <8 x i64>
ret <8 x i64> %2
}
define <8 x i64> @test_mm512_mask_shldi_epi64(<8 x i64> %__S, i8 zeroext %__U, <8 x i64> %__A, <8 x i64> %__B) {
@@ -932,12 +941,12 @@ entry:
declare <32 x i16> @llvm.x86.avx512.mask.compress.w.512(<32 x i16>, <32 x i16>, i32)
declare <64 x i8> @llvm.x86.avx512.mask.compress.b.512(<64 x i8>, <64 x i8>, i64)
declare void @llvm.x86.avx512.mask.compress.store.w.512(i8*, <32 x i16>, i32)
declare void @llvm.x86.avx512.mask.compress.store.b.512(i8*, <64 x i8>, i64)
declare void @llvm.masked.compressstore.v32i16(<32 x i16>, i16*, <32 x i1>)
declare void @llvm.masked.compressstore.v64i8(<64 x i8>, i8*, <64 x i1>)
declare <32 x i16> @llvm.x86.avx512.mask.expand.w.512(<32 x i16>, <32 x i16>, i32)
declare <64 x i8> @llvm.x86.avx512.mask.expand.b.512(<64 x i8>, <64 x i8>, i64)
declare <32 x i16> @llvm.x86.avx512.mask.expand.load.w.512(i8*, <32 x i16>, i32)
declare <64 x i8> @llvm.x86.avx512.mask.expand.load.b.512(i8*, <64 x i8>, i64)
declare <32 x i16> @llvm.masked.expandload.v32i16(i16*, <32 x i1>, <32 x i16>)
declare <64 x i8> @llvm.masked.expandload.v64i8(i8*, <64 x i1>, <64 x i8>)
declare <8 x i64> @llvm.x86.avx512.mask.vpshldv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <8 x i64> @llvm.x86.avx512.maskz.vpshldv.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
declare <16 x i32> @llvm.x86.avx512.mask.vpshldv.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)

@@ -100,7 +100,9 @@ define void @test_mm_mask_compressstoreu_epi16(i8* %__P, i8 zeroext %__U, <2 x i
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__D to <8 x i16>
tail call void @llvm.x86.avx512.mask.compress.store.w.128(i8* %__P, <8 x i16> %0, i8 %__U)
%1 = bitcast i8* %__P to i16*
%2 = bitcast i8 %__U to <8 x i1>
tail call void @llvm.masked.compressstore.v8i16(<8 x i16> %0, i16* %1, <8 x i1> %2)
ret void
}
@@ -119,7 +121,8 @@ define void @test_mm_mask_compressstoreu_epi8(i8* %__P, i16 zeroext %__U, <2 x i
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__D to <16 x i8>
tail call void @llvm.x86.avx512.mask.compress.store.b.128(i8* %__P, <16 x i8> %0, i16 %__U)
%1 = bitcast i16 %__U to <16 x i1>
tail call void @llvm.masked.compressstore.v16i8(<16 x i8> %0, i8* %__P, <16 x i1> %1)
ret void
}
@@ -219,9 +222,11 @@ define <2 x i64> @test_mm_mask_expandloadu_epi16(<2 x i64> %__S, i8 zeroext %__U
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__S to <8 x i16>
%1 = tail call <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8* %__P, <8 x i16> %0, i8 %__U)
%2 = bitcast <8 x i16> %1 to <2 x i64>
ret <2 x i64> %2
%1 = bitcast i8* %__P to i16*
%2 = bitcast i8 %__U to <8 x i1>
%3 = tail call <8 x i16> @llvm.masked.expandload.v8i16(i16* %1, <8 x i1> %2, <8 x i16> %0)
%4 = bitcast <8 x i16> %3 to <2 x i64>
ret <2 x i64> %4
}
define <2 x i64> @test_mm_maskz_expandloadu_epi16(i8 zeroext %__U, i8* readonly %__P) {
@@ -239,9 +244,11 @@ define <2 x i64> @test_mm_maskz_expandloadu_epi16(i8 zeroext %__U, i8* readonly
; X64-NEXT: vpexpandw (%rsi), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8* %__P, <8 x i16> zeroinitializer, i8 %__U)
%1 = bitcast <8 x i16> %0 to <2 x i64>
ret <2 x i64> %1
%0 = bitcast i8* %__P to i16*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x i16> @llvm.masked.expandload.v8i16(i16* %0, <8 x i1> %1, <8 x i16> zeroinitializer)
%3 = bitcast <8 x i16> %2 to <2 x i64>
ret <2 x i64> %3
}
define <2 x i64> @test_mm_mask_expandloadu_epi8(<2 x i64> %__S, i16 zeroext %__U, i8* readonly %__P) {
@@ -259,9 +266,10 @@ define <2 x i64> @test_mm_mask_expandloadu_epi8(<2 x i64> %__S, i16 zeroext %__U
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__S to <16 x i8>
%1 = tail call <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8* %__P, <16 x i8> %0, i16 %__U)
%2 = bitcast <16 x i8> %1 to <2 x i64>
ret <2 x i64> %2
%1 = bitcast i16 %__U to <16 x i1>
%2 = tail call <16 x i8> @llvm.masked.expandload.v16i8(i8* %__P, <16 x i1> %1, <16 x i8> %0)
%3 = bitcast <16 x i8> %2 to <2 x i64>
ret <2 x i64> %3
}
define <2 x i64> @test_mm_maskz_expandloadu_epi8(i16 zeroext %__U, i8* readonly %__P) {
@@ -278,9 +286,10 @@ define <2 x i64> @test_mm_maskz_expandloadu_epi8(i16 zeroext %__U, i8* readonly
; X64-NEXT: vpexpandb (%rsi), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8* %__P, <16 x i8> zeroinitializer, i16 %__U)
%1 = bitcast <16 x i8> %0 to <2 x i64>
ret <2 x i64> %1
%0 = bitcast i16 %__U to <16 x i1>
%1 = tail call <16 x i8> @llvm.masked.expandload.v16i8(i8* %__P, <16 x i1> %0, <16 x i8> zeroinitializer)
%2 = bitcast <16 x i8> %1 to <2 x i64>
ret <2 x i64> %2
}
define <4 x i64> @test_mm256_mask_compress_epi16(<4 x i64> %__S, i16 zeroext %__U, <4 x i64> %__D) {
@@ -378,7 +387,9 @@ define void @test_mm256_mask_compressstoreu_epi16(i8* %__P, i16 zeroext %__U, <4
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__D to <16 x i16>
tail call void @llvm.x86.avx512.mask.compress.store.w.256(i8* %__P, <16 x i16> %0, i16 %__U)
%1 = bitcast i8* %__P to i16*
%2 = bitcast i16 %__U to <16 x i1>
tail call void @llvm.masked.compressstore.v16i16(<16 x i16> %0, i16* %1, <16 x i1> %2)
ret void
}
@@ -399,7 +410,8 @@ define void @test_mm256_mask_compressstoreu_epi8(i8* %__P, i32 %__U, <4 x i64> %
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__D to <32 x i8>
tail call void @llvm.x86.avx512.mask.compress.store.b.256(i8* %__P, <32 x i8> %0, i32 %__U)
%1 = bitcast i32 %__U to <32 x i1>
tail call void @llvm.masked.compressstore.v32i8(<32 x i8> %0, i8* %__P, <32 x i1> %1)
ret void
}
@@ -496,9 +508,11 @@ define <4 x i64> @test_mm256_mask_expandloadu_epi16(<4 x i64> %__S, i16 zeroext
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__S to <16 x i16>
%1 = tail call <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8* %__P, <16 x i16> %0, i16 %__U)
%2 = bitcast <16 x i16> %1 to <4 x i64>
ret <4 x i64> %2
%1 = bitcast i8* %__P to i16*
%2 = bitcast i16 %__U to <16 x i1>
%3 = tail call <16 x i16> @llvm.masked.expandload.v16i16(i16* %1, <16 x i1> %2, <16 x i16> %0)
%4 = bitcast <16 x i16> %3 to <4 x i64>
ret <4 x i64> %4
}
define <4 x i64> @test_mm256_maskz_expandloadu_epi16(i16 zeroext %__U, i8* readonly %__P) {
@@ -515,9 +529,11 @@ define <4 x i64> @test_mm256_maskz_expandloadu_epi16(i16 zeroext %__U, i8* reado
; X64-NEXT: vpexpandw (%rsi), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8* %__P, <16 x i16> zeroinitializer, i16 %__U)
%1 = bitcast <16 x i16> %0 to <4 x i64>
ret <4 x i64> %1
%0 = bitcast i8* %__P to i16*
%1 = bitcast i16 %__U to <16 x i1>
%2 = tail call <16 x i16> @llvm.masked.expandload.v16i16(i16* %0, <16 x i1> %1, <16 x i16> zeroinitializer)
%3 = bitcast <16 x i16> %2 to <4 x i64>
ret <4 x i64> %3
}
define <4 x i64> @test_mm256_mask_expandloadu_epi8(<4 x i64> %__S, i32 %__U, i8* readonly %__P) {
@@ -535,9 +551,10 @@ define <4 x i64> @test_mm256_mask_expandloadu_epi8(<4 x i64> %__S, i32 %__U, i8*
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__S to <32 x i8>
%1 = tail call <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8* %__P, <32 x i8> %0, i32 %__U)
%2 = bitcast <32 x i8> %1 to <4 x i64>
ret <4 x i64> %2
%1 = bitcast i32 %__U to <32 x i1>
%2 = tail call <32 x i8> @llvm.masked.expandload.v32i8(i8* %__P, <32 x i1> %1, <32 x i8> %0)
%3 = bitcast <32 x i8> %2 to <4 x i64>
ret <4 x i64> %3
}
define <4 x i64> @test_mm256_maskz_expandloadu_epi8(i32 %__U, i8* readonly %__P) {
@@ -554,9 +571,10 @@ define <4 x i64> @test_mm256_maskz_expandloadu_epi8(i32 %__U, i8* readonly %__P)
; X64-NEXT: vpexpandb (%rsi), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = tail call <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8* %__P, <32 x i8> zeroinitializer, i32 %__U)
%1 = bitcast <32 x i8> %0 to <4 x i64>
ret <4 x i64> %1
%0 = bitcast i32 %__U to <32 x i1>
%1 = tail call <32 x i8> @llvm.masked.expandload.v32i8(i8* %__P, <32 x i1> %0, <32 x i8> zeroinitializer)
%2 = bitcast <32 x i8> %1 to <4 x i64>
ret <4 x i64> %2
}
define <4 x i64> @test_mm256_mask_shldi_epi64(<4 x i64> %__S, i8 zeroext %__U, <4 x i64> %__A, <4 x i64> %__B) {
@@ -1857,20 +1875,20 @@ entry:
declare <8 x i16> @llvm.x86.avx512.mask.compress.w.128(<8 x i16>, <8 x i16>, i8)
declare <16 x i8> @llvm.x86.avx512.mask.compress.b.128(<16 x i8>, <16 x i8>, i16)
declare void @llvm.x86.avx512.mask.compress.store.w.128(i8*, <8 x i16>, i8)
declare void @llvm.x86.avx512.mask.compress.store.b.128(i8*, <16 x i8>, i16)
declare void @llvm.masked.compressstore.v8i16(<8 x i16>, i16*, <8 x i1>)
declare void @llvm.masked.compressstore.v16i8(<16 x i8>, i8*, <16 x i1>)
declare <8 x i16> @llvm.x86.avx512.mask.expand.w.128(<8 x i16>, <8 x i16>, i8)
declare <16 x i8> @llvm.x86.avx512.mask.expand.b.128(<16 x i8>, <16 x i8>, i16)
declare <8 x i16> @llvm.x86.avx512.mask.expand.load.w.128(i8*, <8 x i16>, i8)
declare <16 x i8> @llvm.x86.avx512.mask.expand.load.b.128(i8*, <16 x i8>, i16)
declare <8 x i16> @llvm.masked.expandload.v8i16(i16*, <8 x i1>, <8 x i16>)
declare <16 x i8> @llvm.masked.expandload.v16i8(i8*, <16 x i1>, <16 x i8>)
declare <16 x i16> @llvm.x86.avx512.mask.compress.w.256(<16 x i16>, <16 x i16>, i16)
declare <32 x i8> @llvm.x86.avx512.mask.compress.b.256(<32 x i8>, <32 x i8>, i32)
declare void @llvm.x86.avx512.mask.compress.store.w.256(i8*, <16 x i16>, i16)
declare void @llvm.x86.avx512.mask.compress.store.b.256(i8*, <32 x i8>, i32)
declare void @llvm.masked.compressstore.v16i16(<16 x i16>, i16*, <16 x i1>)
declare void @llvm.masked.compressstore.v32i8(<32 x i8>, i8*, <32 x i1>)
declare <16 x i16> @llvm.x86.avx512.mask.expand.w.256(<16 x i16>, <16 x i16>, i16)
declare <32 x i8> @llvm.x86.avx512.mask.expand.b.256(<32 x i8>, <32 x i8>, i32)
declare <16 x i16> @llvm.x86.avx512.mask.expand.load.w.256(i8*, <16 x i16>, i16)
declare <32 x i8> @llvm.x86.avx512.mask.expand.load.b.256(i8*, <32 x i8>, i32)
declare <16 x i16> @llvm.masked.expandload.v16i16(i16*, <16 x i1>, <16 x i16>)
declare <32 x i8> @llvm.masked.expandload.v32i8(i8*, <32 x i1>, <32 x i8>)
declare <4 x i64> @llvm.x86.avx512.mask.vpshldv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
declare <4 x i64> @llvm.x86.avx512.maskz.vpshldv.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
declare <2 x i64> @llvm.x86.avx512.mask.vpshldv.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
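
The avx512vl tests that follow use the same generic intrinsics, but a 128-bit or 256-bit operation only consumes 2 or 4 mask lanes, so the <8 x i1> mask built from the i8 argument is narrowed with a shufflevector first. A minimal sketch (hypothetical function name) for the 128-bit case:

define <2 x i64> @sketch_mask_expandloadu_epi64_128(<2 x i64> %passthru, i8 %mask, i8* %ptr) {
entry:
  %p = bitcast i8* %ptr to i64*
  %m = bitcast i8 %mask to <8 x i1>
  ; Keep only the low two mask bits for the 128-bit operation.
  %m2 = shufflevector <8 x i1> %m, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %r = call <2 x i64> @llvm.masked.expandload.v2i64(i64* %p, <2 x i1> %m2, <2 x i64> %passthru)
  ret <2 x i64> %r
}
declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)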

@@ -5831,6 +5831,545 @@ entry:
ret <8 x float> %2
}
define <2 x double> @test_mm_mask_expandloadu_pd(<2 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_mask_expandloadu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_expandloadu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> %__W)
ret <2 x double> %2
}
define <2 x double> @test_mm_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_maskz_expandloadu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandpd (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_expandloadu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd (%rsi), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = tail call <2 x double> @llvm.masked.expandload.v2f64(double* %0, <2 x i1> %extract.i, <2 x double> zeroinitializer)
ret <2 x double> %2
}
define <4 x double> @test_mm256_mask_expandloadu_pd(<4 x double> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_mask_expandloadu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_expandloadu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> %__W)
ret <4 x double> %2
}
define <4 x double> @test_mm256_maskz_expandloadu_pd(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_maskz_expandloadu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandpd (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_expandloadu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandpd (%rsi), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x double> @llvm.masked.expandload.v4f64(double* %0, <4 x i1> %extract.i, <4 x double> zeroinitializer)
ret <4 x double> %2
}
define <2 x i64> @test_mm_mask_expandloadu_epi64(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_mask_expandloadu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_expandloadu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> %__W) #10
ret <2 x i64> %2
}
define <2 x i64> @test_mm_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_maskz_expandloadu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandq (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_expandloadu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq (%rsi), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
%2 = tail call <2 x i64> @llvm.masked.expandload.v2i64(i64* %0, <2 x i1> %extract.i, <2 x i64> zeroinitializer)
ret <2 x i64> %2
}
define <4 x i64> @test_mm256_mask_expandloadu_epi64(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_mask_expandloadu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_expandloadu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> %__W) #10
ret <4 x i64> %2
}
define <4 x i64> @test_mm256_maskz_expandloadu_epi64(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_maskz_expandloadu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandq (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_expandloadu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandq (%rsi), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x i64> @llvm.masked.expandload.v4i64(i64* %0, <4 x i1> %extract.i, <4 x i64> zeroinitializer)
ret <4 x i64> %2
}
define <4 x float> @test_mm_mask_expandloadu_ps(<4 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_mask_expandloadu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandps (%eax), %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_expandloadu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> %__W)
ret <4 x float> %2
}
define <4 x float> @test_mm_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_maskz_expandloadu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandps (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_expandloadu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps (%rsi), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x float> @llvm.masked.expandload.v4f32(float* %0, <4 x i1> %extract.i, <4 x float> zeroinitializer)
ret <4 x float> %2
}
define <8 x float> @test_mm256_mask_expandloadu_ps(<8 x float> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_mask_expandloadu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandps (%eax), %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_expandloadu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> %__W)
ret <8 x float> %2
}
define <8 x float> @test_mm256_maskz_expandloadu_ps(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_maskz_expandloadu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vexpandps (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_expandloadu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vexpandps (%rsi), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x float> @llvm.masked.expandload.v8f32(float* %0, <8 x i1> %1, <8 x float> zeroinitializer)
ret <8 x float> %2
}
define <2 x i64> @test_mm_mask_expandloadu_epi32(<2 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_mask_expandloadu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_expandloadu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__W to <4 x i32>
%1 = bitcast i8* %__P to i32*
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%3 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %1, <4 x i1> %extract.i, <4 x i32> %0)
%4 = bitcast <4 x i32> %3 to <2 x i64>
ret <2 x i64> %4
}
define <2 x i64> @test_mm_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm_maskz_expandloadu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandd (%eax), %xmm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_maskz_expandloadu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd (%rsi), %xmm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i32*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%2 = tail call <4 x i32> @llvm.masked.expandload.v4i32(i32* %0, <4 x i1> %extract.i, <4 x i32> zeroinitializer)
%3 = bitcast <4 x i32> %2 to <2 x i64>
ret <2 x i64> %3
}
define <4 x i64> @test_mm256_mask_expandloadu_epi32(<4 x i64> %__W, i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_mask_expandloadu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_expandloadu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__W to <8 x i32>
%1 = bitcast i8* %__P to i32*
%2 = bitcast i8 %__U to <8 x i1>
%3 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %1, <8 x i1> %2, <8 x i32> %0)
%4 = bitcast <8 x i32> %3 to <4 x i64>
ret <4 x i64> %4
}
define <4 x i64> @test_mm256_maskz_expandloadu_epi32(i8 zeroext %__U, i8* readonly %__P) {
; X86-LABEL: test_mm256_maskz_expandloadu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movb {{[0-9]+}}(%esp), %cl
; X86-NEXT: kmovw %ecx, %k1
; X86-NEXT: vpexpandd (%eax), %ymm0 {%k1} {z}
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_maskz_expandloadu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %edi, %k1
; X64-NEXT: vpexpandd (%rsi), %ymm0 {%k1} {z}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i32*
%1 = bitcast i8 %__U to <8 x i1>
%2 = tail call <8 x i32> @llvm.masked.expandload.v8i32(i32* %0, <8 x i1> %1, <8 x i32> zeroinitializer)
%3 = bitcast <8 x i32> %2 to <4 x i64>
ret <4 x i64> %3
}
define void @test_mm_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <2 x double> %__A) {
; X86-LABEL: test_mm_mask_compressstoreu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompresspd %xmm0, (%ecx) {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_compressstoreu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vcompresspd %xmm0, (%rdi) {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
tail call void @llvm.masked.compressstore.v2f64(<2 x double> %__A, double* %0, <2 x i1> %extract.i)
ret void
}
define void @test_mm256_mask_compressstoreu_pd(i8* %__P, i8 zeroext %__U, <4 x double> %__A) {
; X86-LABEL: test_mm256_mask_compressstoreu_pd:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompresspd %ymm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_compressstoreu_pd:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vcompresspd %ymm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to double*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
tail call void @llvm.masked.compressstore.v4f64(<4 x double> %__A, double* %0, <4 x i1> %extract.i)
ret void
}
define void @test_mm_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_compressstoreu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressq %xmm0, (%ecx) {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_compressstoreu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpcompressq %xmm0, (%rdi) {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
tail call void @llvm.masked.compressstore.v2i64(<2 x i64> %__A, i64* %0, <2 x i1> %extract.i)
ret void
}
define void @test_mm256_mask_compressstoreu_epi64(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_compressstoreu_epi64:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressq %ymm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_compressstoreu_epi64:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpcompressq %ymm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to i64*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
tail call void @llvm.masked.compressstore.v4i64(<4 x i64> %__A, i64* %0, <4 x i1> %extract.i)
ret void
}
define void @test_mm_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <4 x float> %__A) {
; X86-LABEL: test_mm_mask_compressstoreu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompressps %xmm0, (%ecx) {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_compressstoreu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vcompressps %xmm0, (%rdi) {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
tail call void @llvm.masked.compressstore.v4f32(<4 x float> %__A, float* %0, <4 x i1> %extract.i)
ret void
}
define void @test_mm256_mask_compressstoreu_ps(i8* %__P, i8 zeroext %__U, <8 x float> %__A) {
; X86-LABEL: test_mm256_mask_compressstoreu_ps:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vcompressps %ymm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_compressstoreu_ps:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vcompressps %ymm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast i8* %__P to float*
%1 = bitcast i8 %__U to <8 x i1>
tail call void @llvm.masked.compressstore.v8f32(<8 x float> %__A, float* %0, <8 x i1> %1)
ret void
}
define void @test_mm_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <2 x i64> %__A) {
; X86-LABEL: test_mm_mask_compressstoreu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressd %xmm0, (%ecx) {%k1}
; X86-NEXT: retl
;
; X64-LABEL: test_mm_mask_compressstoreu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpcompressd %xmm0, (%rdi) {%k1}
; X64-NEXT: retq
entry:
%0 = bitcast <2 x i64> %__A to <4 x i32>
%1 = bitcast i8* %__P to i32*
%2 = bitcast i8 %__U to <8 x i1>
%extract.i = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
tail call void @llvm.masked.compressstore.v4i32(<4 x i32> %0, i32* %1, <4 x i1> %extract.i)
ret void
}
define void @test_mm256_mask_compressstoreu_epi32(i8* %__P, i8 zeroext %__U, <4 x i64> %__A) {
; X86-LABEL: test_mm256_mask_compressstoreu_epi32:
; X86: # %bb.0: # %entry
; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: kmovw %eax, %k1
; X86-NEXT: vpcompressd %ymm0, (%ecx) {%k1}
; X86-NEXT: vzeroupper
; X86-NEXT: retl
;
; X64-LABEL: test_mm256_mask_compressstoreu_epi32:
; X64: # %bb.0: # %entry
; X64-NEXT: kmovw %esi, %k1
; X64-NEXT: vpcompressd %ymm0, (%rdi) {%k1}
; X64-NEXT: vzeroupper
; X64-NEXT: retq
entry:
%0 = bitcast <4 x i64> %__A to <8 x i32>
%1 = bitcast i8* %__P to i32*
%2 = bitcast i8 %__U to <8 x i1>
tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %0, i32* %1, <8 x i1> %2) #10
ret void
}
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) #8
declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) #8
declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) #8
@@ -5863,5 +6402,21 @@ declare <4 x float> @llvm.x86.avx512.vpermi2var.ps.128(<4 x float>, <4 x i32>, <
declare <8 x float> @llvm.x86.avx512.vpermi2var.ps.256(<8 x float>, <8 x i32>, <8 x float>)
declare <2 x i64> @llvm.x86.avx512.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i64> @llvm.x86.avx512.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>)
declare <2 x double> @llvm.masked.expandload.v2f64(double*, <2 x i1>, <2 x double>)
declare <4 x double> @llvm.masked.expandload.v4f64(double*, <4 x i1>, <4 x double>)
declare <2 x i64> @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
declare <4 x i64> @llvm.masked.expandload.v4i64(i64*, <4 x i1>, <4 x i64>)
declare <4 x float> @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>)
declare <8 x float> @llvm.masked.expandload.v8f32(float*, <8 x i1>, <8 x float>)
declare <4 x i32> @llvm.masked.expandload.v4i32(i32*, <4 x i1>, <4 x i32>)
declare <8 x i32> @llvm.masked.expandload.v8i32(i32*, <8 x i1>, <8 x i32>)
declare void @llvm.masked.compressstore.v2f64(<2 x double>, double*, <2 x i1>)
declare void @llvm.masked.compressstore.v4f64(<4 x double>, double*, <4 x i1>)
declare void @llvm.masked.compressstore.v2i64(<2 x i64>, i64*, <2 x i1>)
declare void @llvm.masked.compressstore.v4i64(<4 x i64>, i64*, <4 x i1>)
declare void @llvm.masked.compressstore.v4f32(<4 x float>, float*, <4 x i1>)
declare void @llvm.masked.compressstore.v8f32(<8 x float>, float*, <8 x i1>)
declare void @llvm.masked.compressstore.v4i32(<4 x i32>, i32*, <4 x i1>)
declare void @llvm.masked.compressstore.v8i32(<8 x i32>, i32*, <8 x i1>)
!0 = !{i32 1}