[X86][AVX] Add AVX512DQ coverage for masked memory ops tests (PR34584)

llvm-svn: 359395
Author: Simon Pilgrim
Date:   2019-04-28 10:02:34 +00:00
Parent: 8651edf898
Commit: fed302ae37

4 changed files with 2444 additions and 156 deletions
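
For reference, every function in these tests has the same shape: materialize an <N x i1> mask, then feed it to one of the @llvm.masked.* memory intrinsics. A minimal sketch mirroring the v4i32 compressstore case from the diff (only the function name is invented here):

declare void @llvm.masked.compressstore.v4i32(<4 x i32>, i32*, <4 x i1>)

define void @sketch_compressstore(i32* %base, <4 x i32> %V, <4 x i32> %trigger) {
  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
  call void @llvm.masked.compressstore.v4i32(<4 x i32> %V, i32* %base, <4 x i1> %mask)
  ret void
}

The new AVX512VLDQ run lines matter because avx512dq adds vpmovd2m/vpmovq2m, which move vector sign bits directly into a mask register; the added check lines below capture that (e.g. 'vpmovd2m %xmm1, %k1' in compressstore_v4f32_v4i1), where the other prefixes need a longer mask-materialization sequence.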

@@ -4,7 +4,8 @@
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
;
; vXf64
@@ -266,6 +267,15 @@ define void @compressstore_v8f64_v8i1(double* %base, <8 x double> %V, <8 x i1> %
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: compressstore_v8f64_v8i1:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VLDQ-NEXT: vpslld $31, %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k1
; AVX512VLDQ-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: compressstore_v8f64_v8i1:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpsllw $15, %xmm1, %xmm1
@@ -789,6 +799,33 @@ define void @compressstore_v16f64_v16i1(double* %base, <16 x double> %V, <16 x i
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: compressstore_v16f64_v16i1:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512VLDQ-NEXT: vpslld $31, %zmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512VLDQ-NEXT: kmovb %k1, %eax
; AVX512VLDQ-NEXT: movl %eax, %ecx
; AVX512VLDQ-NEXT: shrl %ecx
; AVX512VLDQ-NEXT: andl $-43, %ecx
; AVX512VLDQ-NEXT: subl %ecx, %eax
; AVX512VLDQ-NEXT: movl %eax, %ecx
; AVX512VLDQ-NEXT: andl $858993459, %ecx ## imm = 0x33333333
; AVX512VLDQ-NEXT: shrl $2, %eax
; AVX512VLDQ-NEXT: andl $858993459, %eax ## imm = 0x33333333
; AVX512VLDQ-NEXT: addl %ecx, %eax
; AVX512VLDQ-NEXT: movl %eax, %ecx
; AVX512VLDQ-NEXT: shrl $4, %ecx
; AVX512VLDQ-NEXT: addl %eax, %ecx
; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
; AVX512VLDQ-NEXT: shrl $24, %eax
; AVX512VLDQ-NEXT: kshiftrw $8, %k1, %k2
; AVX512VLDQ-NEXT: vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
; AVX512VLDQ-NEXT: vcompresspd %zmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: compressstore_v16f64_v16i1:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpsllw $7, %xmm2, %xmm2
@@ -919,13 +956,13 @@ define void @compressstore_v2f32_v2i32(float* %base, <2 x float> %V, <2 x i32> %
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: compressstore_v2f32_v2i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512VLBW-NEXT: vptestnmq %xmm1, %xmm1, %k1
; AVX512VLBW-NEXT: vcompressps %xmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
; AVX512VL-LABEL: compressstore_v2f32_v2i32:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512VL-NEXT: vptestnmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vcompressps %xmm0, (%rdi) {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.compressstore.v2f32(<2 x float> %V, float* %base, <2 x i1> %mask)
ret void
@@ -1041,6 +1078,13 @@ define void @compressstore_v4f32_v4i1(float* %base, <4 x float> %V, <4 x i1> %ma
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: compressstore_v4f32_v4i1:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k1
; AVX512VLDQ-NEXT: vcompressps %xmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: compressstore_v4f32_v4i1:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1
@@ -1254,6 +1298,15 @@ define void @compressstore_v8f32_v8i1(float* %base, <8 x float> %V, <8 x i1> %ma
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: compressstore_v8f32_v8i1:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VLDQ-NEXT: vpslld $31, %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k1
; AVX512VLDQ-NEXT: vcompressps %ymm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: compressstore_v8f32_v8i1:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpsllw $15, %xmm1, %xmm1
@@ -1347,6 +1400,14 @@ define void @compressstore_v16f32_const(float* %base, <16 x float> %V) {
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: compressstore_v16f32_const:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movw $-2049, %ax ## imm = 0xF7FF
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vcompressps %zmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: compressstore_v16f32_const:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movw $-2049, %ax ## imm = 0xF7FF
@@ -2730,6 +2791,13 @@ define void @compressstore_v2i64_v2i1(i64* %base, <2 x i64> %V, <2 x i1> %mask)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: compressstore_v2i64_v2i1:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpsllq $63, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovq2m %xmm1, %k1
; AVX512VLDQ-NEXT: vpcompressq %xmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: compressstore_v2i64_v2i1:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpsllq $63, %xmm1, %xmm1
@@ -2884,6 +2952,14 @@ define void @compressstore_v4i64_v4i1(i64* %base, <4 x i64> %V, <4 x i1> %mask)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: compressstore_v4i64_v4i1:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k1
; AVX512VLDQ-NEXT: vpcompressq %ymm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: compressstore_v4i64_v4i1:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1
@@ -3155,6 +3231,15 @@ define void @compressstore_v8i64_v8i1(i64* %base, <8 x i64> %V, <8 x i1> %mask)
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: compressstore_v8i64_v8i1:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VLDQ-NEXT: vpslld $31, %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k1
; AVX512VLDQ-NEXT: vpcompressq %zmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: compressstore_v8i64_v8i1:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpsllw $15, %xmm1, %xmm1
@@ -3290,11 +3375,11 @@ define void @compressstore_v4i32_v4i32(i32* %base, <4 x i32> %V, <4 x i32> %trig
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: compressstore_v4i32_v4i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmd %xmm1, %xmm1, %k1
; AVX512VLBW-NEXT: vpcompressd %xmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
; AVX512VL-LABEL: compressstore_v4i32_v4i32:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vpcompressd %xmm0, (%rdi) {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
call void @llvm.masked.compressstore.v4i32(<4 x i32> %V, i32* %base, <4 x i1> %mask)
ret void
@@ -3597,6 +3682,89 @@ define void @compressstore_v8i16_v8i16(i16* %base, <8 x i16> %V, <8 x i16> %trig
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: compressstore_v8i16_v8i16:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2
; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_2
; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store
; AVX512VLDQ-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_2: ## %else
; AVX512VLDQ-NEXT: kshiftrb $1, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_4
; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1
; AVX512VLDQ-NEXT: vpextrw $1, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_4: ## %else2
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2
; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0
; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_6
; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store4
; AVX512VLDQ-NEXT: vpextrw $2, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_6: ## %else5
; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_8
; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store7
; AVX512VLDQ-NEXT: vpextrw $3, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_8: ## %else8
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2
; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0
; AVX512VLDQ-NEXT: kshiftrb $4, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_10
; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store10
; AVX512VLDQ-NEXT: vpextrw $4, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_10: ## %else11
; AVX512VLDQ-NEXT: kshiftrb $5, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_12
; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store13
; AVX512VLDQ-NEXT: vpextrw $5, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_12: ## %else14
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k0
; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_14
; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store16
; AVX512VLDQ-NEXT: vpextrw $6, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_14: ## %else17
; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_16
; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store19
; AVX512VLDQ-NEXT: vpextrw $7, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: LBB11_16: ## %else20
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: compressstore_v8i16_v8i16:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmw %xmm1, %xmm1, %k0
@@ -4249,6 +4417,169 @@ define void @compressstore_v16i8_v16i8(i8* %base, <16 x i8> %V, <16 x i8> %trigg
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: compressstore_v16i8_v16i8:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_2
; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store
; AVX512VLDQ-NEXT: vpextrb $0, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_2: ## %else
; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_4
; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1
; AVX512VLDQ-NEXT: vpextrb $1, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_4: ## %else2
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_6
; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store4
; AVX512VLDQ-NEXT: vpextrb $2, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_6: ## %else5
; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_8
; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store7
; AVX512VLDQ-NEXT: vpextrb $3, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_8: ## %else8
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_10
; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store10
; AVX512VLDQ-NEXT: vpextrb $4, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_10: ## %else11
; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_12
; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store13
; AVX512VLDQ-NEXT: vpextrb $5, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_12: ## %else14
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_14
; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store16
; AVX512VLDQ-NEXT: vpextrb $6, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_14: ## %else17
; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_16
; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store19
; AVX512VLDQ-NEXT: vpextrb $7, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_16: ## %else20
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_18
; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.store22
; AVX512VLDQ-NEXT: vpextrb $8, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_18: ## %else23
; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_20
; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.store25
; AVX512VLDQ-NEXT: vpextrb $9, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_20: ## %else26
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_22
; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.store28
; AVX512VLDQ-NEXT: vpextrb $10, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_22: ## %else29
; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_24
; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.store31
; AVX512VLDQ-NEXT: vpextrb $11, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_24: ## %else32
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_26
; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.store34
; AVX512VLDQ-NEXT: vpextrb $12, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_26: ## %else35
; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_28
; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.store37
; AVX512VLDQ-NEXT: vpextrb $13, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_28: ## %else38
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512VLDQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_30
; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.store40
; AVX512VLDQ-NEXT: vpextrb $14, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_30: ## %else41
; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_32
; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.store43
; AVX512VLDQ-NEXT: vpextrb $15, %xmm0, (%rdi)
; AVX512VLDQ-NEXT: LBB12_32: ## %else44
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: compressstore_v16i8_v16i8:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmb %xmm1, %xmm1, %k0

@@ -4,7 +4,8 @@
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
;
; vXf64
@@ -82,11 +83,11 @@ define <2 x double> @expandload_v2f64_v2i64(double* %base, <2 x double> %src0, <
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: expandload_v2f64_v2i64:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmq %xmm1, %xmm1, %k1
; AVX512VLBW-NEXT: vexpandpd (%rdi), %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
; AVX512VL-LABEL: expandload_v2f64_v2i64:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vexpandpd (%rdi), %xmm0 {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <2 x i64> %trigger, zeroinitializer
%res = call <2 x double> @llvm.masked.expandload.v2f64(double* %base, <2 x i1> %mask, <2 x double> %src0)
ret <2 x double>%res
@@ -263,11 +264,11 @@ define <4 x double> @expandload_v4f64_v4i64(double* %base, <4 x double> %src0, <
; AVX512F-NEXT: ## kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: expandload_v4f64_v4i64:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmq %ymm1, %ymm1, %k1
; AVX512VLBW-NEXT: vexpandpd (%rdi), %ymm0 {%k1}
; AVX512VLBW-NEXT: retq
; AVX512VL-LABEL: expandload_v4f64_v4i64:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmq %ymm1, %ymm1, %k1
; AVX512VL-NEXT: vexpandpd (%rdi), %ymm0 {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <4 x i64> %trigger, zeroinitializer
%res = call <4 x double> @llvm.masked.expandload.v4f64(double* %base, <4 x i1> %mask, <4 x double> %src0)
ret <4 x double>%res
@@ -471,6 +472,14 @@ define <8 x double> @expandload_v8f64_v8i1(double* %base, <8 x double> %src0, <8
; AVX512F-NEXT: vexpandpd (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: expandload_v8f64_v8i1:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VLDQ-NEXT: vpslld $31, %ymm1, %ymm1
; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k1
; AVX512VLDQ-NEXT: vexpandpd (%rdi), %zmm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: expandload_v8f64_v8i1:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpsllw $15, %xmm1, %xmm1
@@ -1180,6 +1189,31 @@ define <16 x double> @expandload_v16f64_v16i32(double* %base, <16 x double> %src
; AVX512F-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: expandload_v16f64_v16i32:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; AVX512VLDQ-NEXT: vptestnmd %ymm3, %ymm3, %k1
; AVX512VLDQ-NEXT: vptestnmd %ymm2, %ymm2, %k2
; AVX512VLDQ-NEXT: kmovb %k2, %eax
; AVX512VLDQ-NEXT: movl %eax, %ecx
; AVX512VLDQ-NEXT: shrl %ecx
; AVX512VLDQ-NEXT: andl $-43, %ecx
; AVX512VLDQ-NEXT: subl %ecx, %eax
; AVX512VLDQ-NEXT: movl %eax, %ecx
; AVX512VLDQ-NEXT: andl $858993459, %ecx ## imm = 0x33333333
; AVX512VLDQ-NEXT: shrl $2, %eax
; AVX512VLDQ-NEXT: andl $858993459, %eax ## imm = 0x33333333
; AVX512VLDQ-NEXT: addl %ecx, %eax
; AVX512VLDQ-NEXT: movl %eax, %ecx
; AVX512VLDQ-NEXT: shrl $4, %ecx
; AVX512VLDQ-NEXT: addl %eax, %ecx
; AVX512VLDQ-NEXT: andl $252645135, %ecx ## imm = 0xF0F0F0F
; AVX512VLDQ-NEXT: imull $16843009, %ecx, %eax ## imm = 0x1010101
; AVX512VLDQ-NEXT: shrl $24, %eax
; AVX512VLDQ-NEXT: vexpandpd (%rdi,%rax,8), %zmm1 {%k1}
; AVX512VLDQ-NEXT: vexpandpd (%rdi), %zmm0 {%k2}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: expandload_v16f64_v16i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vextracti64x4 $1, %zmm2, %ymm3
@@ -1317,13 +1351,13 @@ define <2 x float> @expandload_v2f32_v2i1(float* %base, <2 x float> %src0, <2 x
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: expandload_v2f32_v2i1:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512VLBW-NEXT: vptestnmq %xmm1, %xmm1, %k1
; AVX512VLBW-NEXT: vexpandps (%rdi), %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
; AVX512VL-LABEL: expandload_v2f32_v2i1:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3]
; AVX512VL-NEXT: vptestnmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vexpandps (%rdi), %xmm0 {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
%res = call <2 x float> @llvm.masked.expandload.v2f32(float* %base, <2 x i1> %mask, <2 x float> %src0)
ret <2 x float> %res
@@ -1367,6 +1401,13 @@ define <4 x float> @expandload_v4f32_const(float* %base, <4 x float> %src0) {
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: expandload_v4f32_const:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $7, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vexpandps (%rdi), %xmm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: expandload_v4f32_const:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $7, %al
@@ -1444,6 +1485,13 @@ define <16 x float> @expandload_v16f32_const(float* %base, <16 x float> %src0) {
; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: expandload_v16f32_const:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movw $30719, %ax ## imm = 0x77FF
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vexpandps (%rdi), %zmm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: expandload_v16f32_const:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movw $30719, %ax ## imm = 0x77FF
@@ -1489,6 +1537,13 @@ define <16 x float> @expandload_v16f32_const_undef(float* %base) {
; AVX512F-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: expandload_v16f32_const_undef:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movw $-2049, %ax ## imm = 0xF7FF
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vexpandps (%rdi), %zmm0 {%k1} {z}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: expandload_v16f32_const_undef:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movw $-2049, %ax ## imm = 0xF7FF
@@ -2954,6 +3009,13 @@ define <2 x i64> @expandload_v2i64_const(i64* %base, <2 x i64> %src0) {
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: expandload_v2i64_const:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: movb $2, %al
; AVX512VLDQ-NEXT: kmovw %eax, %k1
; AVX512VLDQ-NEXT: vpexpandq (%rdi), %xmm0 {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: expandload_v2i64_const:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: movb $2, %al
@@ -3094,11 +3156,11 @@ define <4 x i32> @expandload_v4i32_v4i32(i32* %base, <4 x i32> %src0, <4 x i32>
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: expandload_v4i32_v4i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmd %xmm1, %xmm1, %k1
; AVX512VLBW-NEXT: vpexpandd (%rdi), %xmm0 {%k1}
; AVX512VLBW-NEXT: retq
; AVX512VL-LABEL: expandload_v4i32_v4i32:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmd %xmm1, %xmm1, %k1
; AVX512VL-NEXT: vpexpandd (%rdi), %xmm0 {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
%res = call <4 x i32> @llvm.masked.expandload.v4i32(i32* %base, <4 x i1> %mask, <4 x i32> %src0)
ret <4 x i32>%res
@@ -3393,6 +3455,89 @@ define <8 x i16> @expandload_v8i16_v8i16(i16* %base, <8 x i16> %src0, <8 x i16>
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: expandload_v8i16_v8i16:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2
; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_2
; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.load
; AVX512VLDQ-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_2: ## %else
; AVX512VLDQ-NEXT: kshiftrb $1, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_4
; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.load1
; AVX512VLDQ-NEXT: vpinsrw $1, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_4: ## %else2
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2
; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0
; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_6
; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.load5
; AVX512VLDQ-NEXT: vpinsrw $2, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_6: ## %else6
; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_8
; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.load9
; AVX512VLDQ-NEXT: vpinsrw $3, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_8: ## %else10
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2
; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0
; AVX512VLDQ-NEXT: kshiftrb $4, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_10
; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.load13
; AVX512VLDQ-NEXT: vpinsrw $4, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_10: ## %else14
; AVX512VLDQ-NEXT: kshiftrb $5, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_12
; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.load17
; AVX512VLDQ-NEXT: vpinsrw $5, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_12: ## %else18
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovsxwd %xmm1, %ymm1
; AVX512VLDQ-NEXT: vpmovd2m %ymm1, %k0
; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_14
; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.load21
; AVX512VLDQ-NEXT: vpinsrw $6, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: addq $2, %rdi
; AVX512VLDQ-NEXT: LBB11_14: ## %else22
; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB11_16
; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.load25
; AVX512VLDQ-NEXT: vpinsrw $7, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: LBB11_16: ## %else26
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: expandload_v8i16_v8i16:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmw %xmm1, %xmm1, %k0
@@ -4120,6 +4265,169 @@ define <16 x i8> @expandload_v16i8_v16i8(i8* %base, <16 x i8> %src0, <16 x i8> %
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: expandload_v16i8_v16i8:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_2
; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.load
; AVX512VLDQ-NEXT: vpinsrb $0, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_2: ## %else
; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_4
; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.load1
; AVX512VLDQ-NEXT: vpinsrb $1, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_4: ## %else2
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_6
; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.load5
; AVX512VLDQ-NEXT: vpinsrb $2, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_6: ## %else6
; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_8
; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.load9
; AVX512VLDQ-NEXT: vpinsrb $3, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_8: ## %else10
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_10
; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.load13
; AVX512VLDQ-NEXT: vpinsrb $4, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_10: ## %else14
; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_12
; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.load17
; AVX512VLDQ-NEXT: vpinsrb $5, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_12: ## %else18
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_14
; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.load21
; AVX512VLDQ-NEXT: vpinsrb $6, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_14: ## %else22
; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_16
; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.load25
; AVX512VLDQ-NEXT: vpinsrb $7, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_16: ## %else26
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_18
; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.load29
; AVX512VLDQ-NEXT: vpinsrb $8, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_18: ## %else30
; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_20
; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.load33
; AVX512VLDQ-NEXT: vpinsrb $9, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_20: ## %else34
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_22
; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.load37
; AVX512VLDQ-NEXT: vpinsrb $10, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_22: ## %else38
; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_24
; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.load41
; AVX512VLDQ-NEXT: vpinsrb $11, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_24: ## %else42
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_26
; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.load45
; AVX512VLDQ-NEXT: vpinsrb $12, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_26: ## %else46
; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_28
; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.load49
; AVX512VLDQ-NEXT: vpinsrb $13, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_28: ## %else50
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovsxbd %xmm1, %zmm1
; AVX512VLDQ-NEXT: vpmovd2m %zmm1, %k0
; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_30
; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.load53
; AVX512VLDQ-NEXT: vpinsrb $14, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: incq %rdi
; AVX512VLDQ-NEXT: LBB12_30: ## %else54
; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB12_32
; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.load57
; AVX512VLDQ-NEXT: vpinsrb $15, (%rdi), %xmm0, %xmm0
; AVX512VLDQ-NEXT: LBB12_32: ## %else58
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: expandload_v16i8_v16i8:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmb %xmm1, %xmm1, %k0

File diff suppressed because it is too large.

@@ -4,7 +4,8 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512dq,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLDQ
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512f,avx512bw,avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512VL,AVX512VLBW
;
; vXf64
@@ -94,6 +95,12 @@ define void @store_v2f64_v2i64(<2 x i64> %trigger, <2 x double>* %addr, <2 x dou
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v2f64_v2i64:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovq2m %xmm0, %k1
; AVX512VLDQ-NEXT: vmovupd %xmm1, (%rdi) {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: store_v2f64_v2i64:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -201,6 +208,13 @@ define void @store_v4f64_v4i64(<4 x i64> %trigger, <4 x double>* %addr, <4 x dou
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v4f64_v4i64:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovq2m %ymm0, %k1
; AVX512VLDQ-NEXT: vmovupd %ymm1, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: store_v4f64_v4i64:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -289,21 +303,18 @@ define void @store_v2f32_v2i32(<2 x i32> %trigger, <2 x float>* %addr, <2 x floa
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: store_v2f32_v2i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1
; AVX512VLBW-NEXT: vmovups %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
; AVX512VL-LABEL: store_v2f32_v2i32:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmovups %xmm1, (%rdi) {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v2f32.p0v2f32(<2 x float> %val, <2 x float>* %addr, i32 4, <2 x i1> %mask)
ret void
}
; PR34584: The mask bit for each data element is the most significant bit of the mask operand, so a compare isn't needed.
; FIXME: The AVX512 code should be improved to use 'vpmovd2m'. Add tests for 512-bit vectors when implementing that.
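; Illustration (not from the test file): with avx512dq+avx512vl the sign bits
; move straight into a mask register, as the AVX512VLDQ checks below show:
;   vpmovd2m %xmm2, %k1
;   vmovups %xmm0, (%rdi) {%k1}
; while the non-DQ prefixes still build %k1 with a compare against zero
; (vpxor + vpcmpgtd, as in store_v16f32_v16i32 further down).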
define void @store_v4f32_v4i32(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y, <4 x i32> %mask) {
; SSE2-LABEL: store_v4f32_v4i32:
; SSE2: ## %bb.0:
@@ -391,6 +402,12 @@ define void @store_v4f32_v4i32(<4 x float> %x, <4 x float>* %ptr, <4 x float> %y
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v4f32_v4i32:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovd2m %xmm2, %k1
; AVX512VLDQ-NEXT: vmovups %xmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: store_v4f32_v4i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -555,6 +572,13 @@ define void @store_v8f32_v8i32(<8 x float> %x, <8 x float>* %ptr, <8 x float> %y
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v8f32_v8i32:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k1
; AVX512VLDQ-NEXT: vmovups %ymm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: store_v8f32_v8i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
@@ -835,13 +859,28 @@ define void @store_v16f32_v16i32(<16 x float> %x, <16 x float>* %ptr, <16 x floa
; AVX1OR2-NEXT: vzeroupper
; AVX1OR2-NEXT: retq
;
; AVX512-LABEL: store_v16f32_v16i32:
; AVX512: ## %bb.0:
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpgtd %zmm2, %zmm1, %k1
; AVX512-NEXT: vmovups %zmm0, (%rdi) {%k1}
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
; AVX512F-LABEL: store_v16f32_v16i32:
; AVX512F: ## %bb.0:
; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT: vpcmpgtd %zmm2, %zmm1, %k1
; AVX512F-NEXT: vmovups %zmm0, (%rdi) {%k1}
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v16f32_v16i32:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k1
; AVX512VLDQ-NEXT: vmovups %zmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: store_v16f32_v16i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLBW-NEXT: vpcmpgtd %zmm2, %zmm1, %k1
; AVX512VLBW-NEXT: vmovups %zmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
%bool_mask = icmp slt <16 x i32> %mask, zeroinitializer
call void @llvm.masked.store.v16f32.p0v16f32(<16 x float> %x, <16 x float>* %ptr, i32 1, <16 x i1> %bool_mask)
ret void
@@ -918,6 +957,12 @@ define void @store_v2i64_v2i64(<2 x i64> %trigger, <2 x i64>* %addr, <2 x i64> %
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v2i64_v2i64:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovq2m %xmm0, %k1
; AVX512VLDQ-NEXT: vmovdqu64 %xmm1, (%rdi) {%k1}
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: store_v2i64_v2i64:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -1033,6 +1078,13 @@ define void @store_v4i64_v4i64(<4 x i64> %trigger, <4 x i64>* %addr, <4 x i64> %
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v4i64_v4i64:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpmovq2m %ymm0, %k1
; AVX512VLDQ-NEXT: vmovdqu64 %ymm1, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: store_v4i64_v4i64:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
@@ -1146,13 +1198,13 @@ define void @store_v2i32_v2i32(<2 x i32> %trigger, <2 x i32>* %addr, <2 x i32> %
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: store_v2i32_v2i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512VLBW-NEXT: vptestnmq %xmm0, %xmm0, %k1
; AVX512VLBW-NEXT: vpmovqd %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
; AVX512VL-LABEL: store_v2i32_v2i32:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3]
; AVX512VL-NEXT: vptestnmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vpmovqd %xmm1, (%rdi) {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <2 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v2i32.p0v2i32(<2 x i32> %val, <2 x i32>* %addr, i32 4, <2 x i1> %mask)
ret void
@@ -1251,11 +1303,11 @@ define void @store_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: store_v4i32_v4i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmd %xmm0, %xmm0, %k1
; AVX512VLBW-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
; AVX512VL-LABEL: store_v4i32_v4i32:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1> %mask)
ret void
@@ -1422,12 +1474,12 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: store_v8i32_v8i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VLBW-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: vzeroupper
; AVX512VLBW-NEXT: retq
; AVX512VL-LABEL: store_v8i32_v8i32:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1
; AVX512VL-NEXT: vmovdqu32 %ymm1, (%rdi) {%k1}
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
%mask = icmp eq <8 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32> %val, <8 x i32>* %addr, i32 4, <8 x i1> %mask)
ret void
@@ -1702,6 +1754,82 @@ define void @store_v8i16_v8i16(<8 x i16> %trigger, <8 x i16>* %addr, <8 x i16> %
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v8i16_v8i16:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2
; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB13_2
; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store
; AVX512VLDQ-NEXT: vpextrw $0, %xmm1, (%rdi)
; AVX512VLDQ-NEXT: LBB13_2: ## %else
; AVX512VLDQ-NEXT: kshiftrb $1, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB13_4
; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1
; AVX512VLDQ-NEXT: vpextrw $1, %xmm1, 2(%rdi)
; AVX512VLDQ-NEXT: LBB13_4: ## %else2
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2
; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0
; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB13_6
; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store3
; AVX512VLDQ-NEXT: vpextrw $2, %xmm1, 4(%rdi)
; AVX512VLDQ-NEXT: LBB13_6: ## %else4
; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB13_8
; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store5
; AVX512VLDQ-NEXT: vpextrw $3, %xmm1, 6(%rdi)
; AVX512VLDQ-NEXT: LBB13_8: ## %else6
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2
; AVX512VLDQ-NEXT: vpmovsxwd %xmm2, %ymm2
; AVX512VLDQ-NEXT: vpmovd2m %ymm2, %k0
; AVX512VLDQ-NEXT: kshiftrb $4, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB13_10
; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store7
; AVX512VLDQ-NEXT: vpextrw $4, %xmm1, 8(%rdi)
; AVX512VLDQ-NEXT: LBB13_10: ## %else8
; AVX512VLDQ-NEXT: kshiftrb $5, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB13_12
; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store9
; AVX512VLDQ-NEXT: vpextrw $5, %xmm1, 10(%rdi)
; AVX512VLDQ-NEXT: LBB13_12: ## %else10
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0
; AVX512VLDQ-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512VLDQ-NEXT: vpmovd2m %ymm0, %k0
; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB13_14
; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store11
; AVX512VLDQ-NEXT: vpextrw $6, %xmm1, 12(%rdi)
; AVX512VLDQ-NEXT: LBB13_14: ## %else12
; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB13_16
; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store13
; AVX512VLDQ-NEXT: vpextrw $7, %xmm1, 14(%rdi)
; AVX512VLDQ-NEXT: LBB13_16: ## %else14
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: store_v8i16_v8i16:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmw %xmm0, %xmm0, %k1
@@ -2376,6 +2504,162 @@ define void @store_v16i16_v16i16(<16 x i16> %trigger, <16 x i16>* %addr, <16 x i
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v16i16_v16i16:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_2
; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store
; AVX512VLDQ-NEXT: vpextrw $0, %xmm1, (%rdi)
; AVX512VLDQ-NEXT: LBB14_2: ## %else
; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_4
; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1
; AVX512VLDQ-NEXT: vpextrw $1, %xmm1, 2(%rdi)
; AVX512VLDQ-NEXT: LBB14_4: ## %else2
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_6
; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store3
; AVX512VLDQ-NEXT: vpextrw $2, %xmm1, 4(%rdi)
; AVX512VLDQ-NEXT: LBB14_6: ## %else4
; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_8
; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store5
; AVX512VLDQ-NEXT: vpextrw $3, %xmm1, 6(%rdi)
; AVX512VLDQ-NEXT: LBB14_8: ## %else6
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_10
; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store7
; AVX512VLDQ-NEXT: vpextrw $4, %xmm1, 8(%rdi)
; AVX512VLDQ-NEXT: LBB14_10: ## %else8
; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_12
; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store9
; AVX512VLDQ-NEXT: vpextrw $5, %xmm1, 10(%rdi)
; AVX512VLDQ-NEXT: LBB14_12: ## %else10
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_14
; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store11
; AVX512VLDQ-NEXT: vpextrw $6, %xmm1, 12(%rdi)
; AVX512VLDQ-NEXT: LBB14_14: ## %else12
; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_16
; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store13
; AVX512VLDQ-NEXT: vpextrw $7, %xmm1, 14(%rdi)
; AVX512VLDQ-NEXT: LBB14_16: ## %else14
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_18
; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.store15
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrw $0, %xmm2, 16(%rdi)
; AVX512VLDQ-NEXT: LBB14_18: ## %else16
; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_20
; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.store17
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrw $1, %xmm2, 18(%rdi)
; AVX512VLDQ-NEXT: LBB14_20: ## %else18
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_22
; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.store19
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrw $2, %xmm2, 20(%rdi)
; AVX512VLDQ-NEXT: LBB14_22: ## %else20
; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_24
; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.store21
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrw $3, %xmm2, 22(%rdi)
; AVX512VLDQ-NEXT: LBB14_24: ## %else22
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxwd %ymm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_26
; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.store23
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrw $4, %xmm2, 24(%rdi)
; AVX512VLDQ-NEXT: LBB14_26: ## %else24
; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_28
; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.store25
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrw $5, %xmm2, 26(%rdi)
; AVX512VLDQ-NEXT: LBB14_28: ## %else26
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vpmovsxwd %ymm0, %zmm0
; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_30
; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.store27
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpextrw $6, %xmm0, 28(%rdi)
; AVX512VLDQ-NEXT: LBB14_30: ## %else28
; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB14_32
; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.store29
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpextrw $7, %xmm0, 30(%rdi)
; AVX512VLDQ-NEXT: LBB14_32: ## %else30
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
;
; AVX512VLBW-LABEL: store_v16i16_v16i16:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmw %ymm0, %ymm0, %k1
@@ -2908,6 +3192,154 @@ define void @store_v16i8_v16i8(<16 x i8> %trigger, <16 x i8>* %addr, <16 x i8> %
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v16i8_v16i8:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_2
; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store
; AVX512VLDQ-NEXT: vpextrb $0, %xmm1, (%rdi)
; AVX512VLDQ-NEXT: LBB15_2: ## %else
; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_4
; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1
; AVX512VLDQ-NEXT: vpextrb $1, %xmm1, 1(%rdi)
; AVX512VLDQ-NEXT: LBB15_4: ## %else2
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_6
; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store3
; AVX512VLDQ-NEXT: vpextrb $2, %xmm1, 2(%rdi)
; AVX512VLDQ-NEXT: LBB15_6: ## %else4
; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_8
; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store5
; AVX512VLDQ-NEXT: vpextrb $3, %xmm1, 3(%rdi)
; AVX512VLDQ-NEXT: LBB15_8: ## %else6
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_10
; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store7
; AVX512VLDQ-NEXT: vpextrb $4, %xmm1, 4(%rdi)
; AVX512VLDQ-NEXT: LBB15_10: ## %else8
; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_12
; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store9
; AVX512VLDQ-NEXT: vpextrb $5, %xmm1, 5(%rdi)
; AVX512VLDQ-NEXT: LBB15_12: ## %else10
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_14
; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store11
; AVX512VLDQ-NEXT: vpextrb $6, %xmm1, 6(%rdi)
; AVX512VLDQ-NEXT: LBB15_14: ## %else12
; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_16
; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store13
; AVX512VLDQ-NEXT: vpextrb $7, %xmm1, 7(%rdi)
; AVX512VLDQ-NEXT: LBB15_16: ## %else14
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_18
; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.store15
; AVX512VLDQ-NEXT: vpextrb $8, %xmm1, 8(%rdi)
; AVX512VLDQ-NEXT: LBB15_18: ## %else16
; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_20
; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.store17
; AVX512VLDQ-NEXT: vpextrb $9, %xmm1, 9(%rdi)
; AVX512VLDQ-NEXT: LBB15_20: ## %else18
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_22
; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.store19
; AVX512VLDQ-NEXT: vpextrb $10, %xmm1, 10(%rdi)
; AVX512VLDQ-NEXT: LBB15_22: ## %else20
; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_24
; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.store21
; AVX512VLDQ-NEXT: vpextrb $11, %xmm1, 11(%rdi)
; AVX512VLDQ-NEXT: LBB15_24: ## %else22
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_26
; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.store23
; AVX512VLDQ-NEXT: vpextrb $12, %xmm1, 12(%rdi)
; AVX512VLDQ-NEXT: LBB15_26: ## %else24
; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_28
; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.store25
; AVX512VLDQ-NEXT: vpextrb $13, %xmm1, 13(%rdi)
; AVX512VLDQ-NEXT: LBB15_28: ## %else26
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm0
; AVX512VLDQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_30
; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.store27
; AVX512VLDQ-NEXT: vpextrb $14, %xmm1, 14(%rdi)
; AVX512VLDQ-NEXT: LBB15_30: ## %else28
; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB15_32
; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.store29
; AVX512VLDQ-NEXT: vpextrb $15, %xmm1, 15(%rdi)
; AVX512VLDQ-NEXT: LBB15_32: ## %else30
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
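; NOTE: AVX512DQ provides vpmovd2m/vpmovq2m but not vpmovb2m/vpmovw2m (those
; are AVX512BW), which is why the byte compare above must first be
; sign-extended to dwords in a zmm before the mask can reach a k-register.
; Each conditional lane store then has the shape sketched below (hedged IR,
; lane 0 shown; block and value names are illustrative):
;
;     %bit = extractelement <16 x i1> %mask, i32 0
;     br i1 %bit, label %cond.store, label %else
;   cond.store:
;     %elt = extractelement <16 x i8> %val, i32 0
;     store i8 %elt, i8* %base, align 1
;     br label %else
;   else:
;     ...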
;
; AVX512VLBW-LABEL: store_v16i8_v16i8:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmb %xmm0, %xmm0, %k1
@@ -4253,6 +4685,337 @@ define void @store_v32i8_v32i8(<32 x i8> %trigger, <32 x i8>* %addr, <32 x i8> %
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: store_v32i8_v32i8:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3
; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_2
; AVX512VLDQ-NEXT: ## %bb.1: ## %cond.store
; AVX512VLDQ-NEXT: vpextrb $0, %xmm1, (%rdi)
; AVX512VLDQ-NEXT: LBB16_2: ## %else
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_4
; AVX512VLDQ-NEXT: ## %bb.3: ## %cond.store1
; AVX512VLDQ-NEXT: vpextrb $1, %xmm1, 1(%rdi)
; AVX512VLDQ-NEXT: LBB16_4: ## %else2
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3
; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_6
; AVX512VLDQ-NEXT: ## %bb.5: ## %cond.store3
; AVX512VLDQ-NEXT: vpextrb $2, %xmm1, 2(%rdi)
; AVX512VLDQ-NEXT: LBB16_6: ## %else4
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_8
; AVX512VLDQ-NEXT: ## %bb.7: ## %cond.store5
; AVX512VLDQ-NEXT: vpextrb $3, %xmm1, 3(%rdi)
; AVX512VLDQ-NEXT: LBB16_8: ## %else6
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3
; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_10
; AVX512VLDQ-NEXT: ## %bb.9: ## %cond.store7
; AVX512VLDQ-NEXT: vpextrb $4, %xmm1, 4(%rdi)
; AVX512VLDQ-NEXT: LBB16_10: ## %else8
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_12
; AVX512VLDQ-NEXT: ## %bb.11: ## %cond.store9
; AVX512VLDQ-NEXT: vpextrb $5, %xmm1, 5(%rdi)
; AVX512VLDQ-NEXT: LBB16_12: ## %else10
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3
; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_14
; AVX512VLDQ-NEXT: ## %bb.13: ## %cond.store11
; AVX512VLDQ-NEXT: vpextrb $6, %xmm1, 6(%rdi)
; AVX512VLDQ-NEXT: LBB16_14: ## %else12
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_16
; AVX512VLDQ-NEXT: ## %bb.15: ## %cond.store13
; AVX512VLDQ-NEXT: vpextrb $7, %xmm1, 7(%rdi)
; AVX512VLDQ-NEXT: LBB16_16: ## %else14
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3
; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_18
; AVX512VLDQ-NEXT: ## %bb.17: ## %cond.store15
; AVX512VLDQ-NEXT: vpextrb $8, %xmm1, 8(%rdi)
; AVX512VLDQ-NEXT: LBB16_18: ## %else16
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_20
; AVX512VLDQ-NEXT: ## %bb.19: ## %cond.store17
; AVX512VLDQ-NEXT: vpextrb $9, %xmm1, 9(%rdi)
; AVX512VLDQ-NEXT: LBB16_20: ## %else18
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3
; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_22
; AVX512VLDQ-NEXT: ## %bb.21: ## %cond.store19
; AVX512VLDQ-NEXT: vpextrb $10, %xmm1, 10(%rdi)
; AVX512VLDQ-NEXT: LBB16_22: ## %else20
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_24
; AVX512VLDQ-NEXT: ## %bb.23: ## %cond.store21
; AVX512VLDQ-NEXT: vpextrb $11, %xmm1, 11(%rdi)
; AVX512VLDQ-NEXT: LBB16_24: ## %else22
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3
; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_26
; AVX512VLDQ-NEXT: ## %bb.25: ## %cond.store23
; AVX512VLDQ-NEXT: vpextrb $12, %xmm1, 12(%rdi)
; AVX512VLDQ-NEXT: LBB16_26: ## %else24
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_28
; AVX512VLDQ-NEXT: ## %bb.27: ## %cond.store25
; AVX512VLDQ-NEXT: vpextrb $13, %xmm1, 13(%rdi)
; AVX512VLDQ-NEXT: LBB16_28: ## %else26
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm3
; AVX512VLDQ-NEXT: vpmovd2m %zmm3, %k0
; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_30
; AVX512VLDQ-NEXT: ## %bb.29: ## %cond.store27
; AVX512VLDQ-NEXT: vpextrb $14, %xmm1, 14(%rdi)
; AVX512VLDQ-NEXT: LBB16_30: ## %else28
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_32
; AVX512VLDQ-NEXT: ## %bb.31: ## %cond.store29
; AVX512VLDQ-NEXT: vpextrb $15, %xmm1, 15(%rdi)
; AVX512VLDQ-NEXT: LBB16_32: ## %else30
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_34
; AVX512VLDQ-NEXT: ## %bb.33: ## %cond.store31
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $0, %xmm2, 16(%rdi)
; AVX512VLDQ-NEXT: LBB16_34: ## %else32
; AVX512VLDQ-NEXT: kshiftrw $1, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_36
; AVX512VLDQ-NEXT: ## %bb.35: ## %cond.store33
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $1, %xmm2, 17(%rdi)
; AVX512VLDQ-NEXT: LBB16_36: ## %else34
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $2, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_38
; AVX512VLDQ-NEXT: ## %bb.37: ## %cond.store35
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $2, %xmm2, 18(%rdi)
; AVX512VLDQ-NEXT: LBB16_38: ## %else36
; AVX512VLDQ-NEXT: kshiftrw $3, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_40
; AVX512VLDQ-NEXT: ## %bb.39: ## %cond.store37
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $3, %xmm2, 19(%rdi)
; AVX512VLDQ-NEXT: LBB16_40: ## %else38
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $4, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_42
; AVX512VLDQ-NEXT: ## %bb.41: ## %cond.store39
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $4, %xmm2, 20(%rdi)
; AVX512VLDQ-NEXT: LBB16_42: ## %else40
; AVX512VLDQ-NEXT: kshiftrw $5, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_44
; AVX512VLDQ-NEXT: ## %bb.43: ## %cond.store41
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $5, %xmm2, 21(%rdi)
; AVX512VLDQ-NEXT: LBB16_44: ## %else42
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $6, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_46
; AVX512VLDQ-NEXT: ## %bb.45: ## %cond.store43
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $6, %xmm2, 22(%rdi)
; AVX512VLDQ-NEXT: LBB16_46: ## %else44
; AVX512VLDQ-NEXT: kshiftrw $7, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_48
; AVX512VLDQ-NEXT: ## %bb.47: ## %cond.store45
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $7, %xmm2, 23(%rdi)
; AVX512VLDQ-NEXT: LBB16_48: ## %else46
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $8, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_50
; AVX512VLDQ-NEXT: ## %bb.49: ## %cond.store47
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $8, %xmm2, 24(%rdi)
; AVX512VLDQ-NEXT: LBB16_50: ## %else48
; AVX512VLDQ-NEXT: kshiftrw $9, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_52
; AVX512VLDQ-NEXT: ## %bb.51: ## %cond.store49
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $9, %xmm2, 25(%rdi)
; AVX512VLDQ-NEXT: LBB16_52: ## %else50
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $10, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_54
; AVX512VLDQ-NEXT: ## %bb.53: ## %cond.store51
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $10, %xmm2, 26(%rdi)
; AVX512VLDQ-NEXT: LBB16_54: ## %else52
; AVX512VLDQ-NEXT: kshiftrw $11, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_56
; AVX512VLDQ-NEXT: ## %bb.55: ## %cond.store53
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $11, %xmm2, 27(%rdi)
; AVX512VLDQ-NEXT: LBB16_56: ## %else54
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm2, %xmm2
; AVX512VLDQ-NEXT: vpmovsxbd %xmm2, %zmm2
; AVX512VLDQ-NEXT: vpmovd2m %zmm2, %k0
; AVX512VLDQ-NEXT: kshiftrw $12, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_58
; AVX512VLDQ-NEXT: ## %bb.57: ## %cond.store55
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $12, %xmm2, 28(%rdi)
; AVX512VLDQ-NEXT: LBB16_58: ## %else56
; AVX512VLDQ-NEXT: kshiftrw $13, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_60
; AVX512VLDQ-NEXT: ## %bb.59: ## %cond.store57
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX512VLDQ-NEXT: vpextrb $13, %xmm2, 29(%rdi)
; AVX512VLDQ-NEXT: LBB16_60: ## %else58
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm0
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm0, %xmm0
; AVX512VLDQ-NEXT: vpmovsxbd %xmm0, %zmm0
; AVX512VLDQ-NEXT: vpmovd2m %zmm0, %k0
; AVX512VLDQ-NEXT: kshiftrw $14, %k0, %k1
; AVX512VLDQ-NEXT: kmovw %k1, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_62
; AVX512VLDQ-NEXT: ## %bb.61: ## %cond.store59
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpextrb $14, %xmm0, 30(%rdi)
; AVX512VLDQ-NEXT: LBB16_62: ## %else60
; AVX512VLDQ-NEXT: kshiftrw $15, %k0, %k0
; AVX512VLDQ-NEXT: kmovw %k0, %eax
; AVX512VLDQ-NEXT: testb $1, %al
; AVX512VLDQ-NEXT: je LBB16_64
; AVX512VLDQ-NEXT: ## %bb.63: ## %cond.store61
; AVX512VLDQ-NEXT: vextracti128 $1, %ymm1, %xmm0
; AVX512VLDQ-NEXT: vpextrb $15, %xmm0, 31(%rdi)
; AVX512VLDQ-NEXT: LBB16_64: ## %else62
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
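; NOTE: for v32i8 the upper sixteen lanes need an extra vextracti128 both to
; feed the high half of the compare result into the k-mask path and to pull
; the stored byte out of the high half of %val, which is why every cond.store
; from offset 16 upward begins with a 128-bit extract. The covering intrinsic
; call has this shape (sketch; alignment is illustrative):
;
;   call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 1, <32 x i1> %mask)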
;
; AVX512VLBW-LABEL: store_v32i8_v32i8:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestnmb %ymm0, %ymm0, %k1
@@ -4293,11 +5056,11 @@ define void @mstore_constmask_v4i32_v4i32(<4 x i32> %trigger, <4 x i32>* %addr,
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: mstore_constmask_v4i32_v4i32:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: kxnorw %k0, %k0, %k1
; AVX512VLBW-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
; AVX512VL-LABEL: mstore_constmask_v4i32_v4i32:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: kxnorw %k0, %k0, %k1
; AVX512VL-NEXT: vmovdqu32 %xmm1, (%rdi) {%k1}
; AVX512VL-NEXT: retq
%mask = icmp eq <4 x i32> %trigger, zeroinitializer
call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 true, i1 true, i1 true>)
ret void
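; NOTE: because the mask operand here is the constant <i1 true, i1 true,
; i1 true, i1 true>, the %mask produced by the icmp above is dead, and the
; AVX512VL lowering only has to build an all-ones k-register; that is what the
; kxnorw %k0, %k0, %k1 idiom does before the single masked vmovdqu32.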
@@ -4491,6 +5254,14 @@ define void @masked_store_bool_mask_demand_trunc_sext(<4 x double> %x, <4 x doub
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: masked_store_bool_mask_demand_trunc_sext:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k1
; AVX512VLDQ-NEXT: vmovupd %ymm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: vzeroupper
; AVX512VLDQ-NEXT: retq
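; NOTE: only bit 0 of each i32 mask element is demanded here, so the DQ
; lowering shifts it into the sign bit with vpslld $31 and converts the
; vector directly to a k-mask with vpmovd2m, which reads element sign bits.
; In IR the mask feeding the store reduces to a truncation, as in the hedged
; sketch below (names and alignment are illustrative):
;
;   %b = trunc <4 x i32> %bool to <4 x i1>
;   call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %x, <4 x double>* %p, i32 8, <4 x i1> %b)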
;
; AVX512VLBW-LABEL: masked_store_bool_mask_demand_trunc_sext:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1
@@ -4606,11 +5377,11 @@ define void @one_mask_bit_set1_variable(<4 x float>* %addr, <4 x float> %val, <4
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLBW-LABEL: one_mask_bit_set1_variable:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm1, %k1
; AVX512VLBW-NEXT: vmovups %xmm0, (%rdi) {%k1}
; AVX512VLBW-NEXT: retq
; AVX512VL-LABEL: one_mask_bit_set1_variable:
; AVX512VL: ## %bb.0:
; AVX512VL-NEXT: vptestmd {{.*}}(%rip){1to4}, %xmm1, %k1
; AVX512VL-NEXT: vmovups %xmm0, (%rdi) {%k1}
; AVX512VL-NEXT: retq
%mask_signbit = and <4 x i32> %mask, <i32 2147483648, i32 2147483648, i32 2147483648, i32 2147483648>
%mask_bool = icmp ne <4 x i32> %mask_signbit, zeroinitializer
call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %val, <4 x float>* %addr, i32 1, <4 x i1> %mask_bool)
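; NOTE: the and with 2147483648 (0x80000000) above isolates the sign bit, so
; the AVX512VL lowering folds the whole icmp-ne-zero mask computation into a
; single vptestmd against a {1to4}-broadcast sign-bit constant, leaving just
; one masked vmovups; no scalarization is needed for i32/f32 elements.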
@@ -4708,6 +5479,17 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512VLDQ-LABEL: widen_masked_store:
; AVX512VLDQ: ## %bb.0:
; AVX512VLDQ-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k0
; AVX512VLDQ-NEXT: vpmovm2d %k0, %xmm1
; AVX512VLDQ-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX512VLDQ-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3]
; AVX512VLDQ-NEXT: vpmovd2m %xmm1, %k1
; AVX512VLDQ-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1}
; AVX512VLDQ-NEXT: retq
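; NOTE: the <3 x i1> mask must be widened to four lanes before a 128-bit
; masked vmovdqa32 can be used; the lowering above round-trips the k-mask
; through a vector (vpmovm2d), zeroes the top lane with vpblendd against a
; zero register, and converts back with vpmovd2m so lane 3 can never store.
; The call being widened has this shape (sketch; alignment is illustrative):
;
;   call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask)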
;
; AVX512VLBW-LABEL: widen_masked_store:
; AVX512VLBW: ## %bb.0:
; AVX512VLBW-NEXT: vpslld $31, %xmm1, %xmm1