[AVX512] Don't switch unmasked subvector insert/extract instructions when AVX512DQI is enabled.

There's no reason to select different instructions for unmasked operations depending on whether DQI is available. It just creates extra isel patterns and test divergences.

There is, however, value in enabling the masked versions of these instructions with DQI.

This required introducing some new multiclasses to enable this splitting.
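To make the split concrete, here is how the DQ-only 512-bit insert ends up instantiated after this change (a sketch condensed from the X86InstrAVX512.td hunks below; the multiclass and operator names are the real ones from that file, with the opcode/type parameters abbreviated as in the vinsert_for_type multiclass):

    // Passing null_frag as the unmasked pattern operator means the unmasked
    // form of vinsertf64x2 gets no isel pattern, so unmasked inserts keep
    // selecting the common 32x4/64x4 encodings. The masked and zero-masked
    // forms still match via vinsert128_insert.
    let Predicates = [HasDQI] in
    defm NAME # "64x2Z" : vinsert_for_size_split<Opcode128,
                            X86VectorVTInfo<2, EltVT64, VR128X>,
                            X86VectorVTInfo<8, EltVT64, VR512>,
                            null_frag,           // unmasked: no pattern
                            vinsert128_insert>,  // masked pattern only
                          VEX_W, EVEX_V512;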

Differential Revision: https://reviews.llvm.org/D36661

llvm-svn: 311091
Craig Topper 2017-08-17 15:40:25 +00:00
parent 5960848060
commit 3a622a14f9
15 changed files with 638 additions and 1483 deletions


@ -282,6 +282,28 @@ multiclass AVX512_maskable_fp_common<bits<8> O, Format F, X86VectorVTInfo _,
MaskingConstraint, itin, IsCommutable,
IsKCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
// preserved vector elements come from a new dummy input operand tied to $dst.
// This version uses a separate dag for non-masking and masking.
multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskRHS,
InstrItinClass itin = NoItinerary,
bit IsCommutable = 0, bit IsKCommutable = 0,
SDNode Select = vselect> :
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm,
[(set _.RC:$dst, RHS)],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, MaskRHS, _.RC:$src0))],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
"$src0 = $dst", itin, IsCommutable, IsKCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
// preserved vector elements come from a new dummy input operand tied to $dst.
@ -512,28 +534,45 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
//===----------------------------------------------------------------------===//
// AVX-512 - VECTOR INSERT
//
multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To,
PatFrag vinsert_insert> {
// Supports two different pattern operators for masked and unmasked ops. Allows
// null_frag to be passed for one.
multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
X86VectorVTInfo To,
SDPatternOperator vinsert_insert,
SDPatternOperator vinsert_for_mask> {
let ExeDomain = To.ExeDomain in {
defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
defm rr : AVX512_maskable_split<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.RC:$src2, u8imm:$src3),
"vinsert" # From.EltTypeName # "x" # From.NumElts,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(vinsert_insert:$src3 (To.VT To.RC:$src1),
(From.VT From.RC:$src2),
(iPTR imm))>, AVX512AIi8Base, EVEX_4V;
(iPTR imm)),
(vinsert_for_mask:$src3 (To.VT To.RC:$src1),
(From.VT From.RC:$src2),
(iPTR imm))>, AVX512AIi8Base, EVEX_4V;
defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
defm rm : AVX512_maskable_split<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
"vinsert" # From.EltTypeName # "x" # From.NumElts,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(vinsert_insert:$src3 (To.VT To.RC:$src1),
(From.VT (bitconvert (From.LdFrag addr:$src2))),
(iPTR imm)),
(vinsert_for_mask:$src3 (To.VT To.RC:$src1),
(From.VT (bitconvert (From.LdFrag addr:$src2))),
(iPTR imm))>, AVX512AIi8Base, EVEX_4V,
EVEX_CD8<From.EltSize, From.CD8TupleForm>;
}
}
// Passes the same pattern operator for masked and unmasked ops.
multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From,
X86VectorVTInfo To,
SDPatternOperator vinsert_insert> :
vinsert_for_size_split<Opcode, From, To, vinsert_insert, vinsert_insert>;
multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
X86VectorVTInfo To, PatFrag vinsert_insert,
SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> {
@ -573,22 +612,24 @@ multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
X86VectorVTInfo< 8, EltVT64, VR512>,
vinsert256_insert>, VEX_W, EVEX_V512;
// Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasVLX, HasDQI] in
defm NAME # "64x2Z256" : vinsert_for_size<Opcode128,
defm NAME # "64x2Z256" : vinsert_for_size_split<Opcode128,
X86VectorVTInfo< 2, EltVT64, VR128X>,
X86VectorVTInfo< 4, EltVT64, VR256X>,
vinsert128_insert>, VEX_W, EVEX_V256;
null_frag, vinsert128_insert>, VEX_W, EVEX_V256;
// Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasDQI] in {
defm NAME # "64x2Z" : vinsert_for_size<Opcode128,
defm NAME # "64x2Z" : vinsert_for_size_split<Opcode128,
X86VectorVTInfo< 2, EltVT64, VR128X>,
X86VectorVTInfo< 8, EltVT64, VR512>,
vinsert128_insert>, VEX_W, EVEX_V512;
null_frag, vinsert128_insert>, VEX_W, EVEX_V512;
defm NAME # "32x8Z" : vinsert_for_size<Opcode256,
defm NAME # "32x8Z" : vinsert_for_size_split<Opcode256,
X86VectorVTInfo< 8, EltVT32, VR256X>,
X86VectorVTInfo<16, EltVT32, VR512>,
vinsert256_insert>, EVEX_V512;
null_frag, vinsert256_insert>, EVEX_V512;
}
}
@ -596,21 +637,21 @@ defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>;
defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>;
// Codegen pattern with the alternative types,
// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
// Even with AVX512DQ we'll still use these for unmasked operations.
defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
// Codegen pattern with the alternative types insert VEC128 into VEC256
defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
@ -647,16 +688,20 @@ def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
// AVX-512 VECTOR EXTRACT
//---
multiclass vextract_for_size<int Opcode,
X86VectorVTInfo From, X86VectorVTInfo To,
PatFrag vextract_extract> {
// Supports two different pattern operators for masked and unmasked ops. Allows
// null_frag to be passed for one.
multiclass vextract_for_size_split<int Opcode,
X86VectorVTInfo From, X86VectorVTInfo To,
SDPatternOperator vextract_extract,
SDPatternOperator vextract_for_mask> {
let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
defm rr : AVX512_maskable<Opcode, MRMDestReg, To, (outs To.RC:$dst),
defm rr : AVX512_maskable_split<Opcode, MRMDestReg, To, (outs To.RC:$dst),
(ins From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts,
"$idx, $src1", "$src1, $idx",
(vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm))>,
(vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)),
(vextract_for_mask:$idx (From.VT From.RC:$src1), (iPTR imm))>,
AVX512AIi8Base, EVEX;
def mr : AVX512AIi8<Opcode, MRMDestMem, (outs),
(ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx),
@ -677,6 +722,12 @@ multiclass vextract_for_size<int Opcode,
}
}
// Passes the same pattern operator for masked and unmasked ops.
multiclass vextract_for_size<int Opcode, X86VectorVTInfo From,
X86VectorVTInfo To,
SDPatternOperator vextract_extract> :
vextract_for_size_split<Opcode, From, To, vextract_extract, vextract_extract>;
// Codegen pattern for the alternative types
multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
X86VectorVTInfo To, PatFrag vextract_extract,
@ -713,22 +764,26 @@ multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
X86VectorVTInfo< 4, EltVT32, VR128X>,
vextract128_extract>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
// Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasVLX, HasDQI] in
defm NAME # "64x2Z256" : vextract_for_size<Opcode128,
defm NAME # "64x2Z256" : vextract_for_size_split<Opcode128,
X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
vextract128_extract>,
null_frag, vextract128_extract>,
VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>;
// Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasDQI] in {
defm NAME # "64x2Z" : vextract_for_size<Opcode128,
defm NAME # "64x2Z" : vextract_for_size_split<Opcode128,
X86VectorVTInfo< 8, EltVT64, VR512>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
vextract128_extract>,
null_frag, vextract128_extract>,
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
defm NAME # "32x8Z" : vextract_for_size<Opcode256,
defm NAME # "32x8Z" : vextract_for_size_split<Opcode256,
X86VectorVTInfo<16, EltVT32, VR512>,
X86VectorVTInfo< 8, EltVT32, VR256X>,
vextract256_extract>,
null_frag, vextract256_extract>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
}
}
@ -737,21 +792,21 @@ defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>;
defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>;
// extract_subvector codegen patterns with the alternative types.
// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
// Even with AVX512DQ we'll still use these for unmasked operations.
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
// Codegen pattern with the alternative types extract VEC128 from VEC256
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,


@ -642,19 +642,12 @@ define <4 x i32> @fptosi03(<4 x double> %a) {
}
define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
; NODQ-LABEL: fptrunc00:
; NODQ: # BB#0:
; NODQ-NEXT: vcvtpd2ps %zmm0, %ymm0
; NODQ-NEXT: vcvtpd2ps %zmm1, %ymm1
; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; NODQ-NEXT: retq
;
; DQ-LABEL: fptrunc00:
; DQ: # BB#0:
; DQ-NEXT: vcvtpd2ps %zmm0, %ymm0
; DQ-NEXT: vcvtpd2ps %zmm1, %ymm1
; DQ-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0
; DQ-NEXT: retq
; ALL-LABEL: fptrunc00:
; ALL: # BB#0:
; ALL-NEXT: vcvtpd2ps %zmm0, %ymm0
; ALL-NEXT: vcvtpd2ps %zmm1, %ymm1
; ALL-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: retq
%a = fptrunc <16 x double> %b to <16 x float>
ret <16 x float> %a
}
@ -876,21 +869,13 @@ define i32 @float_to_int(float %x) {
}
define <16 x double> @uitof64(<16 x i32> %a) nounwind {
; NODQ-LABEL: uitof64:
; NODQ: # BB#0:
; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm2
; NODQ-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; NODQ-NEXT: vcvtudq2pd %ymm0, %zmm1
; NODQ-NEXT: vmovaps %zmm2, %zmm0
; NODQ-NEXT: retq
;
; DQ-LABEL: uitof64:
; DQ: # BB#0:
; DQ-NEXT: vcvtudq2pd %ymm0, %zmm2
; DQ-NEXT: vextractf32x8 $1, %zmm0, %ymm0
; DQ-NEXT: vcvtudq2pd %ymm0, %zmm1
; DQ-NEXT: vmovaps %zmm2, %zmm0
; DQ-NEXT: retq
; ALL-LABEL: uitof64:
; ALL: # BB#0:
; ALL-NEXT: vcvtudq2pd %ymm0, %zmm2
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; ALL-NEXT: vcvtudq2pd %ymm0, %zmm1
; ALL-NEXT: vmovaps %zmm2, %zmm0
; ALL-NEXT: retq
%b = uitofp <16 x i32> %a to <16 x double>
ret <16 x double> %b
}


@ -20,25 +20,15 @@ define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
}
define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
; KNL-LABEL: test2:
; KNL: ## BB#0:
; KNL-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
; KNL-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
; KNL-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; KNL-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; KNL-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; KNL-NEXT: retq
; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test2:
; SKX: ## BB#0:
; SKX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
; SKX-NEXT: vinsertf64x2 $0, %xmm2, %zmm0, %zmm2
; SKX-NEXT: vextractf64x2 $3, %zmm0, %xmm0
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SKX-NEXT: vinsertf64x2 $3, %xmm0, %zmm2, %zmm0
; SKX-NEXT: retq
; SKX-NEXT: ## -- End function
; CHECK-LABEL: test2:
; CHECK: ## BB#0:
; CHECK-NEXT: vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
; CHECK-NEXT: vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
; CHECK-NEXT: vextractf32x4 $3, %zmm0, %xmm0
; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; CHECK-NEXT: vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
; CHECK-NEXT: retq
; CHECK-NEXT: ## -- End function
%rrr = load double, double* %br
%rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
%rrr3 = insertelement <8 x double> %rrr2, double %y, i32 6
@ -59,23 +49,14 @@ define <16 x float> @test3(<16 x float> %x) nounwind {
}
define <8 x i64> @test4(<8 x i64> %x) nounwind {
; KNL-LABEL: test4:
; KNL: ## BB#0:
; KNL-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; KNL-NEXT: vmovq %xmm1, %rax
; KNL-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
; KNL-NEXT: ## -- End function
;
; SKX-LABEL: test4:
; SKX: ## BB#0:
; SKX-NEXT: vextracti64x2 $2, %zmm0, %xmm1
; SKX-NEXT: vmovq %xmm1, %rax
; SKX-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
; SKX-NEXT: ## -- End function
; CHECK-LABEL: test4:
; CHECK: ## BB#0:
; CHECK-NEXT: vextracti32x4 $2, %zmm0, %xmm1
; CHECK-NEXT: vmovq %xmm1, %rax
; CHECK-NEXT: vpinsrq $1, %rax, %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
; CHECK-NEXT: ## -- End function
%eee = extractelement <8 x i64> %x, i32 4
%rrr2 = insertelement <8 x i64> %x, i64 %eee, i32 1
ret <8 x i64> %rrr2
@ -477,7 +458,7 @@ define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
; SKX-LABEL: extract_v8i64:
; SKX: ## BB#0:
; SKX-NEXT: vpextrq $1, %xmm0, %rax
; SKX-NEXT: vextracti64x2 $1, %zmm0, %xmm0
; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; SKX-NEXT: vpextrq $1, %xmm0, (%rdi)
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@ -693,23 +674,14 @@ define i8 @extract_v16i8(<16 x i8> %x, i8* %dst) {
}
define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
; KNL-LABEL: insert_v8i64:
; KNL: ## BB#0:
; KNL-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; KNL-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; KNL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; KNL-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: insert_v8i64:
; SKX: ## BB#0:
; SKX-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm1
; SKX-NEXT: vextracti64x2 $1, %zmm0, %xmm0
; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; SKX-NEXT: vinserti64x2 $1, %xmm0, %zmm1, %zmm0
; SKX-NEXT: retq
; CHECK-LABEL: insert_v8i64:
; CHECK: ## BB#0:
; CHECK-NEXT: vpinsrq $1, (%rsi), %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm1
; CHECK-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm0
; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0
; CHECK-NEXT: retq
%val = load i64, i64* %ptr
%r1 = insertelement <8 x i64> %x, i64 %val, i32 1
%r2 = insertelement <8 x i64> %r1, i64 %y, i32 3
@ -888,17 +860,11 @@ define <16 x i8> @insert_v16i8(<16 x i8> %x, i8 %y, i8* %ptr) {
}
define <8 x i64> @test_insert_128_v8i64(<8 x i64> %x, i64 %y) {
; KNL-LABEL: test_insert_128_v8i64:
; KNL: ## BB#0:
; KNL-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
; KNL-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_insert_128_v8i64:
; SKX: ## BB#0:
; SKX-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
; SKX-NEXT: vinserti64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
; CHECK-LABEL: test_insert_128_v8i64:
; CHECK: ## BB#0:
; CHECK-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1
; CHECK-NEXT: vinserti32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%r = insertelement <8 x i64> %x, i64 %y, i32 1
ret <8 x i64> %r
}
@ -914,17 +880,11 @@ define <16 x i32> @test_insert_128_v16i32(<16 x i32> %x, i32 %y) {
}
define <8 x double> @test_insert_128_v8f64(<8 x double> %x, double %y) {
; KNL-LABEL: test_insert_128_v8f64:
; KNL: ## BB#0:
; KNL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; KNL-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; KNL-NEXT: retq
;
; SKX-LABEL: test_insert_128_v8f64:
; SKX: ## BB#0:
; SKX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; SKX-NEXT: vinsertf64x2 $0, %xmm1, %zmm0, %zmm0
; SKX-NEXT: retq
; CHECK-LABEL: test_insert_128_v8f64:
; CHECK: ## BB#0:
; CHECK-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm0[0],xmm1[0]
; CHECK-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0
; CHECK-NEXT: retq
%r = insertelement <8 x double> %x, double %y, i32 1
ret <8 x double> %r
}


@ -726,27 +726,16 @@ define <8 x i8> @usat_trunc_wb_128(<8 x i16> %i) {
}
define <16 x i16> @usat_trunc_qw_1024(<16 x i64> %i) {
; KNL-LABEL: usat_trunc_qw_1024:
; KNL: ## BB#0:
; KNL-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
; KNL-NEXT: vpminuq %zmm2, %zmm1, %zmm1
; KNL-NEXT: vpminuq %zmm2, %zmm0, %zmm0
; KNL-NEXT: vpmovqd %zmm0, %ymm0
; KNL-NEXT: vpmovqd %zmm1, %ymm1
; KNL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; KNL-NEXT: vpmovdw %zmm0, %ymm0
; KNL-NEXT: retq
;
; SKX-LABEL: usat_trunc_qw_1024:
; SKX: ## BB#0:
; SKX-NEXT: vpbroadcastq {{.*}}(%rip), %zmm2
; SKX-NEXT: vpminuq %zmm2, %zmm1, %zmm1
; SKX-NEXT: vpminuq %zmm2, %zmm0, %zmm0
; SKX-NEXT: vpmovqd %zmm0, %ymm0
; SKX-NEXT: vpmovqd %zmm1, %ymm1
; SKX-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; SKX-NEXT: vpmovdw %zmm0, %ymm0
; SKX-NEXT: retq
; ALL-LABEL: usat_trunc_qw_1024:
; ALL: ## BB#0:
; ALL-NEXT: vpbroadcastq {{.*#+}} zmm2 = [65535,65535,65535,65535,65535,65535,65535,65535]
; ALL-NEXT: vpminuq %zmm2, %zmm1, %zmm1
; ALL-NEXT: vpminuq %zmm2, %zmm0, %zmm0
; ALL-NEXT: vpmovqd %zmm0, %ymm0
; ALL-NEXT: vpmovqd %zmm1, %ymm1
; ALL-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; ALL-NEXT: vpmovdw %zmm0, %ymm0
; ALL-NEXT: retq
%x3 = icmp ult <16 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
%x5 = select <16 x i1> %x3, <16 x i64> %i, <16 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
%x6 = trunc <16 x i64> %x5 to <16 x i16>


@ -6,7 +6,7 @@ declare <2 x double> @llvm.x86.avx512.mask.vextractf64x2.512(<8 x double>, i32,
define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0, <2 x double> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vextractf64x2 $1, %zmm0, %xmm0
; CHECK-NEXT: vextractf32x4 $1, %zmm0, %xmm0
; CHECK-NEXT: kmovw %edi, %k0
; CHECK-NEXT: kshiftlb $7, %k0, %k1
; CHECK-NEXT: kshiftrb $7, %k1, %k1
@ -36,7 +36,7 @@ declare <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float>, i32, <
define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x float> %x2, i8 %x3) {
; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8:
; CHECK: ## BB#0:
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2
; CHECK-NEXT: vextractf64x4 $1, %zmm0, %ymm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
@ -56,7 +56,7 @@ declare <16 x float> @llvm.x86.avx512.mask.insertf32x8.512(<16 x float>, <8 x fl
define <16 x float>@test_int_x86_avx512_mask_insertf32x8_512(<16 x float> %x0, <8 x float> %x1, <16 x float> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf32x8_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinsertf32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
@ -76,7 +76,7 @@ declare <8 x double> @llvm.x86.avx512.mask.insertf64x2.512(<8 x double>, <2 x do
define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <2 x double> %x1,<8 x double> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
@ -96,7 +96,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.inserti32x8.512(<16 x i32>, <8 x i32>,
define <16 x i32>@test_int_x86_avx512_mask_inserti32x8_512(<16 x i32> %x0, <8 x i32> %x1, <16 x i32> %x3, i16 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti32x8_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0 {%k1} {z}
@ -116,7 +116,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.inserti64x2.512(<8 x i64>, <2 x i64>, i3
define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i64> %x1, <8 x i64> %x3, i8 %x4) {
; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512:
; CHECK: ## BB#0:
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
; CHECK-NEXT: vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
@ -162,7 +162,7 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512(<8 x float> %x0,
; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x8_512:
; CHECK: ## BB#0:
; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; CHECK-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm2
; CHECK-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1
@ -231,7 +231,7 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x8_512(<8 x i32> %x0, <16
; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x8_512:
; CHECK: ## BB#0:
; CHECK-NEXT: ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; CHECK-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm2
; CHECK-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm2
; CHECK-NEXT: kmovw %edi, %k1
; CHECK-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm1 {%k1}
; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1


@ -334,7 +334,7 @@ define <32 x float> @test15(float* %base, <32 x float> %src0, <32 x i32> %trigge
define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %trigger) {
; SKX-LABEL: test16:
; SKX: # BB#0:
; SKX-NEXT: vextracti32x8 $1, %zmm2, %ymm3
; SKX-NEXT: vextracti64x4 $1, %zmm2, %ymm3
; SKX-NEXT: vpxor %xmm4, %xmm4, %xmm4
; SKX-NEXT: vpcmpeqd %ymm4, %ymm3, %k1
; SKX-NEXT: vpcmpeqd %ymm4, %ymm2, %k2


@ -422,10 +422,10 @@ define <16 x i32> @test8(<16 x i32*> %ptr.random, <16 x i32> %ind, i16 %mask) {
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
; SKX-NEXT: kmovw %k1, %k3
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm4
; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; SKX-NEXT: retq
;
@ -750,7 +750,7 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqps (,%zmm0), %ymm1 {%k2}
; SKX-NEXT: vgatherqps (,%zmm0), %ymm2 {%k1}
; SKX-NEXT: vinsertf32x8 $1, %ymm1, %zmm2, %zmm0
; SKX-NEXT: vinsertf64x4 $1, %ymm1, %zmm2, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test14:
@ -1686,11 +1686,11 @@ define <16 x i32> @test_gather_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm2
; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
; SKX-NEXT: vinserti32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16i32:
@ -1772,7 +1772,7 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm1 {%k2}
; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
@ -1809,11 +1809,11 @@ define <16 x float> @test_gather_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16
; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
; SKX-NEXT: vpslld $31, %zmm2, %zmm2
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm2
; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
; SKX-NEXT: vgatherqps (,%zmm0), %ymm3 {%k1}
; SKX-NEXT: vinsertf32x8 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: vinsertf64x4 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_gather_16f32:
@ -1895,7 +1895,7 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
; SKX_32-NEXT: vextractf32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm1 {%k2}
; SKX_32-NEXT: vmovapd %zmm2, %zmm0
; SKX_32-NEXT: movl %ebp, %esp
@ -1934,7 +1934,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextracti32x8 $1, %zmm3, %ymm0
; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm0
; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@ -2016,7 +2016,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
; SKX_32-NEXT: vextracti32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
@ -2055,7 +2055,7 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
; SKX-NEXT: vptestmd %zmm2, %zmm2, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; SKX-NEXT: vextractf32x8 $1, %zmm3, %ymm0
; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm0
; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@ -2138,7 +2138,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
; SKX_32-NEXT: vextractf32x8 $1, %zmm0, %ymm0
; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp


@ -458,7 +458,7 @@ define <4 x float> @stack_fold_extracti32x4(<16 x float> %a0, <16 x float> %a1)
define <2 x double> @stack_fold_extractf64x2(<8 x double> %a0, <8 x double> %a1) {
;CHECK-LABEL: stack_fold_extractf64x2
;CHECK: vextractf64x2 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
;CHECK: vextractf32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
%1 = shufflevector <8 x double> %a0, <8 x double> %a1, <2 x i32> <i32 6, i32 7>
%2 = tail call <2 x double> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <2 x double> %1
@ -466,7 +466,7 @@ define <2 x double> @stack_fold_extractf64x2(<8 x double> %a0, <8 x double> %a1)
define <8 x float> @stack_fold_extracti32x8(<16 x float> %a0, <16 x float> %a1) {
;CHECK-LABEL: stack_fold_extracti32x8
;CHECK: vextractf32x8 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
;CHECK: vextractf64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
%1 = shufflevector <16 x float> %a0, <16 x float> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
ret <8 x float> %1
@ -482,7 +482,7 @@ define <4 x double> @stack_fold_extractf64x4(<8 x double> %a0, <8 x double> %a1)
define <16 x float> @stack_fold_insertf32x8(<8 x float> %a0, <8 x float> %a1) {
;CHECK-LABEL: stack_fold_insertf32x8
;CHECK: vinsertf32x8 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
;CHECK: vinsertf64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
%2 = shufflevector <8 x float> %a0, <8 x float> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
ret <16 x float> %2


@ -130,7 +130,7 @@ define <4 x i32> @stack_fold_extracti32x4(<16 x i32> %a0, <16 x i32> %a1) {
define <2 x i64> @stack_fold_extracti64x2(<8 x i64> %a0, <8 x i64> %a1) {
;CHECK-LABEL: stack_fold_extracti64x2
;CHECK: vextracti64x2 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
;CHECK: vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
; add forces execution domain
%1 = add <8 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
%2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
@ -140,7 +140,7 @@ define <2 x i64> @stack_fold_extracti64x2(<8 x i64> %a0, <8 x i64> %a1) {
define <8 x i32> @stack_fold_extracti32x8(<16 x i32> %a0, <16 x i32> %a1) {
;CHECK-LABEL: stack_fold_extracti32x8
;CHECK: vextracti32x8 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
;CHECK: vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
; add forces execution domain
%1 = add <16 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
%2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@ -160,7 +160,7 @@ define <4 x i64> @stack_fold_extracti64x4(<8 x i64> %a0, <8 x i64> %a1) {
define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) {
;CHECK-LABEL: stack_fold_inserti32x8
;CHECK: vinserti32x8 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
;CHECK: vinserti64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
%1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
%2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
; add forces execution domain


@ -1457,26 +1457,12 @@ define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: reg_broadcast_4f32_16f32:
; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: reg_broadcast_4f32_16f32:
; X32-AVX512BW: # BB#0:
; X32-AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: reg_broadcast_4f32_16f32:
; X32-AVX512DQ: # BB#0:
; X32-AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512DQ-NEXT: retl
; X32-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X32-AVX512: # BB#0:
; X32-AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX: # BB#0:
@ -1485,26 +1471,12 @@ define <16 x float> @reg_broadcast_4f32_16f32(<4 x float> %a0) nounwind {
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX512F: # BB#0:
; X64-AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX512BW: # BB#0:
; X64-AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX512DQ: # BB#0:
; X64-AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512DQ-NEXT: retq
; X64-AVX512-LABEL: reg_broadcast_4f32_16f32:
; X64-AVX512: # BB#0:
; X64-AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT: retq
%1 = shufflevector <4 x float> %a0, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <16 x float> %1
}
@ -1515,46 +1487,22 @@ define <16 x float> @reg_broadcast_8f32_16f32(<8 x float> %a0) nounwind {
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: reg_broadcast_8f32_16f32:
; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: reg_broadcast_8f32_16f32:
; X32-AVX512BW: # BB#0:
; X32-AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: reg_broadcast_8f32_16f32:
; X32-AVX512DQ: # BB#0:
; X32-AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X32-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512DQ-NEXT: retl
; X32-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X32-AVX512: # BB#0:
; X32-AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX: # BB#0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX512F: # BB#0:
; X64-AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX512BW: # BB#0:
; X64-AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX512DQ: # BB#0:
; X64-AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X64-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512DQ-NEXT: retq
; X64-AVX512-LABEL: reg_broadcast_8f32_16f32:
; X64-AVX512: # BB#0:
; X64-AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT: retq
%1 = shufflevector <8 x float> %a0, <8 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <16 x float> %1
}
@ -1583,26 +1531,12 @@ define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: reg_broadcast_4i32_16i32:
; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: reg_broadcast_4i32_16i32:
; X32-AVX512BW: # BB#0:
; X32-AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: reg_broadcast_4i32_16i32:
; X32-AVX512DQ: # BB#0:
; X32-AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512DQ-NEXT: retl
; X32-AVX512-LABEL: reg_broadcast_4i32_16i32:
; X32-AVX512: # BB#0:
; X32-AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X32-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX: # BB#0:
@ -1611,26 +1545,12 @@ define <16 x i32> @reg_broadcast_4i32_16i32(<4 x i32> %a0) nounwind {
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX512F: # BB#0:
; X64-AVX512F-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX512BW: # BB#0:
; X64-AVX512BW-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-AVX512BW-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX512DQ: # BB#0:
; X64-AVX512DQ-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-AVX512DQ-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512DQ-NEXT: retq
; X64-AVX512-LABEL: reg_broadcast_4i32_16i32:
; X64-AVX512: # BB#0:
; X64-AVX512-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; X64-AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT: retq
%1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <16 x i32> %1
}
@ -1641,46 +1561,22 @@ define <16 x i32> @reg_broadcast_8i32_16i32(<8 x i32> %a0) nounwind {
; X32-AVX-NEXT: vmovaps %ymm0, %ymm1
; X32-AVX-NEXT: retl
;
; X32-AVX512F-LABEL: reg_broadcast_8i32_16i32:
; X32-AVX512F: # BB#0:
; X32-AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X32-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512F-NEXT: retl
;
; X32-AVX512BW-LABEL: reg_broadcast_8i32_16i32:
; X32-AVX512BW: # BB#0:
; X32-AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X32-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512BW-NEXT: retl
;
; X32-AVX512DQ-LABEL: reg_broadcast_8i32_16i32:
; X32-AVX512DQ: # BB#0:
; X32-AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X32-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512DQ-NEXT: retl
; X32-AVX512-LABEL: reg_broadcast_8i32_16i32:
; X32-AVX512: # BB#0:
; X32-AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X32-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X32-AVX512-NEXT: retl
;
; X64-AVX-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX: # BB#0:
; X64-AVX-NEXT: vmovaps %ymm0, %ymm1
; X64-AVX-NEXT: retq
;
; X64-AVX512F-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX512F: # BB#0:
; X64-AVX512F-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X64-AVX512F-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512F-NEXT: retq
;
; X64-AVX512BW-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX512BW: # BB#0:
; X64-AVX512BW-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X64-AVX512BW-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512BW-NEXT: retq
;
; X64-AVX512DQ-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX512DQ: # BB#0:
; X64-AVX512DQ-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X64-AVX512DQ-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512DQ-NEXT: retq
; X64-AVX512-LABEL: reg_broadcast_8i32_16i32:
; X64-AVX512: # BB#0:
; X64-AVX512-NEXT: # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
; X64-AVX512-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; X64-AVX512-NEXT: retq
%1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
ret <16 x i32> %1
}

File diff suppressed because it is too large.


@ -274,7 +274,7 @@ define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a
define <8 x i32> @test_v16i32_1_3_5_7_9_11_13_15(<16 x i32> %v) {
; ALL-LABEL: test_v16i32_1_3_5_7_9_11_13_15:
; ALL: # BB#0:
; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm1
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
; ALL-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; ALL-NEXT: retq
@ -314,7 +314,7 @@ define <8 x float> @shuffle_v16f32_extract_256(float* %RET, float* %a) {
define <8 x float> @test_v16f32_0_1_2_3_4_6_7_10 (<16 x float> %v) {
; ALL-LABEL: test_v16f32_0_1_2_3_4_6_7_10:
; ALL: # BB#0:
; ALL-NEXT: vextractf32x8 $1, %zmm0, %ymm1
; ALL-NEXT: vextractf64x4 $1, %zmm0, %ymm1
; ALL-NEXT: vmovsldup {{.*#+}} xmm1 = xmm1[0,0,2,2]
; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; ALL-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,7,u]
@ -684,7 +684,7 @@ define <16 x i32> @mask_shuffle_v4i32_v16i32_00_01_02_03_00_01_02_03_00_01_02_03
; ALL: # BB#0:
; ALL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; ALL-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
%res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <16 x i32> %res
@ -695,7 +695,7 @@ define <16 x float> @mask_shuffle_v4f32_v16f32_00_01_02_03_00_01_02_03_00_01_02_
; ALL: # BB#0:
; ALL-NEXT: # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
; ALL-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
; ALL-NEXT: vinsertf32x8 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: vinsertf64x4 $1, %ymm0, %zmm0, %zmm0
; ALL-NEXT: retq
%res = shufflevector <4 x float> %a, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
ret <16 x float> %res


@ -474,7 +474,7 @@ define <16 x float> @expand13(<8 x float> %a ) {
; SKX64-LABEL: expand13:
; SKX64: # BB#0:
; SKX64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX64-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0
; SKX64-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; SKX64-NEXT: retq
;
; KNL64-LABEL: expand13:
@ -486,7 +486,7 @@ define <16 x float> @expand13(<8 x float> %a ) {
; SKX32-LABEL: expand13:
; SKX32: # BB#0:
; SKX32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX32-NEXT: vinsertf32x8 $1, %ymm0, %zmm1, %zmm0
; SKX32-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0
; SKX32-NEXT: retl
;
; KNL32-LABEL: expand13:


@ -171,7 +171,7 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; VL_BW_DQ: # BB#0:
; VL_BW_DQ-NEXT: kmovd %edi, %k0
; VL_BW_DQ-NEXT: vpmovm2q %k0, %zmm0
; VL_BW_DQ-NEXT: vextracti64x2 $1, %zmm0, %xmm0
; VL_BW_DQ-NEXT: vextracti32x4 $1, %zmm0, %xmm0
; VL_BW_DQ-NEXT: vpbroadcastq %xmm0, %zmm0
; VL_BW_DQ-NEXT: vpmovq2m %zmm0, %k0
; VL_BW_DQ-NEXT: vpmovm2w %k0, %xmm0


@ -257,38 +257,16 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_v16i64_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpaddq %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_v16i64_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpaddq %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_v16i64_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpaddq %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512-LABEL: trunc_add_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpaddq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpaddq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@ -683,35 +661,15 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512-LABEL: trunc_add_const_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpaddb {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@ -1084,38 +1042,16 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_sub_v16i64_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpsubq %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpsubq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_v16i64_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpsubq %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpsubq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_v16i64_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpsubq %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT: vpsubq %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512-LABEL: trunc_sub_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpsubq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@ -1510,38 +1446,16 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1
; AVX512F-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1
; AVX512BW-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1
; AVX512DQ-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512-LABEL: trunc_sub_const_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm1, %zmm1
; AVX512-NEXT: vpsubq {{.*}}(%rip), %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@ -2186,7 +2100,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX512DQ-NEXT: vpmullq %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
@ -2732,38 +2646,16 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512-LABEL: trunc_mul_const_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vpmulld {{.*}}(%rip), %ymm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@ -3157,38 +3049,16 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_and_v16i64_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_v16i64_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_v16i64_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512-LABEL: trunc_and_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpandq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpandq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -3528,35 +3398,15 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_and_const_v16i64_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512-LABEL: trunc_and_const_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -3913,38 +3763,16 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_xor_v16i64_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpxorq %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_v16i64_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpxorq %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_v16i64_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpxorq %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512-LABEL: trunc_xor_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpxorq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vpxorq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -4284,35 +4112,15 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512-LABEL: trunc_xor_const_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpxor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -4669,38 +4477,16 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_v16i64_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vporq %zmm3, %zmm1, %zmm1
; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_v16i64_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vporq %zmm3, %zmm1, %zmm1
; AVX512BW-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_v16i64_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vporq %zmm3, %zmm1, %zmm1
; AVX512DQ-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512-LABEL: trunc_or_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vporq %zmm3, %zmm1, %zmm1
; AVX512-NEXT: vporq %zmm2, %zmm0, %zmm0
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <16 x i64> %a0, %a1
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2
@@ -5040,35 +4826,15 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512F: # BB#0:
; AVX512F-NEXT: vpmovqd %zmm0, %ymm0
; AVX512F-NEXT: vpmovqd %zmm1, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512F-NEXT: vpmovdb %zmm0, %xmm0
; AVX512F-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512F-NEXT: vzeroupper
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512BW: # BB#0:
; AVX512BW-NEXT: vpmovqd %zmm0, %ymm0
; AVX512BW-NEXT: vpmovqd %zmm1, %ymm1
; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512BW-NEXT: vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT: vzeroupper
; AVX512BW-NEXT: retq
;
; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512DQ: # BB#0:
; AVX512DQ-NEXT: vpmovqd %zmm0, %ymm0
; AVX512DQ-NEXT: vpmovqd %zmm1, %ymm1
; AVX512DQ-NEXT: vinserti32x8 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT: vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT: vzeroupper
; AVX512DQ-NEXT: retq
; AVX512-LABEL: trunc_or_const_v16i64_v16i8:
; AVX512: # BB#0:
; AVX512-NEXT: vpmovqd %zmm0, %ymm0
; AVX512-NEXT: vpmovqd %zmm1, %ymm1
; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512-NEXT: vpmovdb %zmm0, %xmm0
; AVX512-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
%1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
%2 = trunc <16 x i64> %1 to <16 x i8>
ret <16 x i8> %2