[X86] Rewrite the vXi1 subvector insertion code to not rely on the value of bits that might be undef

The previous code tried to do a trick where we would extract the subvector from the location we were inserting at, xor that with the new value, take the xored value and clear out the bits above the subvector size, shift that xored subvector to the insert location, and finally xor that with the original vector. Since the old subvector was used in both xors, this left just the new subvector at the insert location. Since the surrounding bits had been zeroed, no other bits of the original vector were modified.
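
The trick is easier to follow on a plain 16-bit mask. The following is only a scalar sketch of the idea, not the DAG code; the helper name and the fixed 16-lane width are made up for illustration, and it assumes 1 <= SubBits and Idx + SubBits <= 16, matching the real shift counts:

  #include <cstdint>

  // Old xor trick: mirrors the KSHIFTR / XOR / KSHIFTL / KSHIFTR / XOR chain.
  uint16_t insertBitsXorTrick(uint16_t Vec, uint16_t Sub, unsigned Idx,
                              unsigned SubBits) {
    uint16_t X = Vec >> Idx;   // old subvector in the lsbs (plus junk above)
    X ^= Sub;                  // xor with the new value
    X <<= 16 - SubBits;        // shift to the msbs, clearing the junk above
    X >>= 16 - SubBits - Idx;  // shift back down to the insert location
    return Vec ^ X;            // the old bits cancel, leaving Sub at Idx
  }

The cancellation in the final xor only works if both uses of the old subvector observe the same value.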

Unfortunately, if the old subvector came from undef, we might aggressively propagate the undef, and the two uses of the old subvector then see different values, so the xors no longer cancel. @bkramer gave me a case that demonstrated this, but we haven't reduced it enough to easily see what's happening.

This patch uses a safer but more costly approach. It isolates the bits above the insert point and the bits below the insert point and ORs those together, leaving 0s at the insert location. It then widens the subvector with 0s in the upper bits, shifts it into position with 0s in the lower bits, and ORs it with the other two pieces.
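
Sketched the same way, under the same made-up helper and 16-bit width assumptions (plus 0 < Idx and Idx + SubBits < 16 so every shift count stays in range, matching LowShift and HighShift in the patch):

  #include <cstdint>

  // New approach: build three disjoint pieces and OR them together.
  uint16_t insertBitsOrPieces(uint16_t Vec, uint16_t Sub, unsigned Idx,
                              unsigned SubBits) {
    uint16_t S = (uint16_t)(Sub << (16 - SubBits)); // clear bits above Sub...
    S >>= 16 - SubBits - Idx;                       // ...and move it into place
    uint16_t Low = (uint16_t)(Vec << (16 - Idx));   // isolate the bits below
    Low >>= 16 - Idx;                               // the insertion point
    uint16_t High = (uint16_t)(Vec >> (Idx + SubBits)); // isolate the bits
    High <<= Idx + SubBits;                             // above the subvector
    return (uint16_t)(S | Low | High);              // disjoint, so OR combines
  }

The old contents of the inserted range never feed into the result, so an undef subvector can no longer poison the surrounding bits; the price is the extra shift pair needed to build the Low piece.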

Differential Revision: https://reviews.llvm.org/D68311

llvm-svn: 373495
Craig Topper 2019-10-02 17:47:09 +00:00
parent 0cacf136fc
commit 74c7d6be28
8 changed files with 4311 additions and 3015 deletions


@@ -5769,23 +5769,35 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
   // Widen the vector if needed.
   Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
-  // Move the current value of the bit to be replace to the lsbs.
-  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
-                   DAG.getTargetConstant(IdxVal, dl, MVT::i8));
-  // Xor with the new bit.
-  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
-  // Shift to MSB, filling bottom bits with 0.
+  // Clear the upper bits of the subvector and move it to its insert position.
   unsigned ShiftLeft = NumElems - SubVecNumElems;
-  Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
-                   DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
-  // Shift to the final position, filling upper bits with 0.
+  SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+                       DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
   unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
-  Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
-                   DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
-  // Xor with original vector leaving the new value.
-  Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
+  SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+                       DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+  // Isolate the bits below the insertion point.
+  unsigned LowShift = NumElems - IdxVal;
+  SDValue Low = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec,
+                            DAG.getTargetConstant(LowShift, dl, MVT::i8));
+  Low = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Low,
+                    DAG.getTargetConstant(LowShift, dl, MVT::i8));
+  // Isolate the bits after the last inserted bit.
+  unsigned HighShift = IdxVal + SubVecNumElems;
+  SDValue High = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
+                             DAG.getTargetConstant(HighShift, dl, MVT::i8));
+  High = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, High,
+                     DAG.getTargetConstant(HighShift, dl, MVT::i8));
+  // Now OR all 3 pieces together.
+  Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Low, High);
+  SubVec = DAG.getNode(ISD::OR, dl, WideOpVT, SubVec, Vec);
   // Reduce to original width if needed.
-  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
 }
 static SDValue concatSubVectors(SDValue V1, SDValue V2, SelectionDAG &DAG,

File diff suppressed because it is too large

File diff suppressed because it is too large


@@ -302,12 +302,15 @@ define i16 @test16(i1 *%addr, i16 %a) {
 ; KNL: ## %bb.0:
 ; KNL-NEXT: movb (%rdi), %al
 ; KNL-NEXT: kmovw %esi, %k0
-; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: kshiftrw $10, %k0, %k2
-; KNL-NEXT: kxorw %k1, %k2, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $5, %k1, %k1
-; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: kshiftrw $11, %k0, %k1
+; KNL-NEXT: kshiftlw $11, %k1, %k1
+; KNL-NEXT: kshiftlw $6, %k0, %k0
+; KNL-NEXT: kshiftrw $6, %k0, %k0
+; KNL-NEXT: kmovw %eax, %k2
+; KNL-NEXT: kshiftlw $15, %k2, %k2
+; KNL-NEXT: kshiftrw $5, %k2, %k2
+; KNL-NEXT: korw %k2, %k1, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: ## kill: def $ax killed $ax killed $eax
 ; KNL-NEXT: retq
@@ -316,11 +319,14 @@ define i16 @test16(i1 *%addr, i16 %a) {
 ; SKX: ## %bb.0:
 ; SKX-NEXT: kmovb (%rdi), %k0
 ; SKX-NEXT: kmovd %esi, %k1
-; SKX-NEXT: kshiftrw $10, %k1, %k2
-; SKX-NEXT: kxorw %k0, %k2, %k0
+; SKX-NEXT: kshiftrw $11, %k1, %k2
+; SKX-NEXT: kshiftlw $11, %k2, %k2
+; SKX-NEXT: kshiftlw $6, %k1, %k1
+; SKX-NEXT: kshiftrw $6, %k1, %k1
 ; SKX-NEXT: kshiftlw $15, %k0, %k0
 ; SKX-NEXT: kshiftrw $5, %k0, %k0
-; SKX-NEXT: kxorw %k0, %k1, %k0
+; SKX-NEXT: korw %k0, %k2, %k0
+; SKX-NEXT: korw %k0, %k1, %k0
 ; SKX-NEXT: kmovd %k0, %eax
 ; SKX-NEXT: ## kill: def $ax killed $ax killed $eax
 ; SKX-NEXT: retq
@@ -336,12 +342,15 @@ define i8 @test17(i1 *%addr, i8 %a) {
 ; KNL: ## %bb.0:
 ; KNL-NEXT: movb (%rdi), %al
 ; KNL-NEXT: kmovw %esi, %k0
+; KNL-NEXT: kshiftrw $5, %k0, %k1
+; KNL-NEXT: kshiftlw $5, %k1, %k1
+; KNL-NEXT: kshiftlw $12, %k0, %k0
+; KNL-NEXT: kshiftrw $12, %k0, %k0
+; KNL-NEXT: korw %k1, %k0, %k0
 ; KNL-NEXT: kmovw %eax, %k1
-; KNL-NEXT: kshiftrw $4, %k0, %k2
-; KNL-NEXT: kxorw %k1, %k2, %k1
 ; KNL-NEXT: kshiftlw $15, %k1, %k1
 ; KNL-NEXT: kshiftrw $11, %k1, %k1
-; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: korw %k0, %k1, %k0
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: ## kill: def $al killed $al killed $eax
 ; KNL-NEXT: retq
@@ -350,11 +359,14 @@ define i8 @test17(i1 *%addr, i8 %a) {
 ; SKX: ## %bb.0:
 ; SKX-NEXT: kmovb (%rdi), %k0
 ; SKX-NEXT: kmovd %esi, %k1
-; SKX-NEXT: kshiftrb $4, %k1, %k2
-; SKX-NEXT: kxorb %k0, %k2, %k0
+; SKX-NEXT: kshiftrb $5, %k1, %k2
+; SKX-NEXT: kshiftlb $5, %k2, %k2
+; SKX-NEXT: kshiftlb $4, %k1, %k1
+; SKX-NEXT: kshiftrb $4, %k1, %k1
 ; SKX-NEXT: kshiftlb $7, %k0, %k0
 ; SKX-NEXT: kshiftrb $3, %k0, %k0
-; SKX-NEXT: kxorb %k0, %k1, %k0
+; SKX-NEXT: korb %k0, %k2, %k0
+; SKX-NEXT: korb %k0, %k1, %k0
 ; SKX-NEXT: kmovd %k0, %eax
 ; SKX-NEXT: ## kill: def $al killed $al killed $eax
 ; SKX-NEXT: retq
@@ -790,12 +802,15 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; KNL-NEXT: kmovw %k0, %ecx
 ; KNL-NEXT: shll $16, %ecx
 ; KNL-NEXT: vpcmpltud %zmm2, %zmm0, %k0
-; KNL-NEXT: kshiftrw $4, %k0, %k1
+; KNL-NEXT: kshiftrw $5, %k0, %k1
+; KNL-NEXT: kshiftlw $5, %k1, %k1
+; KNL-NEXT: kshiftlw $12, %k0, %k0
+; KNL-NEXT: kshiftrw $12, %k0, %k0
 ; KNL-NEXT: kmovw %eax, %k2
-; KNL-NEXT: kxorw %k2, %k1, %k1
-; KNL-NEXT: kshiftlw $15, %k1, %k1
-; KNL-NEXT: kshiftrw $11, %k1, %k1
-; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: kshiftlw $15, %k2, %k2
+; KNL-NEXT: kshiftrw $11, %k2, %k2
+; KNL-NEXT: korw %k2, %k1, %k1
+; KNL-NEXT: korw %k1, %k0, %k0
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: orl %ecx, %eax
 ; KNL-NEXT: vzeroupper
@@ -808,12 +823,15 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; SKX-NEXT: vpcmpltud %zmm2, %zmm0, %k0
 ; SKX-NEXT: vpcmpltud %zmm3, %zmm1, %k1
 ; SKX-NEXT: kunpckwd %k0, %k1, %k0
-; SKX-NEXT: kshiftrd $4, %k0, %k1
+; SKX-NEXT: kshiftrd $5, %k0, %k1
+; SKX-NEXT: kshiftld $5, %k1, %k1
+; SKX-NEXT: kshiftld $28, %k0, %k0
+; SKX-NEXT: kshiftrd $28, %k0, %k0
 ; SKX-NEXT: kmovd %eax, %k2
-; SKX-NEXT: kxord %k2, %k1, %k1
-; SKX-NEXT: kshiftld $31, %k1, %k1
-; SKX-NEXT: kshiftrd $27, %k1, %k1
-; SKX-NEXT: kxord %k1, %k0, %k0
+; SKX-NEXT: kshiftld $31, %k2, %k2
+; SKX-NEXT: kshiftrd $27, %k2, %k2
+; SKX-NEXT: kord %k2, %k1, %k1
+; SKX-NEXT: kord %k1, %k0, %k0
 ; SKX-NEXT: kmovd %k0, %eax
 ; SKX-NEXT: vzeroupper
 ; SKX-NEXT: retq
@@ -832,12 +850,15 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; KNL-NEXT: cmpl %esi, %edi
 ; KNL-NEXT: setb %al
 ; KNL-NEXT: vpcmpltud %zmm1, %zmm0, %k0
-; KNL-NEXT: kshiftrw $2, %k0, %k1
-; KNL-NEXT: kmovw %eax, %k2
-; KNL-NEXT: kxorw %k2, %k1, %k1
+; KNL-NEXT: kshiftrw $3, %k0, %k1
+; KNL-NEXT: kshiftlw $3, %k1, %k1
+; KNL-NEXT: kshiftlw $14, %k0, %k0
+; KNL-NEXT: kshiftrw $14, %k0, %k0
+; KNL-NEXT: korw %k1, %k0, %k0
+; KNL-NEXT: kmovw %eax, %k1
 ; KNL-NEXT: kshiftlw $15, %k1, %k1
 ; KNL-NEXT: kshiftrw $13, %k1, %k1
-; KNL-NEXT: kxorw %k1, %k0, %k0
+; KNL-NEXT: korw %k0, %k1, %k0
 ; KNL-NEXT: kmovw %k0, %eax
 ; KNL-NEXT: ## kill: def $al killed $al killed $eax
 ; KNL-NEXT: vzeroupper
@@ -848,12 +869,15 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; SKX-NEXT: cmpl %esi, %edi
 ; SKX-NEXT: setb %al
 ; SKX-NEXT: vpcmpltud %xmm1, %xmm0, %k0
-; SKX-NEXT: kshiftrb $2, %k0, %k1
-; SKX-NEXT: kmovd %eax, %k2
-; SKX-NEXT: kxorb %k2, %k1, %k1
+; SKX-NEXT: kshiftrb $3, %k0, %k1
+; SKX-NEXT: kshiftlb $3, %k1, %k1
+; SKX-NEXT: kshiftlb $6, %k0, %k0
+; SKX-NEXT: kshiftrb $6, %k0, %k0
+; SKX-NEXT: korw %k1, %k0, %k0
+; SKX-NEXT: kmovd %eax, %k1
 ; SKX-NEXT: kshiftlb $7, %k1, %k1
 ; SKX-NEXT: kshiftrb $5, %k1, %k1
-; SKX-NEXT: kxorw %k1, %k0, %k0
+; SKX-NEXT: korw %k0, %k1, %k0
 ; SKX-NEXT: kmovd %k0, %eax
 ; SKX-NEXT: ## kill: def $al killed $al killed $eax
 ; SKX-NEXT: retq

File diff suppressed because it is too large


@@ -4913,24 +4913,30 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
 ; AVX512F-LABEL: widen_masked_store:
 ; AVX512F: ## %bb.0:
 ; AVX512F-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0
-; AVX512F-NEXT: kmovw %edx, %k0
 ; AVX512F-NEXT: andl $1, %esi
-; AVX512F-NEXT: kmovw %esi, %k1
-; AVX512F-NEXT: kxorw %k0, %k0, %k2
-; AVX512F-NEXT: kshiftrw $1, %k2, %k2
-; AVX512F-NEXT: kshiftlw $1, %k2, %k2
-; AVX512F-NEXT: korw %k1, %k2, %k1
-; AVX512F-NEXT: kshiftrw $1, %k1, %k2
-; AVX512F-NEXT: kxorw %k0, %k2, %k0
-; AVX512F-NEXT: kshiftlw $15, %k0, %k0
-; AVX512F-NEXT: kshiftrw $14, %k0, %k0
-; AVX512F-NEXT: kxorw %k0, %k1, %k0
+; AVX512F-NEXT: kmovw %esi, %k0
+; AVX512F-NEXT: kxorw %k0, %k0, %k1
+; AVX512F-NEXT: kshiftrw $1, %k1, %k1
+; AVX512F-NEXT: kshiftlw $1, %k1, %k1
+; AVX512F-NEXT: korw %k0, %k1, %k0
 ; AVX512F-NEXT: kshiftrw $2, %k0, %k1
-; AVX512F-NEXT: kmovw %ecx, %k2
-; AVX512F-NEXT: kxorw %k2, %k1, %k1
+; AVX512F-NEXT: kshiftlw $2, %k1, %k1
+; AVX512F-NEXT: kshiftlw $15, %k0, %k0
+; AVX512F-NEXT: kshiftrw $15, %k0, %k0
+; AVX512F-NEXT: kmovw %edx, %k2
+; AVX512F-NEXT: kshiftlw $15, %k2, %k2
+; AVX512F-NEXT: kshiftrw $14, %k2, %k2
+; AVX512F-NEXT: korw %k2, %k1, %k1
+; AVX512F-NEXT: korw %k1, %k0, %k0
+; AVX512F-NEXT: kshiftrw $3, %k0, %k1
+; AVX512F-NEXT: kshiftlw $3, %k1, %k1
+; AVX512F-NEXT: kshiftlw $14, %k0, %k0
+; AVX512F-NEXT: kshiftrw $14, %k0, %k0
+; AVX512F-NEXT: korw %k1, %k0, %k0
+; AVX512F-NEXT: kmovw %ecx, %k1
 ; AVX512F-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512F-NEXT: kshiftrw $13, %k1, %k1
-; AVX512F-NEXT: kxorw %k1, %k0, %k0
+; AVX512F-NEXT: korw %k0, %k1, %k0
 ; AVX512F-NEXT: kshiftlw $12, %k0, %k0
 ; AVX512F-NEXT: kshiftrw $12, %k0, %k1
 ; AVX512F-NEXT: vmovdqu32 %zmm0, (%rdi) {%k1}
@@ -4939,48 +4945,60 @@ define void @widen_masked_store(<3 x i32> %v, <3 x i32>* %p, <3 x i1> %mask) {
 ;
 ; AVX512VLDQ-LABEL: widen_masked_store:
 ; AVX512VLDQ: ## %bb.0:
-; AVX512VLDQ-NEXT: kmovw %edx, %k0
-; AVX512VLDQ-NEXT: kmovw %esi, %k1
-; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1
-; AVX512VLDQ-NEXT: kshiftrb $7, %k1, %k1
-; AVX512VLDQ-NEXT: kxorw %k0, %k0, %k2
-; AVX512VLDQ-NEXT: kshiftrb $1, %k2, %k2
-; AVX512VLDQ-NEXT: kshiftlb $1, %k2, %k2
-; AVX512VLDQ-NEXT: korb %k1, %k2, %k1
-; AVX512VLDQ-NEXT: kshiftrb $1, %k1, %k2
-; AVX512VLDQ-NEXT: kxorb %k0, %k2, %k0
+; AVX512VLDQ-NEXT: kmovw %esi, %k0
 ; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0
-; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0
-; AVX512VLDQ-NEXT: kxorb %k0, %k1, %k0
+; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0
+; AVX512VLDQ-NEXT: kxorw %k0, %k0, %k1
+; AVX512VLDQ-NEXT: kshiftrb $1, %k1, %k1
+; AVX512VLDQ-NEXT: kshiftlb $1, %k1, %k1
+; AVX512VLDQ-NEXT: korb %k0, %k1, %k0
 ; AVX512VLDQ-NEXT: kshiftrb $2, %k0, %k1
-; AVX512VLDQ-NEXT: kmovw %ecx, %k2
-; AVX512VLDQ-NEXT: kxorb %k2, %k1, %k1
+; AVX512VLDQ-NEXT: kshiftlb $2, %k1, %k1
+; AVX512VLDQ-NEXT: kshiftlb $7, %k0, %k0
+; AVX512VLDQ-NEXT: kshiftrb $7, %k0, %k0
+; AVX512VLDQ-NEXT: kmovw %edx, %k2
+; AVX512VLDQ-NEXT: kshiftlb $7, %k2, %k2
+; AVX512VLDQ-NEXT: kshiftrb $6, %k2, %k2
+; AVX512VLDQ-NEXT: korb %k2, %k1, %k1
+; AVX512VLDQ-NEXT: korb %k1, %k0, %k0
+; AVX512VLDQ-NEXT: kshiftrb $3, %k0, %k1
+; AVX512VLDQ-NEXT: kshiftlb $3, %k1, %k1
+; AVX512VLDQ-NEXT: kshiftlb $6, %k0, %k0
+; AVX512VLDQ-NEXT: kshiftrb $6, %k0, %k0
+; AVX512VLDQ-NEXT: korw %k1, %k0, %k0
+; AVX512VLDQ-NEXT: kmovw %ecx, %k1
 ; AVX512VLDQ-NEXT: kshiftlb $7, %k1, %k1
 ; AVX512VLDQ-NEXT: kshiftrb $5, %k1, %k1
-; AVX512VLDQ-NEXT: kxorw %k1, %k0, %k1
+; AVX512VLDQ-NEXT: korw %k0, %k1, %k1
 ; AVX512VLDQ-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1}
 ; AVX512VLDQ-NEXT: retq
 ;
 ; AVX512VLBW-LABEL: widen_masked_store:
 ; AVX512VLBW: ## %bb.0:
-; AVX512VLBW-NEXT: kmovd %edx, %k0
 ; AVX512VLBW-NEXT: andl $1, %esi
-; AVX512VLBW-NEXT: kmovw %esi, %k1
-; AVX512VLBW-NEXT: kxorw %k0, %k0, %k2
-; AVX512VLBW-NEXT: kshiftrw $1, %k2, %k2
-; AVX512VLBW-NEXT: kshiftlw $1, %k2, %k2
-; AVX512VLBW-NEXT: korw %k1, %k2, %k1
-; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k2
-; AVX512VLBW-NEXT: kxorw %k0, %k2, %k0
-; AVX512VLBW-NEXT: kshiftlw $15, %k0, %k0
-; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k0
-; AVX512VLBW-NEXT: kxorw %k0, %k1, %k0
+; AVX512VLBW-NEXT: kmovw %esi, %k0
+; AVX512VLBW-NEXT: kxorw %k0, %k0, %k1
+; AVX512VLBW-NEXT: kshiftrw $1, %k1, %k1
+; AVX512VLBW-NEXT: kshiftlw $1, %k1, %k1
+; AVX512VLBW-NEXT: korw %k0, %k1, %k0
 ; AVX512VLBW-NEXT: kshiftrw $2, %k0, %k1
-; AVX512VLBW-NEXT: kmovd %ecx, %k2
-; AVX512VLBW-NEXT: kxorw %k2, %k1, %k1
+; AVX512VLBW-NEXT: kshiftlw $2, %k1, %k1
+; AVX512VLBW-NEXT: kshiftlw $15, %k0, %k0
+; AVX512VLBW-NEXT: kshiftrw $15, %k0, %k0
+; AVX512VLBW-NEXT: kmovd %edx, %k2
+; AVX512VLBW-NEXT: kshiftlw $15, %k2, %k2
+; AVX512VLBW-NEXT: kshiftrw $14, %k2, %k2
+; AVX512VLBW-NEXT: korw %k2, %k1, %k1
+; AVX512VLBW-NEXT: korw %k1, %k0, %k0
+; AVX512VLBW-NEXT: kshiftrw $3, %k0, %k1
+; AVX512VLBW-NEXT: kshiftlw $3, %k1, %k1
+; AVX512VLBW-NEXT: kshiftlw $14, %k0, %k0
+; AVX512VLBW-NEXT: kshiftrw $14, %k0, %k0
+; AVX512VLBW-NEXT: korw %k1, %k0, %k0
+; AVX512VLBW-NEXT: kmovd %ecx, %k1
 ; AVX512VLBW-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512VLBW-NEXT: kshiftrw $13, %k1, %k1
-; AVX512VLBW-NEXT: kxorw %k1, %k0, %k1
+; AVX512VLBW-NEXT: korw %k0, %k1, %k1
 ; AVX512VLBW-NEXT: vmovdqa32 %xmm0, (%rdi) {%k1}
 ; AVX512VLBW-NEXT: retq
 call void @llvm.masked.store.v3i32.p0v3i32(<3 x i32> %v, <3 x i32>* %p, i32 16, <3 x i1> %mask)


@@ -1730,20 +1730,26 @@ define <2 x i32> @smulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
 ;
 ; AVX512-LABEL: smulo_v2i64:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovq %xmm1, %rax
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
-; AVX512-NEXT: vpextrq $1, %xmm0, %rsi
+; AVX512-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT: vmovq %xmm1, %rdx
+; AVX512-NEXT: vmovq %xmm0, %rsi
 ; AVX512-NEXT: imulq %rdx, %rsi
-; AVX512-NEXT: vmovq %rsi, %xmm0
+; AVX512-NEXT: seto %dl
 ; AVX512-NEXT: imulq %rax, %rcx
-; AVX512-NEXT: vmovq %rcx, %xmm1
+; AVX512-NEXT: vmovq %rcx, %xmm0
+; AVX512-NEXT: vmovq %rsi, %xmm1
 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512-NEXT: seto %al
 ; AVX512-NEXT: kmovd %eax, %k0
-; AVX512-NEXT: kshiftlw $15, %k0, %k1
-; AVX512-NEXT: kshiftrw $14, %k1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: kshiftlw $15, %k0, %k0
+; AVX512-NEXT: kshiftrw $14, %k0, %k0
+; AVX512-NEXT: kmovd %edx, %k1
+; AVX512-NEXT: kshiftlw $15, %k1, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kshiftlw $2, %k0, %k2
+; AVX512-NEXT: korw %k2, %k1, %k1
+; AVX512-NEXT: korw %k1, %k0, %k1
 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
@ -2197,46 +2203,76 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
;
; AVX512-LABEL: smulo_v4i1:
; AVX512: # %bb.0:
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
; AVX512-NEXT: pushq %rbx
; AVX512-NEXT: vpslld $31, %xmm1, %xmm1
; AVX512-NEXT: vptestmd %xmm1, %xmm1, %k0
; AVX512-NEXT: kshiftrw $3, %k0, %k1
; AVX512-NEXT: kmovd %k1, %r9d
; AVX512-NEXT: andb $1, %r9b
; AVX512-NEXT: negb %r9b
; AVX512-NEXT: vpslld $31, %xmm1, %xmm0
; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k1
; AVX512-NEXT: kshiftrw $3, %k1, %k2
; AVX512-NEXT: kmovd %k2, %r10d
; AVX512-NEXT: andb $1, %r10b
; AVX512-NEXT: negb %r10b
; AVX512-NEXT: kshiftrw $2, %k1, %k2
; AVX512-NEXT: kmovd %k1, %ecx
; AVX512-NEXT: andb $1, %cl
; AVX512-NEXT: negb %cl
; AVX512-NEXT: kshiftrw $2, %k0, %k1
; AVX512-NEXT: kmovd %k0, %esi
; AVX512-NEXT: kmovd %k2, %r11d
; AVX512-NEXT: andb $1, %r11b
; AVX512-NEXT: negb %r11b
; AVX512-NEXT: kshiftrw $2, %k0, %k2
; AVX512-NEXT: kmovd %k2, %ebx
; AVX512-NEXT: andb $1, %bl
; AVX512-NEXT: negb %bl
; AVX512-NEXT: kshiftrw $1, %k0, %k2
; AVX512-NEXT: kmovd %k2, %esi
; AVX512-NEXT: andb $1, %sil
; AVX512-NEXT: negb %sil
; AVX512-NEXT: kmovd %k1, %eax
; AVX512-NEXT: andb $1, %al
; AVX512-NEXT: negb %al
; AVX512-NEXT: kshiftrw $1, %k1, %k2
; AVX512-NEXT: kmovd %k2, %edx
; AVX512-NEXT: andb $1, %dl
; AVX512-NEXT: negb %dl
; AVX512-NEXT: kmovd %k1, %eax
; AVX512-NEXT: andb $1, %al
; AVX512-NEXT: negb %al
; AVX512-NEXT: kmovd %k0, %ecx
; AVX512-NEXT: andb $1, %cl
; AVX512-NEXT: negb %cl
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: imulb %dl
; AVX512-NEXT: imulb %cl
; AVX512-NEXT: movl %eax, %r8d
; AVX512-NEXT: seto %al
; AVX512-NEXT: movl %r8d, %edx
; AVX512-NEXT: andb $1, %dl
; AVX512-NEXT: negb %dl
; AVX512-NEXT: cmpb %r8b, %dl
; AVX512-NEXT: setne %dl
; AVX512-NEXT: orb %al, %dl
; AVX512-NEXT: movl %r8d, %ecx
; AVX512-NEXT: andb $1, %cl
; AVX512-NEXT: negb %cl
; AVX512-NEXT: cmpb %r8b, %cl
; AVX512-NEXT: setne %cl
; AVX512-NEXT: orb %al, %cl
; AVX512-NEXT: setne %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: movl %esi, %eax
; AVX512-NEXT: imulb %cl
; AVX512-NEXT: kmovd %eax, %k0
; AVX512-NEXT: kshiftlw $15, %k0, %k0
; AVX512-NEXT: kshiftrw $15, %k0, %k1
; AVX512-NEXT: kshiftlw $2, %k0, %k0
; AVX512-NEXT: movl %edx, %eax
; AVX512-NEXT: imulb %sil
; AVX512-NEXT: movl %eax, %edx
; AVX512-NEXT: seto %al
; AVX512-NEXT: movl %edx, %ecx
; AVX512-NEXT: andb $1, %cl
; AVX512-NEXT: negb %cl
; AVX512-NEXT: cmpb %dl, %cl
; AVX512-NEXT: setne %cl
; AVX512-NEXT: orb %al, %cl
; AVX512-NEXT: setne %al
; AVX512-NEXT: kmovd %eax, %k2
; AVX512-NEXT: kshiftlw $1, %k2, %k2
; AVX512-NEXT: korw %k2, %k0, %k2
; AVX512-NEXT: korw %k2, %k1, %k1
; AVX512-NEXT: kshiftlw $14, %k1, %k1
; AVX512-NEXT: kshiftrw $14, %k1, %k1
; AVX512-NEXT: kshiftlw $3, %k0, %k2
; AVX512-NEXT: movl %r11d, %eax
; AVX512-NEXT: imulb %bl
; AVX512-NEXT: movl %eax, %esi
; AVX512-NEXT: seto %al
; AVX512-NEXT: movl %esi, %ecx
@@ -2246,26 +2282,22 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT: setne %cl
 ; AVX512-NEXT: orb %al, %cl
 ; AVX512-NEXT: setne %al
-; AVX512-NEXT: kmovd %eax, %k2
-; AVX512-NEXT: kshiftlw $15, %k0, %k0
-; AVX512-NEXT: kshiftrw $14, %k0, %k0
-; AVX512-NEXT: kxorw %k0, %k2, %k2
-; AVX512-NEXT: kshiftrw $2, %k2, %k3
-; AVX512-NEXT: kxorw %k1, %k3, %k1
-; AVX512-NEXT: kshiftlw $2, %k1, %k1
-; AVX512-NEXT: kxorw %k1, %k2, %k1
+; AVX512-NEXT: kmovd %eax, %k3
+; AVX512-NEXT: kshiftlw $2, %k3, %k3
+; AVX512-NEXT: korw %k3, %k2, %k2
+; AVX512-NEXT: korw %k2, %k1, %k1
 ; AVX512-NEXT: kshiftlw $13, %k1, %k1
 ; AVX512-NEXT: kshiftrw $13, %k1, %k1
-; AVX512-NEXT: movl %r9d, %eax
-; AVX512-NEXT: imulb %r10b
+; AVX512-NEXT: movl %r10d, %eax
+; AVX512-NEXT: imulb %r9b
 ; AVX512-NEXT: # kill: def $al killed $al def $eax
 ; AVX512-NEXT: seto %cl
-; AVX512-NEXT: movl %eax, %edx
-; AVX512-NEXT: andb $1, %dl
-; AVX512-NEXT: negb %dl
-; AVX512-NEXT: cmpb %al, %dl
-; AVX512-NEXT: setne %dl
-; AVX512-NEXT: orb %cl, %dl
+; AVX512-NEXT: movl %eax, %ebx
+; AVX512-NEXT: andb $1, %bl
+; AVX512-NEXT: negb %bl
+; AVX512-NEXT: cmpb %al, %bl
+; AVX512-NEXT: setne %bl
+; AVX512-NEXT: orb %cl, %bl
 ; AVX512-NEXT: setne %cl
 ; AVX512-NEXT: kmovd %ecx, %k2
 ; AVX512-NEXT: kshiftlw $3, %k2, %k2
@@ -2273,21 +2305,34 @@ define <4 x i32> @smulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT: kmovd %r8d, %k1
-; AVX512-NEXT: kmovd %esi, %k2
-; AVX512-NEXT: kxorw %k0, %k2, %k0
-; AVX512-NEXT: kshiftrw $2, %k0, %k2
-; AVX512-NEXT: kxorw %k1, %k2, %k1
 ; AVX512-NEXT: kshiftlw $15, %k1, %k1
-; AVX512-NEXT: kshiftrw $13, %k1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %edx, %k2
+; AVX512-NEXT: kshiftlw $15, %k2, %k2
+; AVX512-NEXT: kshiftrw $14, %k2, %k2
+; AVX512-NEXT: korw %k2, %k0, %k0
+; AVX512-NEXT: korw %k0, %k1, %k0
 ; AVX512-NEXT: kshiftrw $3, %k0, %k1
-; AVX512-NEXT: kmovd %eax, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kshiftlw $3, %k1, %k1
+; AVX512-NEXT: kshiftlw $14, %k0, %k0
+; AVX512-NEXT: kshiftrw $14, %k0, %k0
+; AVX512-NEXT: kmovd %esi, %k2
+; AVX512-NEXT: kshiftlw $15, %k2, %k2
+; AVX512-NEXT: kshiftrw $13, %k2, %k2
+; AVX512-NEXT: korw %k2, %k1, %k1
+; AVX512-NEXT: korw %k1, %k0, %k0
+; AVX512-NEXT: kshiftrw $4, %k0, %k1
+; AVX512-NEXT: kshiftlw $4, %k1, %k1
+; AVX512-NEXT: kshiftlw $13, %k0, %k0
+; AVX512-NEXT: kshiftrw $13, %k0, %k0
+; AVX512-NEXT: korw %k1, %k0, %k0
+; AVX512-NEXT: kmovd %eax, %k1
 ; AVX512-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512-NEXT: kshiftrw $12, %k1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
+; AVX512-NEXT: korw %k0, %k1, %k0
 ; AVX512-NEXT: kmovd %k0, %eax
 ; AVX512-NEXT: movb %al, (%rdi)
+; AVX512-NEXT: popq %rbx
 ; AVX512-NEXT: retq
 %t = call {<4 x i1>, <4 x i1>} @llvm.smul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
 %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0


@@ -1532,21 +1532,28 @@ define <2 x i32> @umulo_v2i64(<2 x i64> %a0, <2 x i64> %a1, <2 x i64>* %p2) noun
 ;
 ; AVX512-LABEL: umulo_v2i64:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovq %xmm0, %rcx
-; AVX512-NEXT: vmovq %xmm1, %rsi
-; AVX512-NEXT: vpextrq $1, %xmm0, %rax
-; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX512-NEXT: vpextrq $1, %xmm1, %r8
+; AVX512-NEXT: vmovq %xmm0, %rax
+; AVX512-NEXT: vmovq %xmm1, %rdx
 ; AVX512-NEXT: mulq %rdx
-; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: movq %rax, %rsi
+; AVX512-NEXT: seto %r9b
 ; AVX512-NEXT: movq %rcx, %rax
-; AVX512-NEXT: mulq %rsi
-; AVX512-NEXT: vmovq %rax, %xmm1
+; AVX512-NEXT: mulq %r8
+; AVX512-NEXT: vmovq %rax, %xmm0
+; AVX512-NEXT: vmovq %rsi, %xmm1
 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
 ; AVX512-NEXT: seto %al
 ; AVX512-NEXT: kmovd %eax, %k0
-; AVX512-NEXT: kshiftlw $15, %k0, %k1
-; AVX512-NEXT: kshiftrw $14, %k1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k1
+; AVX512-NEXT: kshiftlw $15, %k0, %k0
+; AVX512-NEXT: kshiftrw $14, %k0, %k0
+; AVX512-NEXT: kmovd %r9d, %k1
+; AVX512-NEXT: kshiftlw $15, %k1, %k1
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kshiftlw $2, %k0, %k2
+; AVX512-NEXT: korw %k2, %k1, %k1
+; AVX512-NEXT: korw %k1, %k0, %k1
 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT: vmovdqa %xmm1, (%rdi)
@@ -1945,6 +1952,7 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ;
 ; AVX512-LABEL: umulo_v4i1:
 ; AVX512: # %bb.0:
+; AVX512-NEXT: pushq %rbx
 ; AVX512-NEXT: vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT: vptestmd %xmm0, %xmm0, %k0
 ; AVX512-NEXT: kshiftrw $3, %k0, %k1
@ -1956,40 +1964,60 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
; AVX512-NEXT: kmovd %k2, %r10d
; AVX512-NEXT: andb $1, %r10b
; AVX512-NEXT: kshiftrw $2, %k0, %k2
; AVX512-NEXT: kmovd %k0, %esi
; AVX512-NEXT: kmovd %k2, %r11d
; AVX512-NEXT: andb $1, %r11b
; AVX512-NEXT: kshiftrw $2, %k1, %k2
; AVX512-NEXT: kmovd %k2, %ebx
; AVX512-NEXT: andb $1, %bl
; AVX512-NEXT: kshiftrw $1, %k0, %k2
; AVX512-NEXT: kmovd %k2, %edx
; AVX512-NEXT: andb $1, %dl
; AVX512-NEXT: kshiftrw $1, %k1, %k2
; AVX512-NEXT: kmovd %k2, %esi
; AVX512-NEXT: andb $1, %sil
; AVX512-NEXT: kshiftrw $2, %k1, %k0
; AVX512-NEXT: kmovd %k0, %eax
; AVX512-NEXT: andb $1, %al
; AVX512-NEXT: kmovd %k1, %ecx
; AVX512-NEXT: andb $1, %cl
; AVX512-NEXT: kmovd %k2, %eax
; AVX512-NEXT: andb $1, %al
; AVX512-NEXT: kmovd %k0, %edx
; AVX512-NEXT: andb $1, %dl
; AVX512-NEXT: # kill: def $al killed $al killed $eax
; AVX512-NEXT: mulb %dl
; AVX512-NEXT: mulb %cl
; AVX512-NEXT: movl %eax, %r8d
; AVX512-NEXT: seto %al
; AVX512-NEXT: testb $-2, %r8b
; AVX512-NEXT: setne %dl
; AVX512-NEXT: orb %al, %dl
; AVX512-NEXT: setne %cl
; AVX512-NEXT: orb %al, %cl
; AVX512-NEXT: setne %al
; AVX512-NEXT: kmovd %eax, %k1
; AVX512-NEXT: movl %esi, %eax
; AVX512-NEXT: mulb %cl
; AVX512-NEXT: kmovd %eax, %k0
; AVX512-NEXT: kshiftlw $15, %k0, %k0
; AVX512-NEXT: kshiftrw $15, %k0, %k1
; AVX512-NEXT: kshiftlw $2, %k0, %k0
; AVX512-NEXT: movl %edx, %eax
; AVX512-NEXT: mulb %sil
; AVX512-NEXT: movl %eax, %edx
; AVX512-NEXT: seto %al
; AVX512-NEXT: testb $-2, %dl
; AVX512-NEXT: setne %cl
; AVX512-NEXT: orb %al, %cl
; AVX512-NEXT: setne %al
; AVX512-NEXT: kmovd %eax, %k2
; AVX512-NEXT: kshiftlw $1, %k2, %k2
; AVX512-NEXT: korw %k2, %k0, %k2
; AVX512-NEXT: korw %k2, %k1, %k1
; AVX512-NEXT: kshiftlw $14, %k1, %k1
; AVX512-NEXT: kshiftrw $14, %k1, %k1
; AVX512-NEXT: kshiftlw $3, %k0, %k2
; AVX512-NEXT: movl %r11d, %eax
; AVX512-NEXT: mulb %bl
; AVX512-NEXT: movl %eax, %esi
; AVX512-NEXT: seto %al
; AVX512-NEXT: testb $-2, %sil
; AVX512-NEXT: setne %cl
; AVX512-NEXT: orb %al, %cl
; AVX512-NEXT: setne %al
; AVX512-NEXT: kmovd %eax, %k2
; AVX512-NEXT: kshiftlw $15, %k0, %k0
; AVX512-NEXT: kshiftrw $14, %k0, %k0
; AVX512-NEXT: kxorw %k0, %k2, %k2
; AVX512-NEXT: kshiftrw $2, %k2, %k3
; AVX512-NEXT: kxorw %k1, %k3, %k1
; AVX512-NEXT: kshiftlw $2, %k1, %k1
; AVX512-NEXT: kxorw %k1, %k2, %k1
; AVX512-NEXT: kmovd %eax, %k3
; AVX512-NEXT: kshiftlw $2, %k3, %k3
; AVX512-NEXT: korw %k3, %k2, %k2
; AVX512-NEXT: korw %k2, %k1, %k1
; AVX512-NEXT: kshiftlw $13, %k1, %k1
; AVX512-NEXT: kshiftrw $13, %k1, %k1
; AVX512-NEXT: movl %r9d, %eax
@@ -1997,8 +2025,8 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT: # kill: def $al killed $al def $eax
 ; AVX512-NEXT: seto %cl
 ; AVX512-NEXT: testb $-2, %al
-; AVX512-NEXT: setne %dl
-; AVX512-NEXT: orb %cl, %dl
+; AVX512-NEXT: setne %bl
+; AVX512-NEXT: orb %cl, %bl
 ; AVX512-NEXT: setne %cl
 ; AVX512-NEXT: kmovd %ecx, %k2
 ; AVX512-NEXT: kshiftlw $3, %k2, %k2
@@ -2006,21 +2034,34 @@ define <4 x i32> @umulo_v4i1(<4 x i1> %a0, <4 x i1> %a1, <4 x i1>* %p2) nounwind
 ; AVX512-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovdqa32 %xmm0, %xmm0 {%k1} {z}
 ; AVX512-NEXT: kmovd %r8d, %k1
-; AVX512-NEXT: kmovd %esi, %k2
-; AVX512-NEXT: kxorw %k0, %k2, %k0
-; AVX512-NEXT: kshiftrw $2, %k0, %k2
-; AVX512-NEXT: kxorw %k1, %k2, %k1
 ; AVX512-NEXT: kshiftlw $15, %k1, %k1
-; AVX512-NEXT: kshiftrw $13, %k1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
+; AVX512-NEXT: kshiftrw $15, %k1, %k1
+; AVX512-NEXT: kmovd %edx, %k2
+; AVX512-NEXT: kshiftlw $15, %k2, %k2
+; AVX512-NEXT: kshiftrw $14, %k2, %k2
+; AVX512-NEXT: korw %k2, %k0, %k0
+; AVX512-NEXT: korw %k0, %k1, %k0
 ; AVX512-NEXT: kshiftrw $3, %k0, %k1
-; AVX512-NEXT: kmovd %eax, %k2
-; AVX512-NEXT: kxorw %k2, %k1, %k1
+; AVX512-NEXT: kshiftlw $3, %k1, %k1
+; AVX512-NEXT: kshiftlw $14, %k0, %k0
+; AVX512-NEXT: kshiftrw $14, %k0, %k0
+; AVX512-NEXT: kmovd %esi, %k2
+; AVX512-NEXT: kshiftlw $15, %k2, %k2
+; AVX512-NEXT: kshiftrw $13, %k2, %k2
+; AVX512-NEXT: korw %k2, %k1, %k1
+; AVX512-NEXT: korw %k1, %k0, %k0
+; AVX512-NEXT: kshiftrw $4, %k0, %k1
+; AVX512-NEXT: kshiftlw $4, %k1, %k1
+; AVX512-NEXT: kshiftlw $13, %k0, %k0
+; AVX512-NEXT: kshiftrw $13, %k0, %k0
+; AVX512-NEXT: korw %k1, %k0, %k0
+; AVX512-NEXT: kmovd %eax, %k1
 ; AVX512-NEXT: kshiftlw $15, %k1, %k1
 ; AVX512-NEXT: kshiftrw $12, %k1, %k1
-; AVX512-NEXT: kxorw %k1, %k0, %k0
+; AVX512-NEXT: korw %k0, %k1, %k0
 ; AVX512-NEXT: kmovd %k0, %eax
 ; AVX512-NEXT: movb %al, (%rdi)
+; AVX512-NEXT: popq %rbx
 ; AVX512-NEXT: retq
 %t = call {<4 x i1>, <4 x i1>} @llvm.umul.with.overflow.v4i1(<4 x i1> %a0, <4 x i1> %a1)
 %val = extractvalue {<4 x i1>, <4 x i1>} %t, 0