[x86][CGP] enable target hook to sink funnel shift intrinsic's splatted shift amount

SDAG suffers when it can't see that a funnel operand is a splat value
(due to single-basic-block visibility), so invert the normal loop
hoisting rules to move a splat op closer to its use.

This would be part 1 of an enhancement similar to D63233.

This is needed to re-fix PR37426:
https://bugs.llvm.org/show_bug.cgi?id=37426
...because we got better at canonicalizing IR to funnel shift intrinsics.

The existing CGP code for shift opcodes is likely overstepping what it was
intended to do, so that will be fixed in a follow-up.

Differential Revision: https://reviews.llvm.org/D79718
This commit is contained in:
Sanjay Patel 2020-05-12 17:08:24 -04:00
parent a9e8562651
commit f490ca76b0
5 changed files with 125 additions and 82 deletions

View File

@ -2336,9 +2336,10 @@ public:
}
/// Return true if it's significantly cheaper to shift a vector by a uniform
/// scalar than by an amount which will vary across each lane. On x86, for
/// example, there is a "psllw" instruction for the former case, but no simple
/// instruction for a general "a << b" operation on vectors.
/// scalar than by an amount which will vary across each lane. On x86 before
/// AVX2 for example, there is a "psllw" instruction for the former case, but
/// no simple instruction for a general "a << b" operation on vectors.
/// This should also apply to lowering for vector funnel shifts (rotates).
virtual bool isVectorShiftByScalarCheap(Type *Ty) const {
// Conservative default: the uniform-amount form is not reported as cheaper
// for any type (Ty is ignored here). A target opts in by overriding this hook.
return false;
}

View File

@ -30675,6 +30675,28 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
bool X86TargetLowering::shouldSinkOperands(Instruction *I,
                                           SmallVectorImpl<Use *> &Ops) const {
  // SDAG only sees one basic block at a time, so a splatted shift amount that
  // lives in another block looks like a generic variable vector shift. Sinking
  // the splat shuffle next to the (funnel) shift exposes the much cheaper
  // uniform-shift-amount form.
  // TODO: This should handle normal shift opcodes too.
  auto *Intrin = dyn_cast<IntrinsicInst>(I);
  if (!Intrin)
    return false;

  // Only the funnel shift intrinsics are handled for now.
  const Intrinsic::ID IntrinID = Intrin->getIntrinsicID();
  const bool IsFunnelShift =
      IntrinID == Intrinsic::fshl || IntrinID == Intrinsic::fshr;
  if (!IsFunnelShift)
    return false;

  // The shift amount operand for these intrinsics is operand 2; it must be
  // produced by a shuffle whose mask is a splat.
  auto *SplatShuf = dyn_cast<ShuffleVectorInst>(Intrin->getOperand(2));
  if (!SplatShuf)
    return false;
  if (getSplatIndex(SplatShuf->getShuffleMask()) < 0)
    return false;

  // Sinking only pays off when a shift by a uniform amount is cheap for the
  // result type.
  if (!isVectorShiftByScalarCheap(I->getType()))
    return false;

  // Report the use of the shift amount so the caller can sink its definition.
  Ops.push_back(&I->getOperandUse(2));
  return true;
}
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
return false;

View File

@ -1032,6 +1032,8 @@ namespace llvm {
int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
/// This is used to enable splatted operand transforms for vector shifts
/// and vector funnel shifts.
bool isVectorShiftByScalarCheap(Type *Ty) const override;
/// Add x86-specific opcodes to the default list.
@ -1060,6 +1062,9 @@ namespace llvm {
bool isZExtFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
/// Return true if operands of \p I should be sunk next to it, appending the
/// uses to sink to \p Ops. For x86 this reports the shift amount (operand 2)
/// of fshl/fshr when it is a splat shuffle and isVectorShiftByScalarCheap()
/// holds for the result type of \p I.
bool shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const override;
/// Return true if folding a vector load into ExtVal (a sign, zero, or any
/// extend node) is profitable.
bool isVectorLoadExtDesirable(SDValue) const override;

View File

@ -2143,27 +2143,21 @@ define void @sink_splatvar(i32* %p, i32 %shift_amt) {
; SSE2-LABEL: sink_splatvar:
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movd %esi, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE2-NEXT: pand {{.*}}(%rip), %xmm0
; SSE2-NEXT: pslld $23, %xmm0
; SSE2-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE2-NEXT: cvttps2dq %xmm0, %xmm0
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE2-NEXT: movd %xmm0, %ecx
; SSE2-NEXT: andl $31, %ecx
; SSE2-NEXT: movl $32, %edx
; SSE2-NEXT: subl %ecx, %edx
; SSE2-NEXT: movd %edx, %xmm0
; SSE2-NEXT: movd %ecx, %xmm1
; SSE2-NEXT: .p2align 4, 0x90
; SSE2-NEXT: .LBB8_1: # %loop
; SSE2-NEXT: # =>This Inner Loop Header: Depth=1
; SSE2-NEXT: movdqu 1024(%rdi,%rax), %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE2-NEXT: pmuludq %xmm0, %xmm2
; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; SSE2-NEXT: pmuludq %xmm1, %xmm3
; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE2-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; SSE2-NEXT: por %xmm4, %xmm2
; SSE2-NEXT: movdqa %xmm2, %xmm3
; SSE2-NEXT: psrld %xmm0, %xmm3
; SSE2-NEXT: pslld %xmm1, %xmm2
; SSE2-NEXT: por %xmm3, %xmm2
; SSE2-NEXT: movdqu %xmm2, 1024(%rdi,%rax)
; SSE2-NEXT: addq $16, %rax
; SSE2-NEXT: jne .LBB8_1
@ -2173,26 +2167,22 @@ define void @sink_splatvar(i32* %p, i32 %shift_amt) {
; SSE41-LABEL: sink_splatvar:
; SSE41: # %bb.0: # %entry
; SSE41-NEXT: movd %esi, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; SSE41-NEXT: movq $-1024, %rax # imm = 0xFC00
; SSE41-NEXT: pand {{.*}}(%rip), %xmm0
; SSE41-NEXT: pslld $23, %xmm0
; SSE41-NEXT: paddd {{.*}}(%rip), %xmm0
; SSE41-NEXT: cvttps2dq %xmm0, %xmm0
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0]
; SSE41-NEXT: pand {{.*}}(%rip), %xmm1
; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [32,32,32,32]
; SSE41-NEXT: psubd %xmm1, %xmm0
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; SSE41-NEXT: .p2align 4, 0x90
; SSE41-NEXT: .LBB8_1: # %loop
; SSE41-NEXT: # =>This Inner Loop Header: Depth=1
; SSE41-NEXT: movdqu 1024(%rdi,%rax), %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE41-NEXT: pmuludq %xmm1, %xmm3
; SSE41-NEXT: pmuludq %xmm0, %xmm2
; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; SSE41-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; SSE41-NEXT: pblendw {{.*#+}} xmm3 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; SSE41-NEXT: por %xmm4, %xmm3
; SSE41-NEXT: movdqu %xmm3, 1024(%rdi,%rax)
; SSE41-NEXT: movdqa %xmm2, %xmm3
; SSE41-NEXT: psrld %xmm0, %xmm3
; SSE41-NEXT: pslld %xmm1, %xmm2
; SSE41-NEXT: por %xmm3, %xmm2
; SSE41-NEXT: movdqu %xmm2, 1024(%rdi,%rax)
; SSE41-NEXT: addq $16, %rax
; SSE41-NEXT: jne .LBB8_1
; SSE41-NEXT: # %bb.2: # %end
@ -2201,25 +2191,20 @@ define void @sink_splatvar(i32* %p, i32 %shift_amt) {
; AVX1-LABEL: sink_splatvar:
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: vmovd %esi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vpslld $23, %xmm0, %xmm0
; AVX1-NEXT: vpaddd {{.*}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vcvttps2dq %xmm0, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vpand {{.*}}(%rip), %xmm0, %xmm1
; AVX1-NEXT: vmovdqa {{.*#+}} xmm0 = [32,32,32,32]
; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT: .p2align 4, 0x90
; AVX1-NEXT: .LBB8_1: # %loop
; AVX1-NEXT: # =>This Inner Loop Header: Depth=1
; AVX1-NEXT: vmovdqu 1024(%rdi,%rax), %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm1, %xmm3, %xmm3
; AVX1-NEXT: vpmuludq %xmm0, %xmm2, %xmm2
; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm3[2,3],xmm4[4,5],xmm3[6,7]
; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
; AVX1-NEXT: vpor %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpsrld %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2
; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vmovdqu %xmm2, 1024(%rdi,%rax)
; AVX1-NEXT: addq $16, %rax
; AVX1-NEXT: jne .LBB8_1
@ -2380,29 +2365,23 @@ define void @sink_splatvar(i32* %p, i32 %shift_amt) {
; X32-SSE-NEXT: .cfi_def_cfa_offset 8
; X32-SSE-NEXT: .cfi_offset %esi, -8
; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax
; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; X32-SSE-NEXT: xorl %ecx, %ecx
; X32-SSE-NEXT: pand {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: pslld $23, %xmm0
; X32-SSE-NEXT: paddd {{\.LCPI.*}}, %xmm0
; X32-SSE-NEXT: cvttps2dq %xmm0, %xmm0
; X32-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; X32-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X32-SSE-NEXT: movd %xmm0, %edx
; X32-SSE-NEXT: andl $31, %edx
; X32-SSE-NEXT: movl $32, %esi
; X32-SSE-NEXT: subl %edx, %esi
; X32-SSE-NEXT: movd %esi, %xmm0
; X32-SSE-NEXT: movd %edx, %xmm1
; X32-SSE-NEXT: xorl %edx, %edx
; X32-SSE-NEXT: .p2align 4, 0x90
; X32-SSE-NEXT: .LBB8_1: # %loop
; X32-SSE-NEXT: # =>This Inner Loop Header: Depth=1
; X32-SSE-NEXT: movdqu (%eax,%ecx,4), %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; X32-SSE-NEXT: pmuludq %xmm0, %xmm2
; X32-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm2[1,3,2,3]
; X32-SSE-NEXT: pmuludq %xmm1, %xmm3
; X32-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,3,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; X32-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; X32-SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X32-SSE-NEXT: por %xmm4, %xmm2
; X32-SSE-NEXT: movdqa %xmm2, %xmm3
; X32-SSE-NEXT: psrld %xmm0, %xmm3
; X32-SSE-NEXT: pslld %xmm1, %xmm2
; X32-SSE-NEXT: por %xmm3, %xmm2
; X32-SSE-NEXT: movdqu %xmm2, (%eax,%ecx,4)
; X32-SSE-NEXT: addl $4, %ecx
; X32-SSE-NEXT: adcl $0, %edx

View File

@ -180,23 +180,59 @@ if_false:
}
define void @funnel_splatvar(i32* nocapture %arr, i32 %rot) {
; CHECK-LABEL: @funnel_splatvar(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> undef, i32 [[ROT:%.*]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
; CHECK-NEXT: [[T1:%.*]] = bitcast i32* [[T0]] to <8 x i32>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[T1]], align 4
; CHECK-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])
; CHECK-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; CHECK-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK: for.cond.cleanup:
; CHECK-NEXT: ret void
; CHECK-SSE2-LABEL: @funnel_splatvar(
; CHECK-SSE2-NEXT: entry:
; CHECK-SSE2-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> undef, i32 [[ROT:%.*]], i32 0
; CHECK-SSE2-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-SSE2: vector.body:
; CHECK-SSE2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-SSE2-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
; CHECK-SSE2-NEXT: [[T1:%.*]] = bitcast i32* [[T0]] to <8 x i32>*
; CHECK-SSE2-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[T1]], align 4
; CHECK-SSE2-NEXT: [[TMP0:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
; CHECK-SSE2-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[TMP0]])
; CHECK-SSE2-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
; CHECK-SSE2-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-SSE2-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; CHECK-SSE2-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK-SSE2: for.cond.cleanup:
; CHECK-SSE2-NEXT: ret void
;
; CHECK-XOP-LABEL: @funnel_splatvar(
; CHECK-XOP-NEXT: entry:
; CHECK-XOP-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> undef, i32 [[ROT:%.*]], i32 0
; CHECK-XOP-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
; CHECK-XOP-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-XOP: vector.body:
; CHECK-XOP-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-XOP-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
; CHECK-XOP-NEXT: [[T1:%.*]] = bitcast i32* [[T0]] to <8 x i32>*
; CHECK-XOP-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[T1]], align 4
; CHECK-XOP-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])
; CHECK-XOP-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
; CHECK-XOP-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-XOP-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; CHECK-XOP-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK-XOP: for.cond.cleanup:
; CHECK-XOP-NEXT: ret void
;
; CHECK-AVX-LABEL: @funnel_splatvar(
; CHECK-AVX-NEXT: entry:
; CHECK-AVX-NEXT: [[BROADCAST_SPLATINSERT15:%.*]] = insertelement <8 x i32> undef, i32 [[ROT:%.*]], i32 0
; CHECK-AVX-NEXT: [[BROADCAST_SPLAT16:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT15]], <8 x i32> undef, <8 x i32> zeroinitializer
; CHECK-AVX-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-AVX: vector.body:
; CHECK-AVX-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-AVX-NEXT: [[T0:%.*]] = getelementptr inbounds i32, i32* [[ARR:%.*]], i64 [[INDEX]]
; CHECK-AVX-NEXT: [[T1:%.*]] = bitcast i32* [[T0]] to <8 x i32>*
; CHECK-AVX-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[T1]], align 4
; CHECK-AVX-NEXT: [[T2:%.*]] = call <8 x i32> @llvm.fshl.v8i32(<8 x i32> [[WIDE_LOAD]], <8 x i32> [[WIDE_LOAD]], <8 x i32> [[BROADCAST_SPLAT16]])
; CHECK-AVX-NEXT: store <8 x i32> [[T2]], <8 x i32>* [[T1]], align 4
; CHECK-AVX-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 8
; CHECK-AVX-NEXT: [[T3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 65536
; CHECK-AVX-NEXT: br i1 [[T3]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]]
; CHECK-AVX: for.cond.cleanup:
; CHECK-AVX-NEXT: ret void
;
entry:
%broadcast.splatinsert15 = insertelement <8 x i32> undef, i32 %rot, i32 0