[x86] improve CMOV codegen by pushing add into operands, part 3

In this episode, we are trying to avoid an x86 micro-arch quirk where complex
(3-operand) LEA potentially costs significantly more than simple LEA. So we
simultaneously push and pull the math around the CMOV to balance the operations.

I looked at the debug spew during instruction selection and decided against
trying a later DAGToDAG transform -- it seems very difficult to match if the
trailing memops are already selected, and managing the creation of extra
instructions at that level is always tricky.

Differential Revision: https://reviews.llvm.org/D106918
This commit is contained in:
Sanjay Patel 2021-07-28 09:07:45 -04:00
parent 960cb490dd
commit 4c41caa287
2 changed files with 44 additions and 21 deletions

View File

@ -49961,11 +49961,34 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG) {
if (!isSuitableCmov(Cmov)) if (!isSuitableCmov(Cmov))
return SDValue(); return SDValue();
// add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
EVT VT = N->getValueType(0); EVT VT = N->getValueType(0);
SDLoc DL(N); SDLoc DL(N);
SDValue FalseOp = Cmov.getOperand(0); SDValue FalseOp = Cmov.getOperand(0);
SDValue TrueOp = Cmov.getOperand(1); SDValue TrueOp = Cmov.getOperand(1);
// We will push the add through the select, but we can potentially do better
// if we know there is another add in the sequence and this is pointer math.
// In that case, we can absorb an add into the trailing memory op and avoid
// a 3-operand LEA which is likely slower than a 2-operand LEA.
// TODO: If target has "slow3OpsLEA", do this even without the trailing memop?
if (OtherOp.getOpcode() == ISD::ADD && OtherOp.hasOneUse() &&
!isa<ConstantSDNode>(OtherOp.getOperand(0)) &&
all_of(N->uses(), [&](SDNode *Use) {
auto *MemNode = dyn_cast<MemSDNode>(Use);
return MemNode && MemNode->getBasePtr().getNode() == N;
})) {
// add (cmov C1, C2), add (X, Y) --> add (cmov (add X, C1), (add X, C2)), Y
// TODO: We are arbitrarily choosing op0 as the 1st piece of the sum, but
// it is possible that choosing op1 might be better.
SDValue X = OtherOp.getOperand(0), Y = OtherOp.getOperand(1);
FalseOp = DAG.getNode(ISD::ADD, DL, VT, X, FalseOp);
TrueOp = DAG.getNode(ISD::ADD, DL, VT, X, TrueOp);
Cmov = DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp,
Cmov.getOperand(2), Cmov.getOperand(3));
return DAG.getNode(ISD::ADD, DL, VT, Cmov, Y);
}
// add (cmov C1, C2), OtherOp --> cmov (add OtherOp, C1), (add OtherOp, C2)
FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp); FalseOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, FalseOp);
TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp); TrueOp = DAG.getNode(ISD::ADD, DL, VT, OtherOp, TrueOp);
return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2), return DAG.getNode(X86ISD::CMOV, DL, VT, FalseOp, TrueOp, Cmov.getOperand(2),

View File

@ -279,11 +279,11 @@ define void @bullet_load_store(i32 %x, i64 %y, %class.btAxis* %p) {
; CHECK: # %bb.0: ; CHECK: # %bb.0:
; CHECK-NEXT: leaq (%rsi,%rsi,4), %rax ; CHECK-NEXT: leaq (%rsi,%rsi,4), %rax
; CHECK-NEXT: shlq $4, %rax ; CHECK-NEXT: shlq $4, %rax
; CHECK-NEXT: leaq 66(%rdx), %rcx
; CHECK-NEXT: addq $60, %rdx
; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: leaq 60(%rdx,%rax), %rcx ; CHECK-NEXT: cmovneq %rcx, %rdx
; CHECK-NEXT: leaq 66(%rdx,%rax), %rax ; CHECK-NEXT: decw (%rdx,%rax)
; CHECK-NEXT: cmoveq %rcx, %rax
; CHECK-NEXT: decw (%rax)
; CHECK-NEXT: retq ; CHECK-NEXT: retq
%and = and i32 %x, 1 %and = and i32 %x, 1
%b = icmp eq i32 %and, 0 %b = icmp eq i32 %and, 0
@ -299,11 +299,11 @@ define void @bullet_load_store(i32 %x, i64 %y, %class.btAxis* %p) {
define void @complex_lea_alt1(i1 %b, i16* readnone %ptr, i64 %idx) { define void @complex_lea_alt1(i1 %b, i16* readnone %ptr, i64 %idx) {
; CHECK-LABEL: complex_lea_alt1: ; CHECK-LABEL: complex_lea_alt1:
; CHECK: # %bb.0: ; CHECK: # %bb.0:
; CHECK-NEXT: leaq 60(%rdx,%rsi), %rax ; CHECK-NEXT: leaq 60(%rdx), %rax
; CHECK-NEXT: leaq 66(%rdx,%rsi), %rcx ; CHECK-NEXT: addq $66, %rdx
; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: cmovneq %rax, %rcx ; CHECK-NEXT: cmovneq %rax, %rdx
; CHECK-NEXT: decw (%rcx) ; CHECK-NEXT: decw (%rdx,%rsi)
; CHECK-NEXT: retq ; CHECK-NEXT: retq
%i = ptrtoint i16* %ptr to i64 %i = ptrtoint i16* %ptr to i64
%sum = add i64 %idx, %i %sum = add i64 %idx, %i
@ -320,11 +320,11 @@ define void @complex_lea_alt1(i1 %b, i16* readnone %ptr, i64 %idx) {
define void @complex_lea_alt2(i1 %b, i16* readnone %ptr, i64 %idx) { define void @complex_lea_alt2(i1 %b, i16* readnone %ptr, i64 %idx) {
; CHECK-LABEL: complex_lea_alt2: ; CHECK-LABEL: complex_lea_alt2:
; CHECK: # %bb.0: ; CHECK: # %bb.0:
; CHECK-NEXT: leaq 60(%rsi,%rdx), %rax ; CHECK-NEXT: leaq 60(%rsi), %rax
; CHECK-NEXT: leaq 66(%rsi,%rdx), %rcx ; CHECK-NEXT: addq $66, %rsi
; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: cmovneq %rax, %rcx ; CHECK-NEXT: cmovneq %rax, %rsi
; CHECK-NEXT: decw (%rcx) ; CHECK-NEXT: decw (%rsi,%rdx)
; CHECK-NEXT: retq ; CHECK-NEXT: retq
%i = ptrtoint i16* %ptr to i64 %i = ptrtoint i16* %ptr to i64
%sum = add i64 %i, %idx %sum = add i64 %i, %idx
@ -433,11 +433,11 @@ define void @complex_lea_alt6(i1 %b, i16* readnone %ptr, i64 %idx) {
define void @complex_lea_alt7(i1 %b, i16* readnone %ptr, i64 %idx) { define void @complex_lea_alt7(i1 %b, i16* readnone %ptr, i64 %idx) {
; CHECK-LABEL: complex_lea_alt7: ; CHECK-LABEL: complex_lea_alt7:
; CHECK: # %bb.0: ; CHECK: # %bb.0:
; CHECK-NEXT: leaq 60(%rdx,%rsi), %rax ; CHECK-NEXT: leaq 60(%rdx), %rax
; CHECK-NEXT: leaq 66(%rdx,%rsi), %rcx ; CHECK-NEXT: addq $66, %rdx
; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: cmovneq %rax, %rcx ; CHECK-NEXT: cmovneq %rax, %rdx
; CHECK-NEXT: decw (%rcx) ; CHECK-NEXT: decw (%rdx,%rsi)
; CHECK-NEXT: retq ; CHECK-NEXT: retq
%i = ptrtoint i16* %ptr to i64 %i = ptrtoint i16* %ptr to i64
%o = add i64 %idx, %i %o = add i64 %idx, %i
@ -455,11 +455,11 @@ define void @complex_lea_alt7(i1 %b, i16* readnone %ptr, i64 %idx) {
define void @complex_lea_alt8(i1 %b, i16* readnone %ptr, i64 %idx) { define void @complex_lea_alt8(i1 %b, i16* readnone %ptr, i64 %idx) {
; CHECK-LABEL: complex_lea_alt8: ; CHECK-LABEL: complex_lea_alt8:
; CHECK: # %bb.0: ; CHECK: # %bb.0:
; CHECK-NEXT: leaq 60(%rsi,%rdx), %rax ; CHECK-NEXT: leaq 60(%rsi), %rax
; CHECK-NEXT: leaq 66(%rsi,%rdx), %rcx ; CHECK-NEXT: addq $66, %rsi
; CHECK-NEXT: testb $1, %dil ; CHECK-NEXT: testb $1, %dil
; CHECK-NEXT: cmovneq %rax, %rcx ; CHECK-NEXT: cmovneq %rax, %rsi
; CHECK-NEXT: decw (%rcx) ; CHECK-NEXT: decw (%rsi,%rdx)
; CHECK-NEXT: retq ; CHECK-NEXT: retq
%i = ptrtoint i16* %ptr to i64 %i = ptrtoint i16* %ptr to i64
%o = add i64 %i, %idx %o = add i64 %i, %idx