[AArch64][GlobalISel] Add a new reassociation for G_PTR_ADDs.

G_PTR_ADD(G_PTR_ADD(X, C), Y) -> G_PTR_ADD(G_PTR_ADD(X, Y), C)
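For illustration (not part of the original commit message), a hypothetical C source pattern that produces this G_PTR_ADD chain; after reassociation the constant is the outermost addend, so it can ride in the load's immediate field (roughly "ldr w0, [x8, #40]" with x8 = base + i*4 on AArch64) rather than forcing the base+constant add to be computed first:

int load_elem(int *base, long i) {
  int *q = base + 10; // inner G_PTR_ADD with constant offset 40
  return q[i];        // outer G_PTR_ADD with variable offset i*4
}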

Improves CTMark code size at -Os on AArch64:

Program             before   after    diff
sqlite3             286932   287024    0.0%
kc                  432512   432508   -0.0%
SPASS               412788   412764   -0.0%
pairlocalalign      249460   249416   -0.0%
bullet              475740   475512   -0.0%
7zip-benchmark      568864   568356   -0.1%
consumer-typeset    419088   418648   -0.1%
tramp3d-v4          367628   367224   -0.1%
clamscan            383184   382732   -0.1%
lencod              430028   429284   -0.2%
Geomean difference                    -0.1%

Differential Revision: https://reviews.llvm.org/D109528
Author: Amara Emerson
Date:   2021-09-09 10:14:17 -07:00
Commit: 5ec1845cad (parent 1ac209ed76)

4 changed files with 216 additions and 86 deletions


@@ -577,6 +577,14 @@ public:
   /// Match: shr (shl x, n), k -> sbfx/ubfx x, pos, width
   bool matchBitfieldExtractFromShr(MachineInstr &MI, BuildFnTy &MatchInfo);
 
+  // Helpers for reassociation:
+  bool matchReassocConstantInnerRHS(GPtrAdd &MI, MachineInstr *RHS,
+                                    BuildFnTy &MatchInfo);
+  bool matchReassocFoldConstantsInSubTree(GPtrAdd &MI, MachineInstr *LHS,
+                                          MachineInstr *RHS,
+                                          BuildFnTy &MatchInfo);
+  bool matchReassocConstantInnerLHS(GPtrAdd &MI, MachineInstr *LHS,
+                                    MachineInstr *RHS, BuildFnTy &MatchInfo);
   /// Reassociate pointer calculations with G_ADD involved, to allow better
   /// addressing mode usage.
   bool matchReassocPtrAdd(MachineInstr &MI, BuildFnTy &MatchInfo);


@@ -4090,9 +4090,91 @@ bool CombinerHelper::reassociationCanBreakAddressingModePattern(
   return false;
 }
 
-bool CombinerHelper::matchReassocPtrAdd(
-    MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
-  assert(MI.getOpcode() == TargetOpcode::G_PTR_ADD);
+bool CombinerHelper::matchReassocConstantInnerRHS(GPtrAdd &MI,
+                                                  MachineInstr *RHS,
+                                                  BuildFnTy &MatchInfo) {
+  // G_PTR_ADD(BASE, G_ADD(X, C)) -> G_PTR_ADD(G_PTR_ADD(BASE, X), C)
+  Register Src1Reg = MI.getOperand(1).getReg();
+  if (RHS->getOpcode() != TargetOpcode::G_ADD)
+    return false;
+  auto C2 = getConstantVRegVal(RHS->getOperand(2).getReg(), MRI);
+  if (!C2)
+    return false;
+
+  MatchInfo = [=, &MI](MachineIRBuilder &B) {
+    LLT PtrTy = MRI.getType(MI.getOperand(0).getReg());
+    auto NewBase =
+        Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg());
+    Observer.changingInstr(MI);
+    MI.getOperand(1).setReg(NewBase.getReg(0));
+    MI.getOperand(2).setReg(RHS->getOperand(2).getReg());
+    Observer.changedInstr(MI);
+  };
+  return !reassociationCanBreakAddressingModePattern(MI);
+}
+
+bool CombinerHelper::matchReassocConstantInnerLHS(GPtrAdd &MI,
+                                                  MachineInstr *LHS,
+                                                  MachineInstr *RHS,
+                                                  BuildFnTy &MatchInfo) {
+  // G_PTR_ADD(G_PTR_ADD(X, C), Y) -> G_PTR_ADD(G_PTR_ADD(X, Y), C)
+  // if and only if the inner G_PTR_ADD(X, C) has one use.
+  Register LHSBase;
+  Register LHSCstOff;
+  if (!mi_match(MI.getBaseReg(), MRI,
+                m_OneNonDBGUse(m_GPtrAdd(m_Reg(LHSBase), m_ICst(LHSCstOff)))))
+    return false;
+
+  auto *LHSPtrAdd = cast<GPtrAdd>(LHS);
+  MatchInfo = [=, &MI](MachineIRBuilder &B) {
+    // When we change LHSPtrAdd's offset register we might cause it to use a
+    // reg before its def. Sink the instruction to just before the outer
+    // PTR_ADD to ensure this doesn't happen.
+    LHSPtrAdd->moveBefore(&MI);
+    Register RHSReg = MI.getOffsetReg();
+    Observer.changingInstr(MI);
+    MI.getOperand(2).setReg(LHSCstOff);
+    Observer.changedInstr(MI);
+    Observer.changingInstr(*LHSPtrAdd);
+    LHSPtrAdd->getOperand(2).setReg(RHSReg);
+    Observer.changedInstr(*LHSPtrAdd);
+  };
+  return !reassociationCanBreakAddressingModePattern(MI);
+}
+
+bool CombinerHelper::matchReassocFoldConstantsInSubTree(GPtrAdd &MI,
+                                                        MachineInstr *LHS,
+                                                        MachineInstr *RHS,
+                                                        BuildFnTy &MatchInfo) {
+  // G_PTR_ADD(G_PTR_ADD(BASE, C1), C2) -> G_PTR_ADD(BASE, C1+C2)
+  auto *LHSPtrAdd = dyn_cast<GPtrAdd>(LHS);
+  if (!LHSPtrAdd)
+    return false;
+
+  Register Src2Reg = MI.getOperand(2).getReg();
+  Register LHSSrc1 = LHSPtrAdd->getBaseReg();
+  Register LHSSrc2 = LHSPtrAdd->getOffsetReg();
+  auto C1 = getConstantVRegVal(LHSSrc2, MRI);
+  if (!C1)
+    return false;
+  auto C2 = getConstantVRegVal(Src2Reg, MRI);
+  if (!C2)
+    return false;
+
+  MatchInfo = [=, &MI](MachineIRBuilder &B) {
+    auto NewCst = B.buildConstant(MRI.getType(Src2Reg), *C1 + *C2);
+    Observer.changingInstr(MI);
+    MI.getOperand(1).setReg(LHSSrc1);
+    MI.getOperand(2).setReg(NewCst.getReg(0));
+    Observer.changedInstr(MI);
+  };
+  return !reassociationCanBreakAddressingModePattern(MI);
+}
+
+bool CombinerHelper::matchReassocPtrAdd(MachineInstr &MI,
+                                        BuildFnTy &MatchInfo) {
+  auto &PtrAdd = cast<GPtrAdd>(MI);
   // We're trying to match a few pointer computation patterns here for
   // re-association opportunities.
   // 1) Isolating a constant operand to be on the RHS, e.g.:
@@ -4101,49 +4183,26 @@ bool CombinerHelper::matchReassocPtrAdd(
   // 2) Folding two constants in each sub-tree as long as such folding
   // doesn't break a legal addressing mode.
   // G_PTR_ADD(G_PTR_ADD(BASE, C1), C2) -> G_PTR_ADD(BASE, C1+C2)
-  Register Src1Reg = MI.getOperand(1).getReg();
-  Register Src2Reg = MI.getOperand(2).getReg();
-  MachineInstr *LHS = MRI.getVRegDef(Src1Reg);
-  MachineInstr *RHS = MRI.getVRegDef(Src2Reg);
+  //
+  // 3) Move a constant from the LHS of an inner op to the RHS of the outer.
+  // G_PTR_ADD(G_PTR_ADD(X, C), Y) -> G_PTR_ADD(G_PTR_ADD(X, Y), C)
+  // iff the inner G_PTR_ADD(X, C) has one use.
+  MachineInstr *LHS = MRI.getVRegDef(PtrAdd.getBaseReg());
+  MachineInstr *RHS = MRI.getVRegDef(PtrAdd.getOffsetReg());
 
-  if (LHS->getOpcode() != TargetOpcode::G_PTR_ADD) {
-    // Try to match example 1.
-    if (RHS->getOpcode() != TargetOpcode::G_ADD)
-      return false;
-    auto C2 = getConstantVRegVal(RHS->getOperand(2).getReg(), MRI);
-    if (!C2)
-      return false;
+  // Try to match example 2.
+  if (matchReassocFoldConstantsInSubTree(PtrAdd, LHS, RHS, MatchInfo))
+    return true;
 
-    MatchInfo = [=, &MI](MachineIRBuilder &B) {
-      LLT PtrTy = MRI.getType(MI.getOperand(0).getReg());
+  // Try to match example 3.
+  if (matchReassocConstantInnerLHS(PtrAdd, LHS, RHS, MatchInfo))
+    return true;
 
-      auto NewBase =
-          Builder.buildPtrAdd(PtrTy, Src1Reg, RHS->getOperand(1).getReg());
-      Observer.changingInstr(MI);
-      MI.getOperand(1).setReg(NewBase.getReg(0));
-      MI.getOperand(2).setReg(RHS->getOperand(2).getReg());
-      Observer.changedInstr(MI);
-    };
-  } else {
-    // Try to match example 2.
-    Register LHSSrc1 = LHS->getOperand(1).getReg();
-    Register LHSSrc2 = LHS->getOperand(2).getReg();
-    auto C1 = getConstantVRegVal(LHSSrc2, MRI);
-    if (!C1)
-      return false;
-    auto C2 = getConstantVRegVal(Src2Reg, MRI);
-    if (!C2)
-      return false;
+  // Try to match example 1.
+  if (matchReassocConstantInnerRHS(PtrAdd, RHS, MatchInfo))
+    return true;
 
-    MatchInfo = [=, &MI](MachineIRBuilder &B) {
-      auto NewCst = B.buildConstant(MRI.getType(Src2Reg), *C1 + *C2);
-      Observer.changingInstr(MI);
-      MI.getOperand(1).setReg(LHSSrc1);
-      MI.getOperand(2).setReg(NewCst.getReg(0));
-      Observer.changedInstr(MI);
-    };
-  }
-  return !reassociationCanBreakAddressingModePattern(MI);
+  return false;
 }
 
 bool CombinerHelper::matchConstantFold(MachineInstr &MI, APInt &MatchInfo) {
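All three matchers share the guard reassociationCanBreakAddressingModePattern(MI), which rejects rewrites whose folded constant would push a load/store user out of a legal addressing mode. A minimal sketch of the kind of rule it protects, assuming AArch64's scaled unsigned 12-bit immediate form; the in-tree helper consults the target's legal addressing modes rather than hard-coding this:

#include <cstdint>

// Hypothetical, illustrative only: a folded offset C1+C2 keeps the
// "ldr w0, [xN, #imm]" form only if it is non-negative, a multiple of the
// access size, and the scaled value fits in 12 bits.
static bool fitsScaledUImm12(int64_t Offset, uint64_t AccessSize) {
  if (AccessSize == 0 || Offset < 0)
    return false;
  if (uint64_t(Offset) % AccessSize != 0)
    return false;
  return uint64_t(Offset) / AccessSize < 4096; // 2^12 scaled slots
}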


@@ -184,3 +184,82 @@ body: |
     G_STORE %ptr_to_int(s64), %10(p0) :: (store 8)
     $w0 = COPY %7(s32)
     RET_ReallyLR implicit $w0
+...
+---
+name:            reassoc_cst_inner_lhs
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$w0' }
+  - { reg: '$x1' }
+  - { reg: '$x2' }
+  - { reg: '$x3' }
+body:             |
+  bb.1:
+    liveins: $w0, $x1, $x2, $x3
+
+    ; CHECK-LABEL: name: reassoc_cst_inner_lhs
+    ; CHECK: liveins: $w0, $x1, $x2, $x3
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x2
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C1]](s64)
+    ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[SHL]](s64)
+    ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[C]](s64)
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s32))
+    ; CHECK: $w0 = COPY [[LOAD]](s32)
+    ; CHECK: RET_ReallyLR
+    %1:_(p0) = COPY $x1
+    %2:_(p0) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %8:_(s64) = G_CONSTANT i64 40
+    %9:_(p0) = G_PTR_ADD %2, %8(s64)
+    %10:_(s64) = G_CONSTANT i64 2
+    %11:_(s64) = G_SHL %3, %10
+    %12:_(p0) = G_PTR_ADD %9, %11(s64)
+    %14:_(s32) = G_LOAD %12(p0) :: (load (s32))
+    $w0 = COPY %14
+    RET_ReallyLR
+...
+---
+name:            reassoc_cst_inner_lhs_multiuse
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '$w0' }
+  - { reg: '$x1' }
+  - { reg: '$x2' }
+  - { reg: '$x3' }
+body:             |
+  bb.1:
+    liveins: $w0, $x1, $x2, $x3
+
+    ; CHECK-LABEL: name: reassoc_cst_inner_lhs_multiuse
+    ; CHECK: liveins: $w0, $x1, $x2, $x3
+    ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x2
+    ; CHECK: [[COPY1:%[0-9]+]]:_(s64) = COPY $x3
+    ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 40
+    ; CHECK: [[PTR_ADD:%[0-9]+]]:_(p0) = G_PTR_ADD [[COPY]], [[C]](s64)
+    ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 2
+    ; CHECK: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[COPY1]], [[C1]](s64)
+    ; CHECK: [[PTR_ADD1:%[0-9]+]]:_(p0) = G_PTR_ADD [[PTR_ADD]], [[SHL]](s64)
+    ; CHECK: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD1]](p0) :: (load (s32))
+    ; CHECK: $w0 = COPY [[LOAD]](s32)
+    ; CHECK: $x0 = COPY [[PTR_ADD]](p0)
+    ; CHECK: RET_ReallyLR
+    %1:_(p0) = COPY $x1
+    %2:_(p0) = COPY $x2
+    %3:_(s64) = COPY $x3
+    %8:_(s64) = G_CONSTANT i64 40
+    %9:_(p0) = G_PTR_ADD %2, %8(s64)
+    %10:_(s64) = G_CONSTANT i64 2
+    %11:_(s64) = G_SHL %3, %10
+    %12:_(p0) = G_PTR_ADD %9, %11(s64)
+    %14:_(s32) = G_LOAD %12(p0) :: (load (s32))
+    $w0 = COPY %14
+    $x0 = COPY %9
+    RET_ReallyLR
+...
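The multiuse test above exercises the one-use restriction on pattern 3: when the inner G_PTR_ADD has another user, it must stay live regardless, so rewriting it would only shuffle operands without removing an instruction. A hypothetical source-level analogue, illustrative only:

int *load_and_keep(int *base, long i, int **out) {
  int *q = base + 10; // inner G_PTR_ADD with constant offset, two users
  *out = q;           // second use keeps the inner add alive
  return q + i;       // outer G_PTR_ADD: the combine backs off here
}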


@@ -336,32 +336,22 @@ define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset_offset256(i32 addrspace(
 define amdgpu_ps void @mubuf_store_vgpr_ptr_sgpr_offset256_offset(i32 addrspace(1)* %ptr, i32 inreg %soffset) {
 ; GFX6-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b64 s[4:5], 0x400
-; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, s5
-; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: mubuf_store_vgpr_ptr_sgpr_offset256_offset:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0x400
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX7-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, s5
-; GFX7-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 offset:1024
 ; GFX7-NEXT:    s_endpgm
   %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i32 256
   %gep1 = getelementptr i32, i32 addrspace(1)* %gep0, i32 %soffset
@@ -433,25 +423,27 @@ define amdgpu_ps void @mubuf_store_sgpr_ptr_offset4095_vgpr_offset(i32 addrspace
 ; GFX6-LABEL: mubuf_store_sgpr_ptr_offset4095_vgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX6-NEXT:    s_add_u32 s0, s2, 0x3ffc
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; GFX6-NEXT:    s_addc_u32 s1, s3, 0
+; GFX6-NEXT:    s_mov_b32 s0, s2
+; GFX6-NEXT:    s_mov_b32 s1, s3
 ; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX6-NEXT:    s_movk_i32 s4, 0x3ffc
+; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], s4 addr64
 ; GFX6-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: mubuf_store_sgpr_ptr_offset4095_vgpr_offset:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    s_add_u32 s0, s2, 0x3ffc
 ; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; GFX7-NEXT:    s_addc_u32 s1, s3, 0
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
 ; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
+; GFX7-NEXT:    s_movk_i32 s4, 0x3ffc
+; GFX7-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], s4 addr64
 ; GFX7-NEXT:    s_endpgm
   %gep0 = getelementptr i32, i32 addrspace(1)* %ptr, i32 4095
   %gep1 = getelementptr i32, i32 addrspace(1)* %gep0, i32 %voffset
@@ -790,31 +782,21 @@ define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset_offset256(float addrspac
 define amdgpu_ps float @mubuf_load_vgpr_ptr_sgpr_offset256_offset(float addrspace(1)* %ptr, i32 inreg %soffset) {
 ; GFX6-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_mov_b64 s[4:5], 0x400
-; GFX6-NEXT:    v_mov_b32_e32 v2, s4
-; GFX6-NEXT:    v_mov_b32_e32 v3, s5
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX6-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX6-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX6-NEXT:    s_mov_b32 s2, 0
 ; GFX6-NEXT:    s_mov_b32 s3, 0xf000
-; GFX6-NEXT:    s_mov_b32 s2, s5
-; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
+; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: mubuf_load_vgpr_ptr_sgpr_offset256_offset:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_mov_b64 s[4:5], 0x400
-; GFX7-NEXT:    v_mov_b32_e32 v2, s4
-; GFX7-NEXT:    v_mov_b32_e32 v3, s5
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX7-NEXT:    s_bfe_i64 s[0:1], s[2:3], 0x200000
-; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX7-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
+; GFX7-NEXT:    s_mov_b32 s2, 0
 ; GFX7-NEXT:    s_mov_b32 s3, 0xf000
-; GFX7-NEXT:    s_mov_b32 s2, s5
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 offset:1024 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 256
@@ -887,24 +869,26 @@ define amdgpu_ps float @mubuf_load_sgpr_ptr_offset4095_vgpr_offset(float addrspa
 ; GFX6-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX6-NEXT:    s_add_u32 s4, s2, 0x3ffc
 ; GFX6-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; GFX6-NEXT:    s_mov_b32 s6, 0
-; GFX6-NEXT:    s_addc_u32 s5, s3, 0
-; GFX6-NEXT:    s_mov_b32 s7, 0xf000
-; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
+; GFX6-NEXT:    s_mov_b32 s0, s2
+; GFX6-NEXT:    s_mov_b32 s1, s3
+; GFX6-NEXT:    s_mov_b32 s2, 0
+; GFX6-NEXT:    s_mov_b32 s3, 0xf000
+; GFX6-NEXT:    s_movk_i32 s4, 0x3ffc
+; GFX6-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
 ; GFX7-LABEL: mubuf_load_sgpr_ptr_offset4095_vgpr_offset:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
-; GFX7-NEXT:    s_add_u32 s4, s2, 0x3ffc
 ; GFX7-NEXT:    v_lshl_b64 v[0:1], v[0:1], 2
-; GFX7-NEXT:    s_mov_b32 s6, 0
-; GFX7-NEXT:    s_addc_u32 s5, s3, 0
-; GFX7-NEXT:    s_mov_b32 s7, 0xf000
-; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 glc
+; GFX7-NEXT:    s_mov_b32 s0, s2
+; GFX7-NEXT:    s_mov_b32 s1, s3
+; GFX7-NEXT:    s_mov_b32 s2, 0
+; GFX7-NEXT:    s_mov_b32 s3, 0xf000
+; GFX7-NEXT:    s_movk_i32 s4, 0x3ffc
+; GFX7-NEXT:    buffer_load_dword v0, v[0:1], s[0:3], s4 addr64 glc
 ; GFX7-NEXT:    s_waitcnt vmcnt(0)
 ; GFX7-NEXT:    ; return to shader part epilog
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 4095