forked from OSchip/llvm-project
- Assign load / store with shifter op address modes the right itinerary classes.
- For now, loads with the [r, r] addressing mode are treated the same as the [r, r lsl/lsr/asr #] variants. ARMBaseInstrInfo::getOperandLatency() should identify the former case and reduce the output latency by 1.
- Also identify the [r, r << 2] case. This special form of shifter addressing mode is "free".
llvm-svn: 117519
This commit is contained in:
parent
523fa3a2e8
commit
ff1c862f8e
|
@ -1823,8 +1823,8 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
|
|||
// This may be a def / use of a variable_ops instruction, the operand
|
||||
// latency might be determinable dynamically. Let the target try to
|
||||
// figure it out.
|
||||
bool LdmBypass = false;
|
||||
int DefCycle = -1;
|
||||
bool LdmBypass = false;
|
||||
switch (DefTID.getOpcode()) {
|
||||
default:
|
||||
DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
|
||||
|
@ -1922,8 +1922,38 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
|
|||
? (*DefMI->memoperands_begin())->getAlignment() : 0;
|
||||
unsigned UseAlign = UseMI->hasOneMemOperand()
|
||||
? (*UseMI->memoperands_begin())->getAlignment() : 0;
|
||||
return getOperandLatency(ItinData, DefTID, DefIdx, DefAlign,
|
||||
UseTID, UseIdx, UseAlign);
|
||||
int Latency = getOperandLatency(ItinData, DefTID, DefIdx, DefAlign,
|
||||
UseTID, UseIdx, UseAlign);
|
||||
|
||||
if (Latency > 1 &&
|
||||
(Subtarget.isCortexA8() || Subtarget.isCortexA9())) {
|
||||
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
|
||||
// variants are one cycle cheaper.
|
||||
switch (DefTID.getOpcode()) {
|
||||
default: break;
|
||||
case ARM::LDRrs:
|
||||
case ARM::LDRBrs: {
|
||||
unsigned ShOpVal = DefMI->getOperand(3).getImm();
|
||||
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
|
||||
if (ShImm == 0 ||
|
||||
(ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
|
||||
--Latency;
|
||||
break;
|
||||
}
|
||||
case ARM::t2LDRs:
|
||||
case ARM::t2LDRBs:
|
||||
case ARM::t2LDRHs:
|
||||
case ARM::t2LDRSHs: {
|
||||
// Thumb2 mode: lsl only.
|
||||
unsigned ShAmt = DefMI->getOperand(3).getImm();
|
||||
if (ShAmt == 0 || ShAmt == 2)
|
||||
--Latency;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Latency;
|
||||
}
|
||||
|
||||
int
|
||||
|
@ -1947,8 +1977,40 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
|
|||
const MachineSDNode *UseMN = dyn_cast<MachineSDNode>(UseNode);
|
||||
unsigned UseAlign = !UseMN->memoperands_empty()
|
||||
? (*UseMN->memoperands_begin())->getAlignment() : 0;
|
||||
return getOperandLatency(ItinData, DefTID, DefIdx, DefAlign,
|
||||
UseTID, UseIdx, UseAlign);
|
||||
int Latency = getOperandLatency(ItinData, DefTID, DefIdx, DefAlign,
|
||||
UseTID, UseIdx, UseAlign);
|
||||
|
||||
if (Latency > 1 &&
|
||||
(Subtarget.isCortexA8() || Subtarget.isCortexA9())) {
|
||||
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
|
||||
// variants are one cycle cheaper.
|
||||
switch (DefTID.getOpcode()) {
|
||||
default: break;
|
||||
case ARM::LDRrs:
|
||||
case ARM::LDRBrs: {
|
||||
unsigned ShOpVal =
|
||||
cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
|
||||
unsigned ShImm = ARM_AM::getAM2Offset(ShOpVal);
|
||||
if (ShImm == 0 ||
|
||||
(ShImm == 2 && ARM_AM::getAM2ShiftOpc(ShOpVal) == ARM_AM::lsl))
|
||||
--Latency;
|
||||
break;
|
||||
}
|
||||
case ARM::t2LDRs:
|
||||
case ARM::t2LDRBs:
|
||||
case ARM::t2LDRHs:
|
||||
case ARM::t2LDRSHs: {
|
||||
// Thumb2 mode: lsl only.
|
||||
unsigned ShAmt =
|
||||
cast<ConstantSDNode>(DefNode->getOperand(2))->getZExtValue();
|
||||
if (ShAmt == 0 || ShAmt == 2)
|
||||
--Latency;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Latency;
|
||||
}
|
||||
|
||||
bool ARMBaseInstrInfo::
|
||||
|
|
|
@ -1438,13 +1438,13 @@ def RFE : ABXI<{1,0,0,?}, (outs), (ins addrmode4:$addr, GPR:$base),
|
|||
// Load
|
||||
|
||||
|
||||
defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_i, IIC_iLoad_r,
|
||||
defm LDR : AI_ldr1<0, "ldr", IIC_iLoad_r, IIC_iLoad_si,
|
||||
UnOpFrag<(load node:$Src)>>;
|
||||
defm LDRB : AI_ldr1<1, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_r,
|
||||
defm LDRB : AI_ldr1<1, "ldrb", IIC_iLoad_bh_r, IIC_iLoad_bh_si,
|
||||
UnOpFrag<(zextloadi8 node:$Src)>>;
|
||||
defm STR : AI_str1<0, "str", IIC_iStore_i, IIC_iStore_r,
|
||||
defm STR : AI_str1<0, "str", IIC_iStore_r, IIC_iStore_si,
|
||||
BinOpFrag<(store node:$LHS, node:$RHS)>>;
|
||||
defm STRB : AI_str1<1, "strb", IIC_iStore_bh_i, IIC_iStore_bh_r,
|
||||
defm STRB : AI_str1<1, "strb", IIC_iStore_bh_r, IIC_iStore_bh_si,
|
||||
BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
|
||||
|
||||
// Special LDR for loads from non-pc-relative constpools.
|
||||
|
|
|
@ -574,7 +574,7 @@ multiclass T2I_cmp_irs<bits<4> opcod, string opc,
|
|||
|
||||
/// T2I_ld - Defines a set of (op r, {imm12|imm8|so_reg}) load patterns.
|
||||
multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
|
||||
InstrItinClass iii, InstrItinClass iir, PatFrag opnode> {
|
||||
InstrItinClass iii, InstrItinClass iis, PatFrag opnode> {
|
||||
def i12 : T2Ii12<(outs GPR:$dst), (ins t2addrmode_imm12:$addr), iii,
|
||||
opc, ".w\t$dst, $addr",
|
||||
[(set GPR:$dst, (opnode t2addrmode_imm12:$addr))]> {
|
||||
|
@ -599,7 +599,7 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
|
|||
let Inst{10} = 1; // The P bit.
|
||||
let Inst{8} = 0; // The W bit.
|
||||
}
|
||||
def s : T2Iso <(outs GPR:$dst), (ins t2addrmode_so_reg:$addr), iir,
|
||||
def s : T2Iso <(outs GPR:$dst), (ins t2addrmode_so_reg:$addr), iis,
|
||||
opc, ".w\t$dst, $addr",
|
||||
[(set GPR:$dst, (opnode t2addrmode_so_reg:$addr))]> {
|
||||
let Inst{31-27} = 0b11111;
|
||||
|
@ -626,7 +626,7 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
|
|||
|
||||
/// T2I_st - Defines a set of (op r, {imm12|imm8|so_reg}) store patterns.
|
||||
multiclass T2I_st<bits<2> opcod, string opc,
|
||||
InstrItinClass iii, InstrItinClass iir, PatFrag opnode> {
|
||||
InstrItinClass iii, InstrItinClass iis, PatFrag opnode> {
|
||||
def i12 : T2Ii12<(outs), (ins GPR:$src, t2addrmode_imm12:$addr), iii,
|
||||
opc, ".w\t$src, $addr",
|
||||
[(opnode GPR:$src, t2addrmode_imm12:$addr)]> {
|
||||
|
@ -647,7 +647,7 @@ multiclass T2I_st<bits<2> opcod, string opc,
|
|||
let Inst{10} = 1; // The P bit.
|
||||
let Inst{8} = 0; // The W bit.
|
||||
}
|
||||
def s : T2Iso <(outs), (ins GPR:$src, t2addrmode_so_reg:$addr), iir,
|
||||
def s : T2Iso <(outs), (ins GPR:$src, t2addrmode_so_reg:$addr), iis,
|
||||
opc, ".w\t$src, $addr",
|
||||
[(opnode GPR:$src, t2addrmode_so_reg:$addr)]> {
|
||||
let Inst{31-27} = 0b11111;
|
||||
|
@ -916,19 +916,19 @@ def t2UDIV : T2I<(outs rGPR:$dst), (ins rGPR:$a, rGPR:$b), IIC_iALUi,
|
|||
|
||||
// Load
|
||||
let canFoldAsLoad = 1, isReMaterializable = 1 in
|
||||
defm t2LDR : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_r,
|
||||
defm t2LDR : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si,
|
||||
UnOpFrag<(load node:$Src)>>;
|
||||
|
||||
// Loads with zero extension
|
||||
defm t2LDRH : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_r,
|
||||
defm t2LDRH : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
|
||||
UnOpFrag<(zextloadi16 node:$Src)>>;
|
||||
defm t2LDRB : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_r,
|
||||
defm t2LDRB : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
|
||||
UnOpFrag<(zextloadi8 node:$Src)>>;
|
||||
|
||||
// Loads with sign extension
|
||||
defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_r,
|
||||
defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
|
||||
UnOpFrag<(sextloadi16 node:$Src)>>;
|
||||
defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_r,
|
||||
defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
|
||||
UnOpFrag<(sextloadi8 node:$Src)>>;
|
||||
|
||||
let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
|
||||
|
@ -1070,11 +1070,11 @@ def t2LDRSBT : T2IldT<1, 0b00, "ldrsbt", IIC_iLoad_bh_i>;
|
|||
def t2LDRSHT : T2IldT<1, 0b01, "ldrsht", IIC_iLoad_bh_i>;
|
||||
|
||||
// Store
|
||||
defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_r,
|
||||
defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si,
|
||||
BinOpFrag<(store node:$LHS, node:$RHS)>>;
|
||||
defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_r,
|
||||
defm t2STRB:T2I_st<0b00,"strb", IIC_iStore_bh_i, IIC_iStore_bh_si,
|
||||
BinOpFrag<(truncstorei8 node:$LHS, node:$RHS)>>;
|
||||
defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_r,
|
||||
defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
|
||||
BinOpFrag<(truncstorei16 node:$LHS, node:$RHS)>>;
|
||||
|
||||
// Store doubleword
|
||||
|
|
|
@ -574,7 +574,7 @@ def CortexA9Itineraries : ProcessorItineraries<
|
|||
InstrStage<1, [A9_DRegsVFP], 0, Required>,
|
||||
InstrStage<9, [A9_DRegsN], 0, Reserved>,
|
||||
InstrStage<1, [A9_NPipe]>],
|
||||
[8, 0, 1, 1]>,
|
||||
[8, 1, 1, 1]>,
|
||||
//
|
||||
// Double-precision FP MAC
|
||||
InstrItinData<IIC_fpMAC64 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
|
||||
|
@ -582,7 +582,7 @@ def CortexA9Itineraries : ProcessorItineraries<
|
|||
InstrStage<1, [A9_DRegsVFP], 0, Required>,
|
||||
InstrStage<10, [A9_DRegsN], 0, Reserved>,
|
||||
InstrStage<2, [A9_NPipe]>],
|
||||
[9, 0, 1, 1]>,
|
||||
[9, 1, 1, 1]>,
|
||||
//
|
||||
// Single-precision FP DIV
|
||||
InstrItinData<IIC_fpDIV32 , [InstrStage<1, [A9_Issue0, A9_Issue1], 0>,
|
||||
|
|
|
@ -24,4 +24,4 @@ declare float @fabsf(float)
|
|||
; CORTEXA8: test:
|
||||
; CORTEXA8: vabs.f32 d1, d1
|
||||
; CORTEXA9: test:
|
||||
; CORTEXA9: vabs.f32 s0, s0
|
||||
; CORTEXA9: vabs.f32 s1, s1
|
||||
|
|
|
@ -20,4 +20,4 @@ entry:
|
|||
; CORTEXA8: test:
|
||||
; CORTEXA8: vadd.f32 d0, d1, d0
|
||||
; CORTEXA9: test:
|
||||
; CORTEXA9: vadd.f32 s0, s0, s1
|
||||
; CORTEXA9: vadd.f32 s0, s1, s0
|
||||
|
|
|
@ -20,4 +20,4 @@ entry:
|
|||
; CORTEXA8: test:
|
||||
; CORTEXA8: vdiv.f32 s0, s1, s0
|
||||
; CORTEXA9: test:
|
||||
; CORTEXA9: vdiv.f32 s0, s0, s1
|
||||
; CORTEXA9: vdiv.f32 s0, s1, s0
|
||||
|
|
|
@ -21,4 +21,4 @@ entry:
|
|||
; CORTEXA8: test:
|
||||
; CORTEXA8: vmul.f32 d0, d1, d0
|
||||
; CORTEXA9: test:
|
||||
; CORTEXA9: vmla.f32 s0, s1, s2
|
||||
; CORTEXA9: vmla.f32 s2, s1, s0
|
||||
|
|
|
@ -19,6 +19,6 @@ entry:
|
|||
; NFP0: vnmls.f32 s2, s1, s0
|
||||
|
||||
; CORTEXA8: test:
|
||||
; CORTEXA8: vnmls.f32 s1, s2, s0
|
||||
; CORTEXA8: vnmls.f32 s2, s1, s0
|
||||
; CORTEXA9: test:
|
||||
; CORTEXA9: vnmls.f32 s0, s1, s2
|
||||
; CORTEXA9: vnmls.f32 s2, s1, s0
|
||||
|
|
|
@ -20,4 +20,4 @@ entry:
|
|||
; CORTEXA8: test:
|
||||
; CORTEXA8: vmul.f32 d0, d1, d0
|
||||
; CORTEXA9: test:
|
||||
; CORTEXA9: vmul.f32 s0, s0, s1
|
||||
; CORTEXA9: vmul.f32 s0, s1, s0
|
||||
|
|
|
@ -36,8 +36,8 @@ entry:
|
|||
|
||||
; lsl #2 is free
|
||||
; A9: test3:
|
||||
; A9: ldr r1, [r1, r2, lsl #2]
|
||||
; A9: ldr r0, [r0, r2, lsl #2]
|
||||
; A9: ldr r1, [r1, r2, lsl #2]
|
||||
%tmp1 = shl i32 %offset, 2
|
||||
%tmp2 = add i32 %base, %tmp1
|
||||
%tmp3 = inttoptr i32 %tmp2 to i32*
|
||||
|
|
Loading…
Reference in New Issue