forked from OSchip/llvm-project
Fix some latency computation bugs: if the use is not a machine opcode, do not just return zero.
llvm-svn: 105061
This commit is contained in:
parent
bf91499f1a
commit
cc2efe11db
|
@ -1275,6 +1275,17 @@ bool hybrid_ls_rr_sort::operator()(const SUnit *left, const SUnit *right) const{
|
|||
return left->getHeight() > right->getHeight();
|
||||
} else if (RStall)
|
||||
return false;
|
||||
|
||||
// If either node is scheduling for latency, sort them by height and latency
|
||||
// first.
|
||||
if (left->SchedulingPref == Sched::Latency ||
|
||||
right->SchedulingPref == Sched::Latency) {
|
||||
if (left->getHeight() != right->getHeight())
|
||||
return left->getHeight() > right->getHeight();
|
||||
if (left->Latency != right->Latency)
|
||||
return left->Latency > right->Latency;
|
||||
}
|
||||
|
||||
return BURRSort(left, right, SPQ);
|
||||
}
|
||||
|
||||
|
|
|
@ -59,6 +59,10 @@ SUnit *ScheduleDAGSDNodes::NewSUnit(SDNode *N) {
|
|||
SUnits.back().OrigNode = &SUnits.back();
|
||||
SUnit *SU = &SUnits.back();
|
||||
const TargetLowering &TLI = DAG->getTargetLoweringInfo();
|
||||
if (N->isMachineOpcode() &&
|
||||
N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF)
|
||||
SU->SchedulingPref = Sched::None;
|
||||
else
|
||||
SU->SchedulingPref = TLI.getSchedulingPreference(N);
|
||||
return SU;
|
||||
}
|
||||
|
@ -364,8 +368,10 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
|
|||
if (Cost >= 0)
|
||||
PhysReg = 0;
|
||||
|
||||
// If this is a ctrl dep, latency is 1.
|
||||
unsigned OpLatency = isChain ? 1 : OpSU->Latency;
|
||||
const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data,
|
||||
OpSU->Latency, PhysReg);
|
||||
OpLatency, PhysReg);
|
||||
if (!isChain && !UnitLatencies) {
|
||||
ComputeOperandLatency(OpN, N, i, const_cast<SDep &>(dep));
|
||||
ST.adjustSchedDependency(OpSU, SU, const_cast<SDep &>(dep));
|
||||
|
@ -427,15 +433,18 @@ void ScheduleDAGSDNodes::ComputeOperandLatency(SDNode *Def, SDNode *Use,
|
|||
return;
|
||||
|
||||
unsigned DefIdx = Use->getOperand(OpIdx).getResNo();
|
||||
if (Def->isMachineOpcode() && Use->isMachineOpcode()) {
|
||||
if (Def->isMachineOpcode()) {
|
||||
const TargetInstrDesc &II = TII->get(Def->getMachineOpcode());
|
||||
if (DefIdx >= II.getNumDefs())
|
||||
return;
|
||||
int DefCycle = InstrItins.getOperandCycle(II.getSchedClass(), DefIdx);
|
||||
if (DefCycle < 0)
|
||||
return;
|
||||
int UseCycle = 1;
|
||||
if (Use->isMachineOpcode()) {
|
||||
const unsigned UseClass = TII->get(Use->getMachineOpcode()).getSchedClass();
|
||||
int UseCycle = InstrItins.getOperandCycle(UseClass, OpIdx);
|
||||
UseCycle = InstrItins.getOperandCycle(UseClass, OpIdx);
|
||||
}
|
||||
if (UseCycle >= 0) {
|
||||
int Latency = DefCycle - UseCycle + 1;
|
||||
if (Latency >= 0)
|
||||
|
|
|
@ -4,14 +4,14 @@
|
|||
; constant offset addressing, so that each of the following stores
|
||||
; uses the same register.
|
||||
|
||||
; CHECK: vstr.32 s0, [r12, #-128]
|
||||
; CHECK: vstr.32 s0, [r12, #-96]
|
||||
; CHECK: vstr.32 s0, [r12, #-64]
|
||||
; CHECK: vstr.32 s0, [r12, #-32]
|
||||
; CHECK: vstr.32 s0, [r12]
|
||||
; CHECK: vstr.32 s0, [r12, #32]
|
||||
; CHECK: vstr.32 s0, [r12, #64]
|
||||
; CHECK: vstr.32 s0, [r12, #96]
|
||||
; CHECK: vstr.32 s0, [r9, #-128]
|
||||
; CHECK: vstr.32 s0, [r9, #-96]
|
||||
; CHECK: vstr.32 s0, [r9, #-64]
|
||||
; CHECK: vstr.32 s0, [r9, #-32]
|
||||
; CHECK: vstr.32 s0, [r9]
|
||||
; CHECK: vstr.32 s0, [r9, #32]
|
||||
; CHECK: vstr.32 s0, [r9, #64]
|
||||
; CHECK: vstr.32 s0, [r9, #96]
|
||||
|
||||
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32-n32"
|
||||
|
||||
|
@ -626,8 +626,8 @@ bb24: ; preds = %bb23
|
|||
; LSR should use count-down iteration to avoid requiring the trip count
|
||||
; in a register, and it shouldn't require any reloads here.
|
||||
|
||||
; CHECK: sub.w r9, r9, #1
|
||||
; CHECK-NEXT: cmp.w r9, #0
|
||||
; CHECK: subs r3, #1
|
||||
; CHECK-NEXT: cmp r3, #0
|
||||
; CHECK-NEXT: bne.w
|
||||
|
||||
%92 = icmp eq i32 %tmp81, %indvar78 ; <i1> [#uses=1]
|
||||
|
|
|
@ -45,9 +45,9 @@ define arm_apcscc void @t2(i16* %i_ptr, i16* %o_ptr, %struct.int16x8_t* nocaptur
|
|||
entry:
|
||||
; CHECK: t2:
|
||||
; CHECK: vld1.16
|
||||
; CHECK: vld1.16
|
||||
; CHECK-NOT: vmov
|
||||
; CHECK: vmul.i16
|
||||
; CHECK-NOT: vmov
|
||||
; CHECK: vld1.16
|
||||
; CHECK: vmul.i16
|
||||
; CHECK-NOT: vmov
|
||||
; CHECK: vst1.16
|
||||
|
@ -238,8 +238,9 @@ bb14: ; preds = %bb6
|
|||
define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
|
||||
; CHECK: t9:
|
||||
; CHECK: vldr.64
|
||||
; CHECK-NOT: vmov d{{.*}}, d0
|
||||
; CHECK: vmov.i8 d1
|
||||
; CHECK-NEXT: vstmia r0, {d2,d3}
|
||||
; CHECK-NEXT: vstmia r0, {d0,d1}
|
||||
; CHECK-NEXT: vstmia r0, {d0,d1}
|
||||
%3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
|
||||
%4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
|
||||
|
|
|
@ -10,8 +10,8 @@ target triple = "powerpc-apple-darwin10.0"
|
|||
define void @foo(i32 %y) nounwind ssp {
|
||||
entry:
|
||||
; CHECK: foo
|
||||
; CHECK: add r4
|
||||
; CHECK: 0(r4)
|
||||
; CHECK: add r3
|
||||
; CHECK: 0(r3)
|
||||
%y_addr = alloca i32 ; <i32*> [#uses=2]
|
||||
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
|
||||
store i32 %y, i32* %y_addr
|
||||
|
|
Loading…
Reference in New Issue