[MCA] Improved handling of negative read-advance cycles.

Before this patch, register writes were always invalidated by the
RegisterFile at the instruction commit stage. As a result, the
RegisterFile often lost track of the `execute cycle` of writes that
had already been committed. While this was not a problem for
non-delayed reads, it sometimes led to inaccurate read latency
computations in the presence of negative read-advance cycles.

This patch fixes the issue by changing how the RegisterFile component
internally keeps track of the `execute cycle` information of each
write. On every instruction executed, the RegisterFile gets notified
by the RetireStage, so that it can internally record the execute
cycle of each executed write.
The `execute cycle` information is stored within WriteRef itself, and
it is not invalidated when the write is committed.
This commit is contained in:
Andrea Di Biagio 2021-03-23 14:47:01 +00:00
parent 514bc01ca3
commit f5bdc88e4d
9 changed files with 321 additions and 68 deletions

View File

@ -28,7 +28,53 @@ namespace mca {
class ReadState;
class WriteState;
class WriteRef;
class Instruction;
/// A reference to a register write.
///
/// This class is mainly used by the register file to describe register
/// mappings. It correlates a register write to the source index of the
/// defining instruction.
class WriteRef {
unsigned IID;
unsigned WriteBackCycle;
unsigned WriteResID;
MCPhysReg RegisterID;
WriteState *Write;
static const unsigned INVALID_IID;
public:
WriteRef() : IID(INVALID_IID), WriteBackCycle(), WriteResID(), Write() {}
WriteRef(unsigned SourceIndex, WriteState *WS);
unsigned getSourceIndex() const { return IID; }
unsigned getWriteBackCycle() const;
const WriteState *getWriteState() const { return Write; }
WriteState *getWriteState() { return Write; }
unsigned getWriteResourceID() const;
MCPhysReg getRegisterID() const;
void commit();
void notifyExecuted(unsigned Cycle);
bool hasKnownWriteBackCycle() const;
bool isWriteZero() const;
bool isValid() const { return getSourceIndex() != INVALID_IID; }
/// Returns true if this register write has been executed, and the new
/// register value is therefore available to users.
bool isAvailable() const { return hasKnownWriteBackCycle(); }
bool operator==(const WriteRef &Other) const {
return Write && Other.Write && Write == Other.Write;
}
#ifndef NDEBUG
void dump() const;
#endif
};
/// Manages hardware register files, and tracks register definitions for
/// register renaming purposes.
@ -145,6 +191,8 @@ class RegisterFile : public HardwareUnit {
// the target. Bits are set for registers that are known to be zero.
APInt ZeroRegisters;
unsigned CurrentCycle;
// This method creates a new register file descriptor.
// The new register file owns all of the registers declared by register
// classes in the 'RegisterClasses' set.
@ -183,8 +231,9 @@ public:
unsigned NumRegs = 0);
// Collects writes that are in a RAW dependency with RS.
void collectWrites(const ReadState &RS,
SmallVectorImpl<WriteRef> &Writes) const;
void collectWrites(const MCSubtargetInfo &STI, const ReadState &RS,
SmallVectorImpl<WriteRef> &Writes,
SmallVectorImpl<WriteRef> &CommittedWrites) const;
// This method updates the register mappings inserting a new register
// definition. This method is also responsible for updating the number of
@ -223,9 +272,15 @@ public:
// Returns the number of PRFs implemented by this processor.
unsigned getNumRegisterFiles() const { return RegisterFiles.size(); }
unsigned getElapsedCyclesFromWriteBack(const WriteRef &WR) const;
void onInstructionExecuted(Instruction *IS);
// Notify each PRF that a new cycle just started.
void cycleStart();
void cycleEnd() { ++CurrentCycle; }
#ifndef NDEBUG
void dump() const;
#endif

View File

@ -595,45 +595,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, const InstRef &IR) {
}
#endif
/// A reference to a register write.
///
/// This class is mainly used by the register file to describe register
/// mappings. It correlates a register write to the source index of the
/// defining instruction.
class WriteRef {
std::pair<unsigned, WriteState *> Data;
static const unsigned INVALID_IID;
public:
WriteRef() : Data(INVALID_IID, nullptr) {}
WriteRef(unsigned SourceIndex, WriteState *WS) : Data(SourceIndex, WS) {}
unsigned getSourceIndex() const { return Data.first; }
const WriteState *getWriteState() const { return Data.second; }
WriteState *getWriteState() { return Data.second; }
void invalidate() { Data.second = nullptr; }
bool isWriteZero() const {
assert(isValid() && "Invalid null WriteState found!");
return getWriteState()->isWriteZero();
}
/// Returns true if this register write has been executed, and the new
/// register value is therefore available to users.
bool isAvailable() const {
if (getSourceIndex() == INVALID_IID)
return false;
const WriteState *WS = getWriteState();
return !WS || WS->isExecuted();
}
bool isValid() const { return Data.second && Data.first != INVALID_IID; }
bool operator==(const WriteRef &Other) const { return Data == Other.Data; }
#ifndef NDEBUG
void dump() const;
#endif
};
} // namespace mca
} // namespace llvm

View File

@ -43,6 +43,7 @@ public:
return !RCU.isEmpty() || !RetireInst.empty();
}
Error cycleStart() override;
Error cycleEnd() override;
Error execute(InstRef &IR) override;
void notifyInstructionRetired(const InstRef &IR) const;
};

View File

@ -22,11 +22,47 @@
namespace llvm {
namespace mca {
// Sentinel source index: a WriteRef whose IID equals this value is not valid
// (see WriteRef::isValid()).
const unsigned WriteRef::INVALID_IID = std::numeric_limits<unsigned>::max();
// Creates a reference to write `WS`, defined by the instruction with source
// index `SourceIndex`. All other members are value-initialized, including
// RegisterID, which would otherwise be left indeterminate.
WriteRef::WriteRef(unsigned SourceIndex, WriteState *WS)
    : IID(SourceIndex), WriteBackCycle(), WriteResID(), RegisterID(),
      Write(WS) {}
void WriteRef::commit() {
assert(Write && Write->isExecuted() && "Cannot commit before write back!");
Write = nullptr;
}
// Stores `Cycle` as the write-back cycle of this write. The attached
// WriteState must be non-null and already executed (checked by the assert).
void WriteRef::notifyExecuted(unsigned Cycle) {
assert(Write && Write->isExecuted() && "Not executed!");
WriteBackCycle = Cycle;
}
// True for a valid reference whose WriteState is either detached (null) or
// reports that it has executed.
bool WriteRef::hasKnownWriteBackCycle() const {
  if (!isValid())
    return false;
  if (Write == nullptr)
    return true;
  return Write->isExecuted();
}
bool WriteRef::isWriteZero() const {
assert(isValid() && "Invalid null WriteState found!");
return getWriteState()->isWriteZero();
}
// Returns the write resource ID from the attached WriteState when there is
// one, and the cached WriteResID member otherwise.
unsigned WriteRef::getWriteResourceID() const {
  return Write ? Write->getWriteResourceID() : WriteResID;
}
// Returns the register ID from the attached WriteState when there is one,
// and the cached RegisterID member otherwise.
MCPhysReg WriteRef::getRegisterID() const {
  return Write ? Write->getRegisterID() : RegisterID;
}
// Constructs the register file: allocates one register mapping (an empty
// WriteRef plus default renaming info) per register reported by `mri`,
// creates ZeroRegisters with one bit per register initialized from `false`,
// value-initializes the cycle counter, then calls initialize(SM, NumRegs).
RegisterFile::RegisterFile(const MCSchedModel &SM, const MCRegisterInfo &mri,
unsigned NumRegs)
: MRI(mri),
RegisterMappings(mri.getNumRegs(), {WriteRef(), RegisterRenamingInfo()}),
ZeroRegisters(mri.getNumRegs(), false) {
ZeroRegisters(mri.getNumRegs(), false), CurrentCycle() {
initialize(SM, NumRegs);
}
@ -63,6 +99,43 @@ void RegisterFile::cycleStart() {
RMT.NumMoveEliminated = 0;
}
// Called when instruction `IS` has executed. For every register definition of
// `IS`, stamps the current cycle on each register mapping that still refers
// to that definition: the (possibly renamed) register itself, its
// sub-registers, and - when the write clears super-registers - its
// super-registers as well.
void RegisterFile::onInstructionExecuted(Instruction *IS) {
  assert(IS && IS->isExecuted() && "Unexpected internal state found!");

  for (WriteState &WS : IS->getDefs()) {
    // NOTE(review): an eliminated write stops processing of ALL remaining
    // defs, not just this one - confirm that this is intended.
    if (WS.isEliminated())
      return;

    MCPhysReg RegID = WS.getRegisterID();
    assert(RegID != 0 && "A write of an invalid register?");
    assert(WS.getCyclesLeft() != UNKNOWN_CYCLES &&
           "The number of cycles should be known at this point!");
    assert(WS.getCyclesLeft() <= 0 && "Invalid cycles left for this write!");

    // Follow the renaming, if the register is renamed to a different one.
    const MCPhysReg Renamed = RegisterMappings[RegID].second.RenameAs;
    if (Renamed && Renamed != RegID)
      RegID = Renamed;

    // Stamp the mapping of `Reg` only if it still refers to this write.
    auto StampIfOwnedByWS = [&](unsigned Reg) {
      WriteRef &Ref = RegisterMappings[Reg].first;
      if (Ref.getWriteState() == &WS)
        Ref.notifyExecuted(CurrentCycle);
    };

    StampIfOwnedByWS(RegID);

    for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I)
      StampIfOwnedByWS(*I);

    if (WS.clearsSuperRegisters()) {
      for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I)
        StampIfOwnedByWS(*I);
    }
  }
}
void RegisterFile::addRegisterFile(const MCRegisterFileDesc &RF,
ArrayRef<MCRegisterCostEntry> Entries) {
// A default register file is always allocated at index #0. That register file
@ -261,12 +334,12 @@ void RegisterFile::removeRegisterWrite(
WriteRef &WR = RegisterMappings[RegID].first;
if (WR.getWriteState() == &WS)
WR.invalidate();
WR.commit();
for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
WriteRef &OtherWR = RegisterMappings[*I].first;
if (OtherWR.getWriteState() == &WS)
OtherWR.invalidate();
OtherWR.commit();
}
if (!WS.clearsSuperRegisters())
@ -275,7 +348,7 @@ void RegisterFile::removeRegisterWrite(
for (MCSuperRegIterator I(RegID, &MRI); I.isValid(); ++I) {
WriteRef &OtherWR = RegisterMappings[*I].first;
if (OtherWR.getWriteState() == &WS)
OtherWR.invalidate();
OtherWR.commit();
}
}
@ -344,8 +417,25 @@ bool RegisterFile::tryEliminateMove(WriteState &WS, ReadState &RS) {
return true;
}
void RegisterFile::collectWrites(const ReadState &RS,
SmallVectorImpl<WriteRef> &Writes) const {
// Returns the stored write-back cycle. Asserts that the write-back cycle is
// known, and that a still-attached WriteState has no cycles left.
unsigned WriteRef::getWriteBackCycle() const {
assert(hasKnownWriteBackCycle() && "Instruction not executed!");
assert((!Write || Write->getCyclesLeft() <= 0) &&
"Inconsistent state found!");
return WriteBackCycle;
}
// Returns how many cycles separate the register file's current cycle from the
// write-back cycle recorded for `WR`. `WR` must have a known write-back cycle.
unsigned RegisterFile::getElapsedCyclesFromWriteBack(const WriteRef &WR) const {
  assert(WR.hasKnownWriteBackCycle() && "Write hasn't been committed yet!");
  const unsigned WriteBack = WR.getWriteBackCycle();
  return CurrentCycle - WriteBack;
}
void RegisterFile::collectWrites(
const MCSubtargetInfo &STI, const ReadState &RS,
SmallVectorImpl<WriteRef> &Writes,
SmallVectorImpl<WriteRef> &CommittedWrites) const {
const ReadDescriptor &RD = RS.getDescriptor();
const MCSchedModel &SM = STI.getSchedModel();
const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID);
MCPhysReg RegID = RS.getRegisterID();
assert(RegID && RegID < RegisterMappings.size());
LLVM_DEBUG(dbgs() << "RegisterFile: collecting writes for register "
@ -357,14 +447,32 @@ void RegisterFile::collectWrites(const ReadState &RS,
RegID = RRI.AliasRegID;
const WriteRef &WR = RegisterMappings[RegID].first;
if (WR.isValid())
if (WR.getWriteState()) {
Writes.push_back(WR);
} else if (WR.hasKnownWriteBackCycle()) {
unsigned WriteResID = WR.getWriteResourceID();
int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID);
if (ReadAdvance < 0) {
unsigned Elapsed = getElapsedCyclesFromWriteBack(WR);
if (Elapsed < static_cast<unsigned>(-ReadAdvance))
CommittedWrites.push_back(WR);
}
}
// Handle potential partial register updates.
for (MCSubRegIterator I(RegID, &MRI); I.isValid(); ++I) {
const WriteRef &WR = RegisterMappings[*I].first;
if (WR.isValid())
if (WR.getWriteState()) {
Writes.push_back(WR);
} else if (WR.hasKnownWriteBackCycle()) {
unsigned WriteResID = WR.getWriteResourceID();
int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID);
if (ReadAdvance < 0) {
unsigned Elapsed = getElapsedCyclesFromWriteBack(WR);
if (Elapsed < static_cast<unsigned>(-ReadAdvance))
CommittedWrites.push_back(WR);
}
}
}
// Remove duplicate entries and resize the input vector.
@ -398,21 +506,34 @@ void RegisterFile::addRegisterRead(ReadState &RS,
RS.setReadZero();
SmallVector<WriteRef, 4> DependentWrites;
collectWrites(RS, DependentWrites);
RS.setDependentWrites(DependentWrites.size());
SmallVector<WriteRef, 4> CompletedWrites;
collectWrites(STI, RS, DependentWrites, CompletedWrites);
RS.setDependentWrites(DependentWrites.size() + CompletedWrites.size());
// We know that this read depends on all the writes in DependentWrites.
// For each write, check if we have ReadAdvance information, and use it
// to figure out in how many cycles this read becomes available.
// to figure out in how many cycles this read will be available.
const ReadDescriptor &RD = RS.getDescriptor();
const MCSchedModel &SM = STI.getSchedModel();
const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID);
for (WriteRef &WR : DependentWrites) {
unsigned WriteResID = WR.getWriteResourceID();
WriteState &WS = *WR.getWriteState();
unsigned WriteResID = WS.getWriteResourceID();
int ReadAdvance = STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID);
WS.addUser(WR.getSourceIndex(), &RS, ReadAdvance);
}
for (WriteRef &WR : CompletedWrites) {
unsigned WriteResID = WR.getWriteResourceID();
assert(WR.hasKnownWriteBackCycle() && "Invalid write!");
assert(STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID) < 0);
unsigned ReadAdvance = static_cast<unsigned>(
-STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID));
unsigned Elapsed = getElapsedCyclesFromWriteBack(WR);
assert(Elapsed < ReadAdvance && "Should not have been added to the set!");
RS.writeStartEvent(WR.getSourceIndex(), WR.getRegisterID(),
ReadAdvance - Elapsed);
}
}
unsigned RegisterFile::isAvailable(ArrayRef<MCPhysReg> Regs) const {
@ -463,6 +584,14 @@ unsigned RegisterFile::isAvailable(ArrayRef<MCPhysReg> Regs) const {
}
#ifndef NDEBUG
void WriteRef::dump() const {
dbgs() << "IID=" << getSourceIndex() << ' ';
if (isValid())
getWriteState()->dump();
else
dbgs() << "(null)";
}
void RegisterFile::dump() const {
for (unsigned I = 0, E = MRI.getNumRegs(); I < E; ++I) {
const RegisterMapping &RM = RegisterMappings[I];

View File

@ -27,7 +27,8 @@ void WriteState::writeStartEvent(unsigned IID, MCPhysReg RegID,
DependentWrite = nullptr;
}
void ReadState::writeStartEvent(unsigned IID, MCPhysReg RegID, unsigned Cycles) {
void ReadState::writeStartEvent(unsigned IID, MCPhysReg RegID,
unsigned Cycles) {
assert(DependentWrites);
assert(CyclesLeft == UNKNOWN_CYCLES);
@ -125,14 +126,6 @@ void WriteState::dump() const {
dbgs() << "{ OpIdx=" << WD->OpIndex << ", Lat=" << getLatency() << ", RegID "
<< getRegisterID() << ", Cycles Left=" << getCyclesLeft() << " }";
}
// Prints the source index, then either the attached WriteState or "(null)"
// when the reference is not valid.
void WriteRef::dump() const {
  dbgs() << "IID=" << getSourceIndex() << ' ';
  if (!isValid()) {
    dbgs() << "(null)";
    return;
  }
  getWriteState()->dump();
}
#endif
const CriticalDependency &Instruction::computeCriticalRegDep() {
@ -248,7 +241,5 @@ void Instruction::cycleEvent() {
Stage = IS_EXECUTED;
}
const unsigned WriteRef::INVALID_IID = std::numeric_limits<unsigned>::max();
} // namespace mca
} // namespace llvm

View File

@ -136,8 +136,8 @@ Error DispatchStage::dispatch(InstRef IR) {
}
Error DispatchStage::cycleStart() {
PRF.cycleStart();
// The retire stage is responsible for calling method `cycleStart`
// on the PRF.
if (!CarryOver) {
AvailableEntries = DispatchWidth;
return ErrorSuccess();

View File

@ -91,12 +91,13 @@ static unsigned checkRegisterHazard(const RegisterFile &PRF,
const InstRef &IR) {
unsigned StallCycles = 0;
SmallVector<WriteRef, 4> Writes;
SmallVector<WriteRef, 4> CommittedWrites;
for (const ReadState &RS : IR.getInstruction()->getUses()) {
const ReadDescriptor &RD = RS.getDescriptor();
const MCSchedClassDesc *SC = SM.getSchedClassDesc(RD.SchedClassID);
PRF.collectWrites(RS, Writes);
PRF.collectWrites(STI, RS, Writes, CommittedWrites);
for (const WriteRef &WR : Writes) {
const WriteState *WS = WR.getWriteState();
unsigned WriteResID = WS->getWriteResourceID();
@ -118,6 +119,19 @@ static unsigned checkRegisterHazard(const RegisterFile &PRF,
}
}
Writes.clear();
for (const WriteRef &WR : CommittedWrites) {
unsigned WriteResID = WR.getWriteResourceID();
assert(!WR.getWriteState() && "Should be already committed!");
assert(WR.hasKnownWriteBackCycle() && "Invalid write!");
assert(STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID) < 0);
unsigned ReadAdvance = static_cast<unsigned>(
-STI.getReadAdvanceCycles(SC, RD.UseIndex, WriteResID));
unsigned Elapsed = PRF.getElapsedCyclesFromWriteBack(WR);
assert(Elapsed < ReadAdvance && "Should not have been added to the set!");
unsigned CyclesLeft = (ReadAdvance - Elapsed);
StallCycles = std::max(StallCycles, CyclesLeft);
}
}
return StallCycles;
@ -293,6 +307,8 @@ llvm::Error InOrderIssueStage::updateIssuedInst() {
llvm::Error InOrderIssueStage::cycleStart() {
NumIssued = 0;
PRF.cycleStart();
// Release consumed resources.
SmallVector<ResourceRef, 4> Freed;
RM->cycleEvent(Freed);
@ -320,6 +336,8 @@ llvm::Error InOrderIssueStage::cycleStart() {
}
llvm::Error InOrderIssueStage::cycleEnd() {
PRF.cycleEnd();
if (StallCyclesLeft > 0)
--StallCyclesLeft;

View File

@ -23,6 +23,8 @@ namespace llvm {
namespace mca {
llvm::Error RetireStage::cycleStart() {
PRF.cycleStart();
const unsigned MaxRetirePerCycle = RCU.getMaxRetirePerCycle();
unsigned NumRetired = 0;
while (!RCU.isEmpty()) {
@ -46,9 +48,15 @@ llvm::Error RetireStage::cycleStart() {
return llvm::ErrorSuccess();
}
// End-of-cycle hook: forwards the event to the register file and reports
// success.
llvm::Error RetireStage::cycleEnd() {
  PRF.cycleEnd();
  return llvm::Error::success();
}
llvm::Error RetireStage::execute(InstRef &IR) {
Instruction &IS = *IR.getInstruction();
PRF.onInstructionExecuted(&IS);
unsigned TokenID = IS.getRCUTokenID();
if (TokenID != RetireControlUnit::UnhandledTokenID) {
RCU.onInstructionExecuted(TokenID);

View File

@ -0,0 +1,90 @@
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=btver2 -iterations=1 -timeline < %s | FileCheck %s
add %ebx, %ebx
vpinsrd $1, %ebx, %xmm0, %xmm1
vpinsrd $1, %ebx, %xmm2, %xmm3
vpinsrd $2, %ebx, %xmm4, %xmm5
vpinsrd $2, %ebx, %xmm6, %xmm7
vpinsrd $3, %ebx, %xmm8, %xmm10
# CHECK: Iterations: 1
# CHECK-NEXT: Instructions: 6
# CHECK-NEXT: Total Cycles: 13
# CHECK-NEXT: Total uOps: 11
# CHECK: Dispatch Width: 2
# CHECK-NEXT: uOps Per Cycle: 0.85
# CHECK-NEXT: IPC: 0.46
# CHECK-NEXT: Block RThroughput: 5.5
# CHECK: Instruction Info:
# CHECK-NEXT: [1]: #uOps
# CHECK-NEXT: [2]: Latency
# CHECK-NEXT: [3]: RThroughput
# CHECK-NEXT: [4]: MayLoad
# CHECK-NEXT: [5]: MayStore
# CHECK-NEXT: [6]: HasSideEffects (U)
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
# CHECK-NEXT: 1 1 0.50 addl %ebx, %ebx
# CHECK-NEXT: 2 7 0.50 vpinsrd $1, %ebx, %xmm0, %xmm1
# CHECK-NEXT: 2 7 0.50 vpinsrd $1, %ebx, %xmm2, %xmm3
# CHECK-NEXT: 2 7 0.50 vpinsrd $2, %ebx, %xmm4, %xmm5
# CHECK-NEXT: 2 7 0.50 vpinsrd $2, %ebx, %xmm6, %xmm7
# CHECK-NEXT: 2 7 0.50 vpinsrd $3, %ebx, %xmm8, %xmm10
# CHECK: Resources:
# CHECK-NEXT: [0] - JALU0
# CHECK-NEXT: [1] - JALU1
# CHECK-NEXT: [2] - JDiv
# CHECK-NEXT: [3] - JFPA
# CHECK-NEXT: [4] - JFPM
# CHECK-NEXT: [5] - JFPU0
# CHECK-NEXT: [6] - JFPU1
# CHECK-NEXT: [7] - JLAGU
# CHECK-NEXT: [8] - JMul
# CHECK-NEXT: [9] - JSAGU
# CHECK-NEXT: [10] - JSTC
# CHECK-NEXT: [11] - JVALU0
# CHECK-NEXT: [12] - JVALU1
# CHECK-NEXT: [13] - JVIMUL
# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13]
# CHECK-NEXT: - 1.00 - - - 2.00 3.00 - - - - 2.00 3.00 -
# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions:
# CHECK-NEXT: - 1.00 - - - - - - - - - - - - addl %ebx, %ebx
# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpinsrd $1, %ebx, %xmm0, %xmm1
# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpinsrd $1, %ebx, %xmm2, %xmm3
# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpinsrd $2, %ebx, %xmm4, %xmm5
# CHECK-NEXT: - - - - - 1.00 - - - - - 1.00 - - vpinsrd $2, %ebx, %xmm6, %xmm7
# CHECK-NEXT: - - - - - - 1.00 - - - - - 1.00 - vpinsrd $3, %ebx, %xmm8, %xmm10
# CHECK: Timeline view:
# CHECK-NEXT: 012
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeER . . . addl %ebx, %ebx
# CHECK-NEXT: [0,1] .D======eER . vpinsrd $1, %ebx, %xmm0, %xmm1
# CHECK-NEXT: [0,2] . D=====eER . vpinsrd $1, %ebx, %xmm2, %xmm3
# CHECK-NEXT: [0,3] . D=====eER. vpinsrd $2, %ebx, %xmm4, %xmm5
# CHECK-NEXT: [0,4] . D====eER. vpinsrd $2, %ebx, %xmm6, %xmm7
# CHECK-NEXT: [0,5] . D====eER vpinsrd $3, %ebx, %xmm8, %xmm10
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 addl %ebx, %ebx
# CHECK-NEXT: 1. 1 7.0 0.0 0.0 vpinsrd $1, %ebx, %xmm0, %xmm1
# CHECK-NEXT: 2. 1 6.0 0.0 0.0 vpinsrd $1, %ebx, %xmm2, %xmm3
# CHECK-NEXT: 3. 1 6.0 1.0 0.0 vpinsrd $2, %ebx, %xmm4, %xmm5
# CHECK-NEXT: 4. 1 5.0 1.0 0.0 vpinsrd $2, %ebx, %xmm6, %xmm7
# CHECK-NEXT: 5. 1 5.0 2.0 0.0 vpinsrd $3, %ebx, %xmm8, %xmm10
# CHECK-NEXT: 1 5.0 0.8 0.0 <total>