forked from OSchip/llvm-project
[X86][MCA] Address the latest issues with MULX reported in PR51495.
It turns out that SchedWrite WriteIMulH was always assigned to the low half of the result of a MULX (rather than to the high half). To avoid confusion, this patch swaps the two MULX writes in the tablegen definition of MULX32/64. That way, write names better describe what they actually refer to; this also avoids further complications if in future we decide to reuse the same MulH writes to also model other scalar integer multiply instructions. I also had to swap the latency values for the two MULX writes to make sure that the change is effectively an NFC. In fact, none of the existing x86 tests were affected by this small refactoring. This patch also fixes a bug in MCA: a wrong latency value was propagated for instructions that perform multiple writes to a same register. This last issue was found by Roman while testing MULX on targets that define a different latency for the Low/High part of the result. Differential Revision: https://reviews.llvm.org/D108727
This commit is contained in:
parent
b475ce39e8
commit
4a5b191703
|
@ -288,6 +288,17 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
|
|||
// If this move has been eliminated, then method tryEliminateMoveOrSwap should
|
||||
// have already updated all the register mappings.
|
||||
if (!IsEliminated) {
|
||||
// Check if this is one of multiple writes performed by this
|
||||
// instruction to register RegID.
|
||||
const WriteRef &OtherWrite = RegisterMappings[RegID].first;
|
||||
const WriteState *OtherWS = OtherWrite.getWriteState();
|
||||
if (OtherWS && OtherWrite.getSourceIndex() == Write.getSourceIndex()) {
|
||||
if (OtherWS->getLatency() > WS.getLatency()) {
|
||||
// Conservatively keep the slowest write to RegID.
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
// Update the mapping for register RegID including its sub-registers.
|
||||
RegisterMappings[RegID].first = Write;
|
||||
RegisterMappings[RegID].second.AliasRegID = 0U;
|
||||
|
|
|
@ -1497,13 +1497,13 @@ multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
|
|||
let hasSideEffects = 0 in {
|
||||
def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
|
||||
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
|
||||
[]>, T8XD, VEX_4V, Sched<[sched, WriteIMulH]>;
|
||||
[]>, T8XD, VEX_4V, Sched<[WriteIMulH, sched]>;
|
||||
|
||||
let mayLoad = 1 in
|
||||
def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
|
||||
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
|
||||
[]>, T8XD, VEX_4V,
|
||||
Sched<[sched.Folded, WriteIMulHLd,
|
||||
Sched<[WriteIMulHLd, sched.Folded,
|
||||
// Memory operand.
|
||||
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
|
||||
// Implicit read of EDX/RDX
|
||||
|
|
|
@ -142,14 +142,14 @@ defm : X86WriteRes<WriteIMul16Imm, [BWPort1,BWPort0156], 4, [1,1], 2>;
|
|||
defm : X86WriteRes<WriteIMul16ImmLd, [BWPort1,BWPort0156,BWPort23], 8, [1,1,1], 3>;
|
||||
defm : BWWriteResPair<WriteIMul16Reg, [BWPort1], 3>;
|
||||
defm : BWWriteResPair<WriteIMul32, [BWPort1,BWPort06,BWPort0156], 4, [1,1,1], 3>;
|
||||
defm : BWWriteResPair<WriteMULX32, [BWPort1,BWPort06,BWPort0156], 4, [1,1,1], 3>;
|
||||
defm : BWWriteResPair<WriteMULX32, [BWPort1,BWPort06,BWPort0156], 3, [1,1,1], 3>;
|
||||
defm : BWWriteResPair<WriteIMul32Imm, [BWPort1], 3>;
|
||||
defm : BWWriteResPair<WriteIMul32Reg, [BWPort1], 3>;
|
||||
defm : BWWriteResPair<WriteIMul64, [BWPort1,BWPort5], 4, [1,1], 2>;
|
||||
defm : BWWriteResPair<WriteMULX64, [BWPort1,BWPort5], 4, [1,1], 2>;
|
||||
defm : BWWriteResPair<WriteMULX64, [BWPort1,BWPort5], 3, [1,1], 2>;
|
||||
defm : BWWriteResPair<WriteIMul64Imm, [BWPort1], 3>;
|
||||
defm : BWWriteResPair<WriteIMul64Reg, [BWPort1], 3>;
|
||||
def BWWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 3; }
|
||||
def BWWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
|
||||
def : WriteRes<WriteIMulHLd, []> {
|
||||
let Latency = !add(BWWriteIMulH.Latency, BroadwellModel.LoadLatency);
|
||||
}
|
||||
|
|
|
@ -144,14 +144,14 @@ defm : X86WriteRes<WriteIMul16Imm, [HWPort1,HWPort0156], 4, [1,1], 2>;
|
|||
defm : X86WriteRes<WriteIMul16ImmLd, [HWPort1,HWPort0156,HWPort23], 8, [1,1,1], 3>;
|
||||
defm : HWWriteResPair<WriteIMul16Reg, [HWPort1], 3>;
|
||||
defm : HWWriteResPair<WriteIMul32, [HWPort1,HWPort06,HWPort0156], 4, [1,1,1], 3>;
|
||||
defm : HWWriteResPair<WriteMULX32, [HWPort1,HWPort06,HWPort0156], 4, [1,1,1], 3>;
|
||||
defm : HWWriteResPair<WriteMULX32, [HWPort1,HWPort06,HWPort0156], 3, [1,1,1], 3>;
|
||||
defm : HWWriteResPair<WriteIMul32Imm, [HWPort1], 3>;
|
||||
defm : HWWriteResPair<WriteIMul32Reg, [HWPort1], 3>;
|
||||
defm : HWWriteResPair<WriteIMul64, [HWPort1,HWPort6], 4, [1,1], 2>;
|
||||
defm : HWWriteResPair<WriteMULX64, [HWPort1,HWPort6], 4, [1,1], 2>;
|
||||
defm : HWWriteResPair<WriteMULX64, [HWPort1,HWPort6], 3, [1,1], 2>;
|
||||
defm : HWWriteResPair<WriteIMul64Imm, [HWPort1], 3>;
|
||||
defm : HWWriteResPair<WriteIMul64Reg, [HWPort1], 3>;
|
||||
def HWWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 3; }
|
||||
def HWWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
|
||||
def : WriteRes<WriteIMulHLd, []> {
|
||||
let Latency = !add(HWWriteIMulH.Latency, HaswellModel.LoadLatency);
|
||||
}
|
||||
|
|
|
@ -124,14 +124,14 @@ defm : X86WriteRes<WriteIMul16Imm, [SBPort1,SBPort015], 4, [1,1], 2>;
|
|||
defm : X86WriteRes<WriteIMul16ImmLd, [SBPort1,SBPort015,SBPort23], 8, [1,1,1], 3>;
|
||||
defm : SBWriteResPair<WriteIMul16Reg, [SBPort1], 3>;
|
||||
defm : SBWriteResPair<WriteIMul32, [SBPort1,SBPort05,SBPort015], 4, [1,1,1], 3>;
|
||||
defm : SBWriteResPair<WriteMULX32, [SBPort1,SBPort05,SBPort015], 4, [1,1,1], 3>;
|
||||
defm : SBWriteResPair<WriteMULX32, [SBPort1,SBPort05,SBPort015], 3, [1,1,1], 3>;
|
||||
defm : SBWriteResPair<WriteIMul32Imm, [SBPort1], 3>;
|
||||
defm : SBWriteResPair<WriteIMul32Reg, [SBPort1], 3>;
|
||||
defm : SBWriteResPair<WriteIMul64, [SBPort1,SBPort0], 4, [1,1], 2>;
|
||||
defm : SBWriteResPair<WriteMULX64, [SBPort1,SBPort0], 4, [1,1], 2>;
|
||||
defm : SBWriteResPair<WriteMULX64, [SBPort1,SBPort0], 3, [1,1], 2>;
|
||||
defm : SBWriteResPair<WriteIMul64Imm, [SBPort1], 3>;
|
||||
defm : SBWriteResPair<WriteIMul64Reg, [SBPort1], 3>;
|
||||
def SBWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 3; }
|
||||
def SBWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
|
||||
def : WriteRes<WriteIMulHLd, []> {
|
||||
let Latency = !add(SBWriteIMulH.Latency, SandyBridgeModel.LoadLatency);
|
||||
}
|
||||
|
|
|
@ -122,14 +122,14 @@ defm : X86WriteRes<WriteIMul16Imm, [SKLPort1,SKLPort0156], 4, [1,1], 2>;
|
|||
defm : X86WriteRes<WriteIMul16ImmLd, [SKLPort1,SKLPort0156,SKLPort23], 8, [1,1,1], 3>;
|
||||
defm : SKLWriteResPair<WriteIMul16Reg, [SKLPort1], 3>;
|
||||
defm : SKLWriteResPair<WriteIMul32, [SKLPort1,SKLPort06,SKLPort0156], 4, [1,1,1], 3>;
|
||||
defm : SKLWriteResPair<WriteMULX32, [SKLPort1,SKLPort06,SKLPort0156], 4, [1,1,1], 3>;
|
||||
defm : SKLWriteResPair<WriteMULX32, [SKLPort1,SKLPort06,SKLPort0156], 3, [1,1,1], 3>;
|
||||
defm : SKLWriteResPair<WriteIMul32Imm, [SKLPort1], 3>;
|
||||
defm : SKLWriteResPair<WriteIMul32Reg, [SKLPort1], 3>;
|
||||
defm : SKLWriteResPair<WriteIMul64, [SKLPort1,SKLPort5], 4, [1,1], 2>;
|
||||
defm : SKLWriteResPair<WriteMULX64, [SKLPort1,SKLPort5], 4, [1,1], 2>;
|
||||
defm : SKLWriteResPair<WriteMULX64, [SKLPort1,SKLPort5], 3, [1,1], 2>;
|
||||
defm : SKLWriteResPair<WriteIMul64Imm, [SKLPort1], 3>;
|
||||
defm : SKLWriteResPair<WriteIMul64Reg, [SKLPort1], 3>;
|
||||
def SKLWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 3; }
|
||||
def SKLWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
|
||||
def : WriteRes<WriteIMulHLd, []> {
|
||||
let Latency = !add(SKLWriteIMulH.Latency, SkylakeClientModel.LoadLatency);
|
||||
}
|
||||
|
|
|
@ -123,14 +123,14 @@ defm : X86WriteRes<WriteIMul16ImmLd, [SKXPort1,SKXPort0156,SKXPort23], 8, [1,1
|
|||
defm : X86WriteRes<WriteIMul16Reg, [SKXPort1], 3, [1], 1>;
|
||||
defm : X86WriteRes<WriteIMul16RegLd, [SKXPort1,SKXPort0156,SKXPort23], 8, [1,1,1], 3>;
|
||||
defm : SKXWriteResPair<WriteIMul32, [SKXPort1,SKXPort06,SKXPort0156], 4, [1,1,1], 3>;
|
||||
defm : SKXWriteResPair<WriteMULX32, [SKXPort1,SKXPort06,SKXPort0156], 4, [1,1,1], 3>;
|
||||
defm : SKXWriteResPair<WriteMULX32, [SKXPort1,SKXPort06,SKXPort0156], 3, [1,1,1], 3>;
|
||||
defm : SKXWriteResPair<WriteIMul32Imm, [SKXPort1], 3>;
|
||||
defm : SKXWriteResPair<WriteIMul32Reg, [SKXPort1], 3>;
|
||||
defm : SKXWriteResPair<WriteIMul64, [SKXPort1,SKXPort5], 4, [1,1], 2>;
|
||||
defm : SKXWriteResPair<WriteMULX64, [SKXPort1,SKXPort5], 4, [1,1], 2>;
|
||||
defm : SKXWriteResPair<WriteMULX64, [SKXPort1,SKXPort5], 3, [1,1], 2>;
|
||||
defm : SKXWriteResPair<WriteIMul64Imm, [SKXPort1], 3>;
|
||||
defm : SKXWriteResPair<WriteIMul64Reg, [SKXPort1], 3>;
|
||||
def SKXWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 3; }
|
||||
def SKXWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
|
||||
def : WriteRes<WriteIMulHLd, []> {
|
||||
let Latency = !add(SKXWriteIMulH.Latency, SkylakeServerModel.LoadLatency);
|
||||
}
|
||||
|
|
|
@ -16,12 +16,12 @@ mulxq %rax, %rax, %rax
|
|||
|
||||
# CHECK: Iterations: 2
|
||||
# CHECK-NEXT: Instructions: 2
|
||||
# CHECK-NEXT: Total Cycles: 10
|
||||
# CHECK-NEXT: Total Cycles: 11
|
||||
# CHECK-NEXT: Total uOps: 8
|
||||
|
||||
# CHECK: Dispatch Width: 4
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.80
|
||||
# CHECK-NEXT: IPC: 0.20
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.73
|
||||
# CHECK-NEXT: IPC: 0.18
|
||||
# CHECK-NEXT: Block RThroughput: 1.0
|
||||
|
||||
# CHECK: Instruction Info:
|
||||
|
@ -56,10 +56,11 @@ mulxq %rax, %rax, %rax
|
|||
# CHECK-NEXT: - - 0.50 1.00 - - - 0.50 1.00 - mulxl %eax, %eax, %eax
|
||||
|
||||
# CHECK: Timeline view:
|
||||
# CHECK-NEXT: 0
|
||||
# CHECK-NEXT: Index 0123456789
|
||||
|
||||
# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %eax
|
||||
# CHECK-NEXT: [1,0] .D==eeeeER mulxl %eax, %eax, %eax
|
||||
# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %eax
|
||||
# CHECK-NEXT: [1,0] .D===eeeeER mulxl %eax, %eax, %eax
|
||||
|
||||
# CHECK: Average Wait times (based on the timeline view):
|
||||
# CHECK-NEXT: [0]: Executions
|
||||
|
@ -68,18 +69,18 @@ mulxq %rax, %rax, %rax
|
|||
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
|
||||
|
||||
# CHECK: [0] [1] [2] [3]
|
||||
# CHECK-NEXT: 0. 2 2.0 0.5 0.0 mulxl %eax, %eax, %eax
|
||||
# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxl %eax, %eax, %eax
|
||||
|
||||
# CHECK: [1] Code Region
|
||||
|
||||
# CHECK: Iterations: 2
|
||||
# CHECK-NEXT: Instructions: 2
|
||||
# CHECK-NEXT: Total Cycles: 10
|
||||
# CHECK-NEXT: Total Cycles: 11
|
||||
# CHECK-NEXT: Total uOps: 6
|
||||
|
||||
# CHECK: Dispatch Width: 4
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.60
|
||||
# CHECK-NEXT: IPC: 0.20
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.55
|
||||
# CHECK-NEXT: IPC: 0.18
|
||||
# CHECK-NEXT: Block RThroughput: 1.0
|
||||
|
||||
# CHECK: Instruction Info:
|
||||
|
@ -114,10 +115,11 @@ mulxq %rax, %rax, %rax
|
|||
# CHECK-NEXT: - - - 1.00 - - - - 1.00 - mulxq %rax, %rax, %rax
|
||||
|
||||
# CHECK: Timeline view:
|
||||
# CHECK-NEXT: 0
|
||||
# CHECK-NEXT: Index 0123456789
|
||||
|
||||
# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rax
|
||||
# CHECK-NEXT: [1,0] .D==eeeeER mulxq %rax, %rax, %rax
|
||||
# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rax
|
||||
# CHECK-NEXT: [1,0] .D===eeeeER mulxq %rax, %rax, %rax
|
||||
|
||||
# CHECK: Average Wait times (based on the timeline view):
|
||||
# CHECK-NEXT: [0]: Executions
|
||||
|
@ -126,4 +128,4 @@ mulxq %rax, %rax, %rax
|
|||
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
|
||||
|
||||
# CHECK: [0] [1] [2] [3]
|
||||
# CHECK-NEXT: 0. 2 2.0 0.5 0.0 mulxq %rax, %rax, %rax
|
||||
# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxq %rax, %rax, %rax
|
||||
|
|
|
@ -16,12 +16,12 @@ mulxq %rax, %rax, %rax
|
|||
|
||||
# CHECK: Iterations: 2
|
||||
# CHECK-NEXT: Instructions: 2
|
||||
# CHECK-NEXT: Total Cycles: 10
|
||||
# CHECK-NEXT: Total Cycles: 11
|
||||
# CHECK-NEXT: Total uOps: 8
|
||||
|
||||
# CHECK: Dispatch Width: 6
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.80
|
||||
# CHECK-NEXT: IPC: 0.20
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.73
|
||||
# CHECK-NEXT: IPC: 0.18
|
||||
# CHECK-NEXT: Block RThroughput: 1.0
|
||||
|
||||
# CHECK: Instruction Info:
|
||||
|
@ -56,10 +56,11 @@ mulxq %rax, %rax, %rax
|
|||
# CHECK-NEXT: - - 0.50 1.00 - - - 0.50 1.00 - mulxl %eax, %eax, %eax
|
||||
|
||||
# CHECK: Timeline view:
|
||||
# CHECK-NEXT: 0
|
||||
# CHECK-NEXT: Index 0123456789
|
||||
|
||||
# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %eax
|
||||
# CHECK-NEXT: [1,0] .D==eeeeER mulxl %eax, %eax, %eax
|
||||
# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %eax
|
||||
# CHECK-NEXT: [1,0] .D===eeeeER mulxl %eax, %eax, %eax
|
||||
|
||||
# CHECK: Average Wait times (based on the timeline view):
|
||||
# CHECK-NEXT: [0]: Executions
|
||||
|
@ -68,18 +69,18 @@ mulxq %rax, %rax, %rax
|
|||
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
|
||||
|
||||
# CHECK: [0] [1] [2] [3]
|
||||
# CHECK-NEXT: 0. 2 2.0 0.5 0.0 mulxl %eax, %eax, %eax
|
||||
# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxl %eax, %eax, %eax
|
||||
|
||||
# CHECK: [1] Code Region
|
||||
|
||||
# CHECK: Iterations: 2
|
||||
# CHECK-NEXT: Instructions: 2
|
||||
# CHECK-NEXT: Total Cycles: 10
|
||||
# CHECK-NEXT: Total Cycles: 11
|
||||
# CHECK-NEXT: Total uOps: 6
|
||||
|
||||
# CHECK: Dispatch Width: 6
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.60
|
||||
# CHECK-NEXT: IPC: 0.20
|
||||
# CHECK-NEXT: uOps Per Cycle: 0.55
|
||||
# CHECK-NEXT: IPC: 0.18
|
||||
# CHECK-NEXT: Block RThroughput: 1.0
|
||||
|
||||
# CHECK: Instruction Info:
|
||||
|
@ -114,10 +115,11 @@ mulxq %rax, %rax, %rax
|
|||
# CHECK-NEXT: - - - 1.00 - - - 1.00 - - mulxq %rax, %rax, %rax
|
||||
|
||||
# CHECK: Timeline view:
|
||||
# CHECK-NEXT: 0
|
||||
# CHECK-NEXT: Index 0123456789
|
||||
|
||||
# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rax
|
||||
# CHECK-NEXT: [1,0] D===eeeeER mulxq %rax, %rax, %rax
|
||||
# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rax
|
||||
# CHECK-NEXT: [1,0] D====eeeeER mulxq %rax, %rax, %rax
|
||||
|
||||
# CHECK: Average Wait times (based on the timeline view):
|
||||
# CHECK-NEXT: [0]: Executions
|
||||
|
@ -126,4 +128,4 @@ mulxq %rax, %rax, %rax
|
|||
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
|
||||
|
||||
# CHECK: [0] [1] [2] [3]
|
||||
# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxq %rax, %rax, %rax
|
||||
# CHECK-NEXT: 0. 2 3.0 0.5 0.0 mulxq %rax, %rax, %rax
|
||||
|
|
Loading…
Reference in New Issue