[X86][MCA] Address the latest issues with MULX reported in PR51495.

It turns out that SchedWrite WriteIMulH was always assigned to the low half of
the result of a MULX (rather than to the high half).

To avoid confusion, this patch swaps the two MULX writes in the tablegen
definition of MULX32/64.  That way, write names better describe what they
actually refer to; this also avoids further complications if in future we decide
to reuse the same MulH writes to also model other scalar integer multiply
instructions.  I also had to swap the latency values for the two MULX writes to
make sure that the change is effectively an NFC. In fact, none of the existing
x86 tests were affected by this small refactoring.

This patch also fixes a bug in MCA: a wrong latency value was propagated for
instructions that perform multiple writes to a same register.  This last issue
was found by Roman while testing MULX on targets that define a different latency
for the Low/High part of the result.

Differential Revision: https://reviews.llvm.org/D108727
This commit is contained in:
Andrea Di Biagio 2021-08-25 21:34:35 +01:00
parent b475ce39e8
commit 4a5b191703
9 changed files with 56 additions and 41 deletions

View File

@ -288,6 +288,17 @@ void RegisterFile::addRegisterWrite(WriteRef Write,
// If this move has been eliminated, then method tryEliminateMoveOrSwap should
// have already updated all the register mappings.
if (!IsEliminated) {
// Check if this is one of multiple writes performed by this
// instruction to register RegID.
const WriteRef &OtherWrite = RegisterMappings[RegID].first;
const WriteState *OtherWS = OtherWrite.getWriteState();
if (OtherWS && OtherWrite.getSourceIndex() == Write.getSourceIndex()) {
if (OtherWS->getLatency() > WS.getLatency()) {
// Conservatively keep the slowest write to RegID.
return;
}
}
// Update the mapping for register RegID including its sub-registers.
RegisterMappings[RegID].first = Write;
RegisterMappings[RegID].second.AliasRegID = 0U;

View File

@ -1497,13 +1497,13 @@ multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
let hasSideEffects = 0 in {
def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
[]>, T8XD, VEX_4V, Sched<[sched, WriteIMulH]>;
[]>, T8XD, VEX_4V, Sched<[WriteIMulH, sched]>;
let mayLoad = 1 in
def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
[]>, T8XD, VEX_4V,
Sched<[sched.Folded, WriteIMulHLd,
Sched<[WriteIMulHLd, sched.Folded,
// Memory operand.
ReadDefault, ReadDefault, ReadDefault, ReadDefault, ReadDefault,
// Implicit read of EDX/RDX

View File

@ -142,14 +142,14 @@ defm : X86WriteRes<WriteIMul16Imm, [BWPort1,BWPort0156], 4, [1,1], 2>;
defm : X86WriteRes<WriteIMul16ImmLd, [BWPort1,BWPort0156,BWPort23], 8, [1,1,1], 3>;
defm : BWWriteResPair<WriteIMul16Reg, [BWPort1], 3>;
defm : BWWriteResPair<WriteIMul32, [BWPort1,BWPort06,BWPort0156], 4, [1,1,1], 3>;
defm : BWWriteResPair<WriteMULX32, [BWPort1,BWPort06,BWPort0156], 4, [1,1,1], 3>;
defm : BWWriteResPair<WriteMULX32, [BWPort1,BWPort06,BWPort0156], 3, [1,1,1], 3>;
defm : BWWriteResPair<WriteIMul32Imm, [BWPort1], 3>;
defm : BWWriteResPair<WriteIMul32Reg, [BWPort1], 3>;
defm : BWWriteResPair<WriteIMul64, [BWPort1,BWPort5], 4, [1,1], 2>;
defm : BWWriteResPair<WriteMULX64, [BWPort1,BWPort5], 4, [1,1], 2>;
defm : BWWriteResPair<WriteMULX64, [BWPort1,BWPort5], 3, [1,1], 2>;
defm : BWWriteResPair<WriteIMul64Imm, [BWPort1], 3>;
defm : BWWriteResPair<WriteIMul64Reg, [BWPort1], 3>;
def BWWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 3; }
def BWWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
def : WriteRes<WriteIMulHLd, []> {
let Latency = !add(BWWriteIMulH.Latency, BroadwellModel.LoadLatency);
}

View File

@ -144,14 +144,14 @@ defm : X86WriteRes<WriteIMul16Imm, [HWPort1,HWPort0156], 4, [1,1], 2>;
defm : X86WriteRes<WriteIMul16ImmLd, [HWPort1,HWPort0156,HWPort23], 8, [1,1,1], 3>;
defm : HWWriteResPair<WriteIMul16Reg, [HWPort1], 3>;
defm : HWWriteResPair<WriteIMul32, [HWPort1,HWPort06,HWPort0156], 4, [1,1,1], 3>;
defm : HWWriteResPair<WriteMULX32, [HWPort1,HWPort06,HWPort0156], 4, [1,1,1], 3>;
defm : HWWriteResPair<WriteMULX32, [HWPort1,HWPort06,HWPort0156], 3, [1,1,1], 3>;
defm : HWWriteResPair<WriteIMul32Imm, [HWPort1], 3>;
defm : HWWriteResPair<WriteIMul32Reg, [HWPort1], 3>;
defm : HWWriteResPair<WriteIMul64, [HWPort1,HWPort6], 4, [1,1], 2>;
defm : HWWriteResPair<WriteMULX64, [HWPort1,HWPort6], 4, [1,1], 2>;
defm : HWWriteResPair<WriteMULX64, [HWPort1,HWPort6], 3, [1,1], 2>;
defm : HWWriteResPair<WriteIMul64Imm, [HWPort1], 3>;
defm : HWWriteResPair<WriteIMul64Reg, [HWPort1], 3>;
def HWWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 3; }
def HWWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
def : WriteRes<WriteIMulHLd, []> {
let Latency = !add(HWWriteIMulH.Latency, HaswellModel.LoadLatency);
}

View File

@ -124,14 +124,14 @@ defm : X86WriteRes<WriteIMul16Imm, [SBPort1,SBPort015], 4, [1,1], 2>;
defm : X86WriteRes<WriteIMul16ImmLd, [SBPort1,SBPort015,SBPort23], 8, [1,1,1], 3>;
defm : SBWriteResPair<WriteIMul16Reg, [SBPort1], 3>;
defm : SBWriteResPair<WriteIMul32, [SBPort1,SBPort05,SBPort015], 4, [1,1,1], 3>;
defm : SBWriteResPair<WriteMULX32, [SBPort1,SBPort05,SBPort015], 4, [1,1,1], 3>;
defm : SBWriteResPair<WriteMULX32, [SBPort1,SBPort05,SBPort015], 3, [1,1,1], 3>;
defm : SBWriteResPair<WriteIMul32Imm, [SBPort1], 3>;
defm : SBWriteResPair<WriteIMul32Reg, [SBPort1], 3>;
defm : SBWriteResPair<WriteIMul64, [SBPort1,SBPort0], 4, [1,1], 2>;
defm : SBWriteResPair<WriteMULX64, [SBPort1,SBPort0], 4, [1,1], 2>;
defm : SBWriteResPair<WriteMULX64, [SBPort1,SBPort0], 3, [1,1], 2>;
defm : SBWriteResPair<WriteIMul64Imm, [SBPort1], 3>;
defm : SBWriteResPair<WriteIMul64Reg, [SBPort1], 3>;
def SBWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 3; }
def SBWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
def : WriteRes<WriteIMulHLd, []> {
let Latency = !add(SBWriteIMulH.Latency, SandyBridgeModel.LoadLatency);
}

View File

@ -122,14 +122,14 @@ defm : X86WriteRes<WriteIMul16Imm, [SKLPort1,SKLPort0156], 4, [1,1], 2>;
defm : X86WriteRes<WriteIMul16ImmLd, [SKLPort1,SKLPort0156,SKLPort23], 8, [1,1,1], 3>;
defm : SKLWriteResPair<WriteIMul16Reg, [SKLPort1], 3>;
defm : SKLWriteResPair<WriteIMul32, [SKLPort1,SKLPort06,SKLPort0156], 4, [1,1,1], 3>;
defm : SKLWriteResPair<WriteMULX32, [SKLPort1,SKLPort06,SKLPort0156], 4, [1,1,1], 3>;
defm : SKLWriteResPair<WriteMULX32, [SKLPort1,SKLPort06,SKLPort0156], 3, [1,1,1], 3>;
defm : SKLWriteResPair<WriteIMul32Imm, [SKLPort1], 3>;
defm : SKLWriteResPair<WriteIMul32Reg, [SKLPort1], 3>;
defm : SKLWriteResPair<WriteIMul64, [SKLPort1,SKLPort5], 4, [1,1], 2>;
defm : SKLWriteResPair<WriteMULX64, [SKLPort1,SKLPort5], 4, [1,1], 2>;
defm : SKLWriteResPair<WriteMULX64, [SKLPort1,SKLPort5], 3, [1,1], 2>;
defm : SKLWriteResPair<WriteIMul64Imm, [SKLPort1], 3>;
defm : SKLWriteResPair<WriteIMul64Reg, [SKLPort1], 3>;
def SKLWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 3; }
def SKLWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
def : WriteRes<WriteIMulHLd, []> {
let Latency = !add(SKLWriteIMulH.Latency, SkylakeClientModel.LoadLatency);
}

View File

@ -123,14 +123,14 @@ defm : X86WriteRes<WriteIMul16ImmLd, [SKXPort1,SKXPort0156,SKXPort23], 8, [1,1
defm : X86WriteRes<WriteIMul16Reg, [SKXPort1], 3, [1], 1>;
defm : X86WriteRes<WriteIMul16RegLd, [SKXPort1,SKXPort0156,SKXPort23], 8, [1,1,1], 3>;
defm : SKXWriteResPair<WriteIMul32, [SKXPort1,SKXPort06,SKXPort0156], 4, [1,1,1], 3>;
defm : SKXWriteResPair<WriteMULX32, [SKXPort1,SKXPort06,SKXPort0156], 4, [1,1,1], 3>;
defm : SKXWriteResPair<WriteMULX32, [SKXPort1,SKXPort06,SKXPort0156], 3, [1,1,1], 3>;
defm : SKXWriteResPair<WriteIMul32Imm, [SKXPort1], 3>;
defm : SKXWriteResPair<WriteIMul32Reg, [SKXPort1], 3>;
defm : SKXWriteResPair<WriteIMul64, [SKXPort1,SKXPort5], 4, [1,1], 2>;
defm : SKXWriteResPair<WriteMULX64, [SKXPort1,SKXPort5], 4, [1,1], 2>;
defm : SKXWriteResPair<WriteMULX64, [SKXPort1,SKXPort5], 3, [1,1], 2>;
defm : SKXWriteResPair<WriteIMul64Imm, [SKXPort1], 3>;
defm : SKXWriteResPair<WriteIMul64Reg, [SKXPort1], 3>;
def SKXWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 3; }
def SKXWriteIMulH : WriteRes<WriteIMulH, []> { let Latency = 4; }
def : WriteRes<WriteIMulHLd, []> {
let Latency = !add(SKXWriteIMulH.Latency, SkylakeServerModel.LoadLatency);
}

View File

@ -16,12 +16,12 @@ mulxq %rax, %rax, %rax
# CHECK: Iterations: 2
# CHECK-NEXT: Instructions: 2
# CHECK-NEXT: Total Cycles: 10
# CHECK-NEXT: Total Cycles: 11
# CHECK-NEXT: Total uOps: 8
# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.80
# CHECK-NEXT: IPC: 0.20
# CHECK-NEXT: uOps Per Cycle: 0.73
# CHECK-NEXT: IPC: 0.18
# CHECK-NEXT: Block RThroughput: 1.0
# CHECK: Instruction Info:
@ -56,10 +56,11 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: - - 0.50 1.00 - - - 0.50 1.00 - mulxl %eax, %eax, %eax
# CHECK: Timeline view:
# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %eax
# CHECK-NEXT: [1,0] .D==eeeeER mulxl %eax, %eax, %eax
# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %eax
# CHECK-NEXT: [1,0] .D===eeeeER mulxl %eax, %eax, %eax
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -68,18 +69,18 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 2.0 0.5 0.0 mulxl %eax, %eax, %eax
# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxl %eax, %eax, %eax
# CHECK: [1] Code Region
# CHECK: Iterations: 2
# CHECK-NEXT: Instructions: 2
# CHECK-NEXT: Total Cycles: 10
# CHECK-NEXT: Total Cycles: 11
# CHECK-NEXT: Total uOps: 6
# CHECK: Dispatch Width: 4
# CHECK-NEXT: uOps Per Cycle: 0.60
# CHECK-NEXT: IPC: 0.20
# CHECK-NEXT: uOps Per Cycle: 0.55
# CHECK-NEXT: IPC: 0.18
# CHECK-NEXT: Block RThroughput: 1.0
# CHECK: Instruction Info:
@ -114,10 +115,11 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: - - - 1.00 - - - - 1.00 - mulxq %rax, %rax, %rax
# CHECK: Timeline view:
# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rax
# CHECK-NEXT: [1,0] .D==eeeeER mulxq %rax, %rax, %rax
# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rax
# CHECK-NEXT: [1,0] .D===eeeeER mulxq %rax, %rax, %rax
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -126,4 +128,4 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 2.0 0.5 0.0 mulxq %rax, %rax, %rax
# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxq %rax, %rax, %rax

View File

@ -16,12 +16,12 @@ mulxq %rax, %rax, %rax
# CHECK: Iterations: 2
# CHECK-NEXT: Instructions: 2
# CHECK-NEXT: Total Cycles: 10
# CHECK-NEXT: Total Cycles: 11
# CHECK-NEXT: Total uOps: 8
# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.80
# CHECK-NEXT: IPC: 0.20
# CHECK-NEXT: uOps Per Cycle: 0.73
# CHECK-NEXT: IPC: 0.18
# CHECK-NEXT: Block RThroughput: 1.0
# CHECK: Instruction Info:
@ -56,10 +56,11 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: - - 0.50 1.00 - - - 0.50 1.00 - mulxl %eax, %eax, %eax
# CHECK: Timeline view:
# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %eax
# CHECK-NEXT: [1,0] .D==eeeeER mulxl %eax, %eax, %eax
# CHECK: [0,0] DeeeeER . mulxl %eax, %eax, %eax
# CHECK-NEXT: [1,0] .D===eeeeER mulxl %eax, %eax, %eax
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -68,18 +69,18 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 2.0 0.5 0.0 mulxl %eax, %eax, %eax
# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxl %eax, %eax, %eax
# CHECK: [1] Code Region
# CHECK: Iterations: 2
# CHECK-NEXT: Instructions: 2
# CHECK-NEXT: Total Cycles: 10
# CHECK-NEXT: Total Cycles: 11
# CHECK-NEXT: Total uOps: 6
# CHECK: Dispatch Width: 6
# CHECK-NEXT: uOps Per Cycle: 0.60
# CHECK-NEXT: IPC: 0.20
# CHECK-NEXT: uOps Per Cycle: 0.55
# CHECK-NEXT: IPC: 0.18
# CHECK-NEXT: Block RThroughput: 1.0
# CHECK: Instruction Info:
@ -114,10 +115,11 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: - - - 1.00 - - - 1.00 - - mulxq %rax, %rax, %rax
# CHECK: Timeline view:
# CHECK-NEXT: 0
# CHECK-NEXT: Index 0123456789
# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rax
# CHECK-NEXT: [1,0] D===eeeeER mulxq %rax, %rax, %rax
# CHECK: [0,0] DeeeeER . mulxq %rax, %rax, %rax
# CHECK-NEXT: [1,0] D====eeeeER mulxq %rax, %rax, %rax
# CHECK: Average Wait times (based on the timeline view):
# CHECK-NEXT: [0]: Executions
@ -126,4 +128,4 @@ mulxq %rax, %rax, %rax
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
# CHECK: [0] [1] [2] [3]
# CHECK-NEXT: 0. 2 2.5 0.5 0.0 mulxq %rax, %rax, %rax
# CHECK-NEXT: 0. 2 3.0 0.5 0.0 mulxq %rax, %rax, %rax