From 89310f56c80cbf277cd0edc8cbdda864f64c82d8 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 29 Mar 2018 20:41:39 +0000 Subject: [PATCH] [X86] Correct the placement of ReadAfterLd in BEXTR and BZHI. Add dedicated SchedRW for BEXTR/BZHI. These instructions have the memory operand before the register operand. So we need to put ReadDefault for all the load ops first, then the ReadAfterLd for the register operand. Differential Revision: https://reviews.llvm.org/D44838 llvm-svn: 328823 --- llvm/lib/Target/X86/X86InstrInfo.td | 19 ++++++++++++------- llvm/lib/Target/X86/X86SchedBroadwell.td | 16 +++++----------- llvm/lib/Target/X86/X86SchedHaswell.td | 16 +++++----------- llvm/lib/Target/X86/X86SchedSandyBridge.td | 5 +++++ llvm/lib/Target/X86/X86SchedSkylakeClient.td | 16 +++++----------- llvm/lib/Target/X86/X86SchedSkylakeServer.td | 16 +++++----------- llvm/lib/Target/X86/X86Schedule.td | 4 ++++ llvm/lib/Target/X86/X86ScheduleBtVer2.td | 4 ++++ llvm/lib/Target/X86/X86ScheduleSLM.td | 5 +++++ llvm/lib/Target/X86/X86ScheduleZnver1.td | 18 +++++------------- llvm/test/CodeGen/X86/bmi-schedule.ll | 8 ++++---- llvm/test/CodeGen/X86/bmi2-schedule.ll | 8 ++++---- 12 files changed, 63 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td index 5eaf55b09f2e..1dd464c1c3bf 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.td +++ b/llvm/lib/Target/X86/X86InstrInfo.td @@ -2378,30 +2378,35 @@ let Predicates = [HasBMI] in { multiclass bmi_bextr_bzhi opc, string mnemonic, RegisterClass RC, X86MemOperand x86memop, Intrinsic Int, - PatFrag ld_frag> { + PatFrag ld_frag, X86FoldableSchedWrite Sched> { def rr : I, - T8PS, VEX, Sched<[WriteALU]>; + T8PS, VEX, Sched<[Sched]>; def rm : I, T8PS, VEX, - Sched<[WriteALULd, ReadAfterLd]>; + Sched<[Sched.Folded, + // x86memop:$src1 + ReadDefault, ReadDefault, ReadDefault, ReadDefault, + ReadDefault, + // RC:$src2 + ReadAfterLd]>; } let Predicates = [HasBMI], Defs = [EFLAGS] in { defm BEXTR32 : 
bmi_bextr_bzhi<0xF7, "bextr{l}", GR32, i32mem, - int_x86_bmi_bextr_32, loadi32>; + int_x86_bmi_bextr_32, loadi32, WriteBEXTR>; defm BEXTR64 : bmi_bextr_bzhi<0xF7, "bextr{q}", GR64, i64mem, - int_x86_bmi_bextr_64, loadi64>, VEX_W; + int_x86_bmi_bextr_64, loadi64, WriteBEXTR>, VEX_W; } let Predicates = [HasBMI2], Defs = [EFLAGS] in { defm BZHI32 : bmi_bextr_bzhi<0xF5, "bzhi{l}", GR32, i32mem, - int_x86_bmi_bzhi_32, loadi32>; + int_x86_bmi_bzhi_32, loadi32, WriteBZHI>; defm BZHI64 : bmi_bextr_bzhi<0xF5, "bzhi{q}", GR64, i64mem, - int_x86_bmi_bzhi_64, loadi64>, VEX_W; + int_x86_bmi_bzhi_64, loadi64, WriteBZHI>, VEX_W; } def CountTrailingOnes : SDNodeXForm; // Integer shifts and rotates. defm : BWWriteResPair; +// BMI1 BEXTR, BMI2 BZHI +defm : BWWriteResPair; +defm : BWWriteResPair; + // Loads, stores, and moves, not folded with other operations. def : WriteRes { let Latency = 5; } def : WriteRes; @@ -492,7 +496,6 @@ def: InstRW<[BWWriteResGroup7], (instregex "ANDN(32|64)rr", "BLSI(32|64)rr", "BLSMSK(32|64)rr", "BLSR(32|64)rr", - "BZHI(32|64)rr", "LEA(16|32|64)(_32)?r", "MMX_PABSBrr", "MMX_PABSDrr", @@ -780,8 +783,7 @@ def BWWriteResGroup19 : SchedWriteRes<[BWPort06,BWPort15]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[BWWriteResGroup19], (instregex "BEXTR(32|64)rr", - "BSWAP(16|32|64)r")>; +def: InstRW<[BWWriteResGroup19], (instregex "BSWAP(16|32|64)r")>; def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> { let Latency = 2; @@ -1442,7 +1444,6 @@ def: InstRW<[BWWriteResGroup64], (instregex "ANDN(32|64)rm", "BLSI(32|64)rm", "BLSMSK(32|64)rm", "BLSR(32|64)rm", - "BZHI(32|64)rm", "MMX_PABSBrm", "MMX_PABSDrm", "MMX_PABSWrm", @@ -1833,13 +1834,6 @@ def BWWriteResGroup84 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> { def: InstRW<[BWWriteResGroup84], (instregex "LRETQ", "RETQ")>; -def BWWriteResGroup85 : SchedWriteRes<[BWPort23,BWPort06,BWPort15]> { - let Latency = 7; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: 
InstRW<[BWWriteResGroup85], (instregex "BEXTR(32|64)rm")>; - def BWWriteResGroup86 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> { let Latency = 7; let NumMicroOps = 3; diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td index 97825257cb7f..bd16dc6d5303 100644 --- a/llvm/lib/Target/X86/X86SchedHaswell.td +++ b/llvm/lib/Target/X86/X86SchedHaswell.td @@ -128,6 +128,10 @@ defm : HWWriteResPair; defm : HWWriteResPair; defm : HWWriteResPair; +// BMI1 BEXTR, BMI2 BZHI +defm : HWWriteResPair; +defm : HWWriteResPair; + // This is quite rough, latency depends on the dividend. defm : HWWriteResPair; // Scalar and vector floating point. @@ -844,7 +848,6 @@ def: InstRW<[HWWriteResGroup8], (instregex "ANDN(32|64)rr", "BLSI(32|64)rr", "BLSMSK(32|64)rr", "BLSR(32|64)rr", - "BZHI(32|64)rr", "LEA(16|32|64)(_32)?r", "MMX_PABSBrr", "MMX_PABSDrr", @@ -1230,7 +1233,6 @@ def: InstRW<[HWWriteResGroup16], (instregex "ANDN(32|64)rm", "BLSI(32|64)rm", "BLSMSK(32|64)rm", "BLSR(32|64)rm", - "BZHI(32|64)rm", "MMX_PABSBrm", "MMX_PABSDrm", "MMX_PABSWrm", @@ -1606,8 +1608,7 @@ def HWWriteResGroup34 : SchedWriteRes<[HWPort06,HWPort15]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[HWWriteResGroup34], (instregex "BEXTR(32|64)rr", - "BSWAP(16|32|64)r")>; +def: InstRW<[HWWriteResGroup34], (instregex "BSWAP(16|32|64)r")>; def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> { let Latency = 2; @@ -1711,13 +1712,6 @@ def: InstRW<[HWWriteResGroup41], (instregex "LRETQ", "RETL", "RETQ")>; -def HWWriteResGroup42 : SchedWriteRes<[HWPort23,HWPort06,HWPort15]> { - let Latency = 7; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[HWWriteResGroup42], (instregex "BEXTR(32|64)rm")>; - def HWWriteResGroup43 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> { let Latency = 7; let NumMicroOps = 3; diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td index 
e5fc16844bf0..7316de6af724 100644 --- a/llvm/lib/Target/X86/X86SchedSandyBridge.td +++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td @@ -119,6 +119,11 @@ defm : SBWriteResPair; defm : SBWriteResPair; defm : SBWriteResPair; +// BMI1 BEXTR, BMI2 BZHI +// NOTE: These don't exist on Sandy Bridge. Ports are guesses. +defm : SBWriteResPair; +defm : SBWriteResPair; + // Scalar and vector floating point. def : WriteRes; def : WriteRes { let Latency = 6; } diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td index bceb43541d67..2a6658e31ae8 100644 --- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td @@ -120,6 +120,10 @@ defm : SKLWriteResPair; // Integer shifts and rotates. defm : SKLWriteResPair; +// BMI1 BEXTR, BMI2 BZHI +defm : SKLWriteResPair; +defm : SKLWriteResPair; + // Loads, stores, and moves, not folded with other operations. def : WriteRes { let Latency = 5; } def : WriteRes; @@ -558,7 +562,6 @@ def: InstRW<[SKLWriteResGroup8], (instregex "ANDN(32|64)rr", "BLSI(32|64)rr", "BLSMSK(32|64)rr", "BLSR(32|64)rr", - "BZHI(32|64)rr", "LEA(16|32|64)(_32)?r")>; def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> { @@ -802,8 +805,7 @@ def SKLWriteResGroup22 : SchedWriteRes<[SKLPort06,SKLPort15]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKLWriteResGroup22], (instregex "BEXTR(32|64)rr", - "BSWAP(16|32|64)r")>; +def: InstRW<[SKLWriteResGroup22], (instregex "BSWAP(16|32|64)r")>; def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> { let Latency = 2; @@ -1464,7 +1466,6 @@ def: InstRW<[SKLWriteResGroup75], (instregex "ANDN(32|64)rm", "BLSI(32|64)rm", "BLSMSK(32|64)rm", "BLSR(32|64)rm", - "BZHI(32|64)rm", "MOVBE(16|32|64)rm")>; def SKLWriteResGroup76 : SchedWriteRes<[SKLPort23,SKLPort0156]> { @@ -1806,13 +1807,6 @@ def SKLWriteResGroup98 : SchedWriteRes<[SKLPort6,SKLPort23,SKLPort0156]> { def: InstRW<[SKLWriteResGroup98], (instregex "LRETQ", 
"RETQ")>; -def SKLWriteResGroup99 : SchedWriteRes<[SKLPort23,SKLPort06,SKLPort15]> { - let Latency = 7; - let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[SKLWriteResGroup99], (instregex "BEXTR(32|64)rm")>; - def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> { let Latency = 7; let NumMicroOps = 5; diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td index d563ea2ebdd5..7f336fde9802 100755 --- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td +++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td @@ -120,6 +120,10 @@ defm : SKXWriteResPair; defm : SKXWriteResPair; defm : SKXWriteResPair; +// BMI1 BEXTR, BMI2 BZHI +defm : SKXWriteResPair; +defm : SKXWriteResPair; + // Loads, stores, and moves, not folded with other operations. def : WriteRes { let Latency = 5; } def : WriteRes; @@ -1034,7 +1038,6 @@ def: InstRW<[SKXWriteResGroup8], (instregex "ANDN(32|64)rr", "BLSI(32|64)rr", "BLSMSK(32|64)rr", "BLSR(32|64)rr", - "BZHI(32|64)rr", "LEA(16|32|64)(_32)?r")>; def SKXWriteResGroup9 : SchedWriteRes<[SKXPort015]> { @@ -1597,8 +1600,7 @@ def SKXWriteResGroup22 : SchedWriteRes<[SKXPort06,SKXPort15]> { let NumMicroOps = 2; let ResourceCycles = [1,1]; } -def: InstRW<[SKXWriteResGroup22], (instregex "BEXTR(32|64)rr", - "BSWAP(16|32|64)r")>; +def: InstRW<[SKXWriteResGroup22], (instregex "BSWAP(16|32|64)r")>; def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> { let Latency = 2; @@ -3094,7 +3096,6 @@ def: InstRW<[SKXWriteResGroup79], (instregex "ANDN(32|64)rm", "BLSI(32|64)rm", "BLSMSK(32|64)rm", "BLSR(32|64)rm", - "BZHI(32|64)rm", "MOVBE(16|32|64)rm")>; def SKXWriteResGroup80 : SchedWriteRes<[SKXPort23,SKXPort015]> { @@ -3753,13 +3754,6 @@ def SKXWriteResGroup104 : SchedWriteRes<[SKXPort6,SKXPort23,SKXPort0156]> { def: InstRW<[SKXWriteResGroup104], (instregex "LRETQ", "RETQ")>; -def SKXWriteResGroup105 : SchedWriteRes<[SKXPort23,SKXPort06,SKXPort15]> { - let Latency = 7; 
- let NumMicroOps = 3; - let ResourceCycles = [1,1,1]; -} -def: InstRW<[SKXWriteResGroup105], (instregex "BEXTR(32|64)rm")>; - def SKXWriteResGroup106 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> { let Latency = 7; let NumMicroOps = 4; diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td index 6136d96fcfb7..b5cb26cee8ed 100644 --- a/llvm/lib/Target/X86/X86Schedule.td +++ b/llvm/lib/Target/X86/X86Schedule.td @@ -54,6 +54,10 @@ defm WriteTZCNT : X86SchedWritePair; // Trailing zero count. // Integer shifts and rotates. defm WriteShift : X86SchedWritePair; +// BMI1 BEXTR, BMI2 BZHI +defm WriteBEXTR : X86SchedWritePair; +defm WriteBZHI : X86SchedWritePair; + // Loads, stores, and moves, not folded with other operations. def WriteLoad : SchedWrite; def WriteStore : SchedWrite; diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index d61d27267f70..2994b31fe66a 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -141,6 +141,10 @@ defm : JWriteResIntPair; defm : JWriteResIntPair; defm : JWriteResIntPair; +// BMI1 BEXTR, BMI2 BZHI +defm : JWriteResIntPair; +defm : JWriteResIntPair; // NOTE: Doesn't exist on Jaguar. + def JWriteIMul64 : SchedWriteRes<[JALU1, JMul]> { let Latency = 6; let ResourceCycles = [1, 4]; diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td index 87b1bf26c6ef..64a2ec1a103e 100644 --- a/llvm/lib/Target/X86/X86ScheduleSLM.td +++ b/llvm/lib/Target/X86/X86ScheduleSLM.td @@ -104,6 +104,11 @@ defm : SLMWriteResPair; defm : SLMWriteResPair; defm : SLMWriteResPair; +// BMI1 BEXTR, BMI2 BZHI +// NOTE: These don't exist on Silvermont. Ports are guesses. +defm : SLMWriteResPair; +defm : SLMWriteResPair; + // This is quite rough, latency depends on the dividend. 
defm : SLMWriteResPair; diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td index 252243c6a7a5..33472c8252c1 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver1.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td @@ -162,6 +162,10 @@ defm : ZnWriteResPair; // Treat misc copies as a move. def : InstRW<[WriteMove], (instrs COPY)>; +// BMI1 BEXTR, BMI2 BZHI +defm : ZnWriteResPair; +defm : ZnWriteResPair; + // IDIV def : WriteRes { let Latency = 41; @@ -564,25 +568,13 @@ def : InstRW<[ZnWriteALULat2], (instregex "BLS(I|MSK|R)(32|64)rr")>; // r,m. def : InstRW<[ZnWriteALULat2Ld, ReadAfterLd], (instregex "BLS(I|MSK|R)(32|64)rm")>; -// BEXTR. -// r,r,r. -def : InstRW<[WriteALU], (instregex "BEXTR(32|64)rr")>; -// r,m,r. -def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BEXTR(32|64)rm")>; - -// BZHI. -// r,r,r. -def : InstRW<[WriteALU], (instregex "BZHI(32|64)rr")>; -// r,m,r. -def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BZHI(32|64)rm")>; - // CLD STD. def : InstRW<[WriteALU], (instregex "STD", "CLD")>; // PDEP PEXT. // r,r,r. def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>; -// r,m,r. +// r,r,m. def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>; // ROR ROL. 
diff --git a/llvm/test/CodeGen/X86/bmi-schedule.ll b/llvm/test/CodeGen/X86/bmi-schedule.ll index e38bf5ddd39a..d5ab1cf5f508 100644 --- a/llvm/test/CodeGen/X86/bmi-schedule.ll +++ b/llvm/test/CodeGen/X86/bmi-schedule.ll @@ -172,8 +172,8 @@ define i64 @test_andn_i64(i64 %a0, i64 %a1, i64 *%a2) { define i32 @test_bextr_i32(i32 %a0, i32 %a1, i32 *%a2) { ; GENERIC-LABEL: test_bextr_i32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: bextrl %edi, (%rdx), %ecx # sched: [5:0.50] -; GENERIC-NEXT: bextrl %edi, %esi, %eax # sched: [1:0.33] +; GENERIC-NEXT: bextrl %edi, (%rdx), %ecx # sched: [6:1.00] +; GENERIC-NEXT: bextrl %edi, %esi, %eax # sched: [2:1.00] ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -222,8 +222,8 @@ declare i32 @llvm.x86.bmi.bextr.32(i32, i32) define i64 @test_bextr_i64(i64 %a0, i64 %a1, i64 *%a2) { ; GENERIC-LABEL: test_bextr_i64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [5:0.50] -; GENERIC-NEXT: bextrq %rdi, %rsi, %rax # sched: [1:0.33] +; GENERIC-NEXT: bextrq %rdi, (%rdx), %rcx # sched: [6:1.00] +; GENERIC-NEXT: bextrq %rdi, %rsi, %rax # sched: [2:1.00] ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; diff --git a/llvm/test/CodeGen/X86/bmi2-schedule.ll b/llvm/test/CodeGen/X86/bmi2-schedule.ll index 1ccd7c394f79..0a79b661e2af 100644 --- a/llvm/test/CodeGen/X86/bmi2-schedule.ll +++ b/llvm/test/CodeGen/X86/bmi2-schedule.ll @@ -9,8 +9,8 @@ define i32 @test_bzhi_i32(i32 %a0, i32 %a1, i32 *%a2) { ; GENERIC-LABEL: test_bzhi_i32: ; GENERIC: # %bb.0: -; GENERIC-NEXT: bzhil %edi, (%rdx), %ecx # sched: [5:0.50] -; GENERIC-NEXT: bzhil %edi, %esi, %eax # sched: [1:0.33] +; GENERIC-NEXT: bzhil %edi, (%rdx), %ecx # sched: [5:1.00] +; GENERIC-NEXT: bzhil %edi, %esi, %eax # sched: [1:1.00] ; GENERIC-NEXT: addl %ecx, %eax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ; @@ -59,8 +59,8 @@ declare i32 @llvm.x86.bmi.bzhi.32(i32, i32) define i64 
@test_bzhi_i64(i64 %a0, i64 %a1, i64 *%a2) { ; GENERIC-LABEL: test_bzhi_i64: ; GENERIC: # %bb.0: -; GENERIC-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [5:0.50] -; GENERIC-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:0.33] +; GENERIC-NEXT: bzhiq %rdi, (%rdx), %rcx # sched: [5:1.00] +; GENERIC-NEXT: bzhiq %rdi, %rsi, %rax # sched: [1:1.00] ; GENERIC-NEXT: addq %rcx, %rax # sched: [1:0.33] ; GENERIC-NEXT: retq # sched: [1:1.00] ;