llvm-project/llvm/test/CodeGen/X86/mul-constant-i32.ll

2026 lines
67 KiB
LLVM
Raw Normal View History

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=X64-JAG
; RUN: llc < %s -mtriple=i686-unknown -mul-constant-optimization=false | FileCheck %s --check-prefix=X86-NOOPT
; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=haswell| FileCheck %s --check-prefix=HSW-NOOPT
; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=btver2| FileCheck %s --check-prefix=JAG-NOOPT
; RUN: llc < %s -mtriple=x86_64-unknown -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=X64-SLM
; RUN: llc < %s -mtriple=x86_64-unknown -mul-constant-optimization=false -print-schedule=true -mcpu=slm| FileCheck %s --check-prefix=SLM-NOOPT
define i32 @test_mul_by_1(i32 %x) {
; X86-LABEL: test_mul_by_1:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_1:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_1:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_1:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_1:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_1:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_1:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_1:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 1
ret i32 %mul
}
define i32 @test_mul_by_2(i32 %x) {
; X86-LABEL: test_mul_by_2:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_2:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_2:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_2:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: addl %eax, %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_2:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; HSW-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_2:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; JAG-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_2:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_2:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; SLM-NOOPT-NEXT: leal (%rdi,%rdi), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 2
ret i32 %mul
}
define i32 @test_mul_by_3(i32 %x) {
; X86-LABEL: test_mul_by_3:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_3:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_3:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_3:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $3, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_3:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_3:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; JAG-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [2:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_3:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_3:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; SLM-NOOPT-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 3
ret i32 %mul
}
define i32 @test_mul_by_4(i32 %x) {
; X86-LABEL: test_mul_by_4:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $2, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_4:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_4:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (,%rdi,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_4:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: shll $2, %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_4:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; HSW-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:0.50]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_4:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; JAG-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [2:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_4:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_4:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; SLM-NOOPT-NEXT: leal (,%rdi,4), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 4
ret i32 %mul
}
define i32 @test_mul_by_5(i32 %x) {
; X86-LABEL: test_mul_by_5:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_5:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_5:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_5:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $5, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_5:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_5:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; JAG-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_5:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_5:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; SLM-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 5
ret i32 %mul
}
define i32 @test_mul_by_6(i32 %x) {
; X86-LABEL: test_mul_by_6:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_6:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_6:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50]
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_6:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $6, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_6:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_6:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_6:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50]
; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_6:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $6, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 6
ret i32 %mul
}
define i32 @test_mul_by_7(i32 %x) {
; X86-LABEL: test_mul_by_7:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (,%ecx,8), %eax
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_7:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_7:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_7:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $7, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_7:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_7:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_7:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00]
; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_7:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $7, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 7
ret i32 %mul
}
define i32 @test_mul_by_8(i32 %x) {
; X86-LABEL: test_mul_by_8:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $3, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_8:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_8:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (,%rdi,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_8:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: shll $3, %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_8:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; HSW-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:0.50]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_8:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; JAG-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [2:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_8:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_8:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; SLM-NOOPT-NEXT: leal (,%rdi,8), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 8
ret i32 %mul
}
define i32 @test_mul_by_9(i32 %x) {
; X86-LABEL: test_mul_by_9:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_9:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_9:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_9:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $9, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_9:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_9:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; JAG-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [2:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_9:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_9:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; SLM-NOOPT-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 9
ret i32 %mul
}
define i32 @test_mul_by_10(i32 %x) {
; X86-LABEL: test_mul_by_10:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_10:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_10:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50]
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_10:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $10, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_10:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_10:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_10:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50]
; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_10:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $10, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 10
ret i32 %mul
}
define i32 @test_mul_by_11(i32 %x) {
; X86-LABEL: test_mul_by_11:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_11:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_11:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_11:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $11, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_11:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_11:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_11:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_11:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $11, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 11
ret i32 %mul
}
define i32 @test_mul_by_12(i32 %x) {
; X86-LABEL: test_mul_by_12:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $2, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_12:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_12:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50]
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_12:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $12, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_12:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_12:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_12:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_12:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $12, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 12
ret i32 %mul
}
define i32 @test_mul_by_13(i32 %x) {
; X86-LABEL: test_mul_by_13:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,2), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_13:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_13:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_13:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $13, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_13:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_13:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_13:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_13:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $13, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 13
ret i32 %mul
}
define i32 @test_mul_by_14(i32 %x) {
; X86-LABEL: test_mul_by_14:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $4, %eax
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_14:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_14:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: shll $4, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_14:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $14, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_14:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_14:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_14:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: shll $4, %eax # sched: [1:1.00]
; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_14:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $14, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 14
ret i32 %mul
}
define i32 @test_mul_by_15(i32 %x) {
; X86-LABEL: test_mul_by_15:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_15:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_15:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_15:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $15, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_15:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_15:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_15:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_15:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $15, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 15
ret i32 %mul
}
define i32 @test_mul_by_16(i32 %x) {
; X86-LABEL: test_mul_by_16:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $4, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_16:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: shll $4, %edi # sched: [1:0.50]
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_16:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: shll $4, %edi # sched: [1:0.50]
; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_16:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: shll $4, %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_16:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50]
; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_16:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: shll $4, %edi # sched: [1:0.50]
; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_16:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: shll $4, %edi # sched: [1:1.00]
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_16:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: shll $4, %edi # sched: [1:1.00]
; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 16
ret i32 %mul
}
define i32 @test_mul_by_17(i32 %x) {
; X86-LABEL: test_mul_by_17:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $4, %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_17:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: shll $4, %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_17:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: shll $4, %eax # sched: [1:0.50]
; X64-JAG-NEXT: leal (%rax,%rdi), %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_17:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $17, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_17:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_17:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_17:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: shll $4, %eax # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rax,%rdi), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_17:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $17, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 17
ret i32 %mul
}
define i32 @test_mul_by_18(i32 %x) {
; X86-LABEL: test_mul_by_18:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: addl %eax, %eax
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_18:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: addl %edi, %edi # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_18:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: addl %edi, %edi # sched: [1:0.50]
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_18:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $18, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_18:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_18:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_18:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: addl %edi, %edi # sched: [1:0.50]
; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_18:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $18, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 18
ret i32 %mul
}
define i32 @test_mul_by_19(i32 %x) {
; X86-LABEL: test_mul_by_19:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%eax,%ecx,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_19:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_19:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rdi,%rax,2), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_19:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $19, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_19:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_19:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_19:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_19:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $19, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 19
ret i32 %mul
}
define i32 @test_mul_by_20(i32 %x) {
; X86-LABEL: test_mul_by_20:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $2, %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_20:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: shll $2, %edi # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_20:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: shll $2, %edi # sched: [1:0.50]
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_20:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $20, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_20:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_20:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_20:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: shll $2, %edi # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_20:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $20, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 20
ret i32 %mul
}
define i32 @test_mul_by_21(i32 %x) {
; X86-LABEL: test_mul_by_21:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_21:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_21:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_21:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $21, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_21:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_21:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_21:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_21:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $21, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 21
ret i32 %mul
}
define i32 @test_mul_by_22(i32 %x) {
; X86-LABEL: test_mul_by_22:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,4), %eax
; X86-NEXT: leal (%ecx,%eax,4), %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_22:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_22:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_22:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $22, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_22:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_22:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_22:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_22:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $22, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 22
ret i32 %mul
}
define i32 @test_mul_by_23(i32 %x) {
; X86-LABEL: test_mul_by_23:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,2), %eax
; X86-NEXT: shll $3, %eax
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_23:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: shll $3, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_23:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [2:1.00]
; X64-JAG-NEXT: shll $3, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_23:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $23, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_23:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_23:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_23:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_23:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $23, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 23
ret i32 %mul
}
define i32 @test_mul_by_24(i32 %x) {
; X86-LABEL: test_mul_by_24:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $3, %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_24:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: shll $3, %edi # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_24:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: shll $3, %edi # sched: [1:0.50]
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,2), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_24:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $24, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_24:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_24:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_24:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: shll $3, %edi # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rdi,%rdi,2), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_24:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $24, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 24
ret i32 %mul
}
define i32 @test_mul_by_25(i32 %x) {
; X86-LABEL: test_mul_by_25:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_25:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_25:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rax,%rax,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_25:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $25, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_25:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_25:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_25:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rax,%rax,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_25:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $25, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 25
ret i32 %mul
}
define i32 @test_mul_by_26(i32 %x) {
; X86-LABEL: test_mul_by_26:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,4), %eax
; X86-NEXT: leal (%eax,%eax,4), %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_26:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_26:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rax,%rax,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_26:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $26, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_26:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_26:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_26:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_26:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $26, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 26
ret i32 %mul
}
define i32 @test_mul_by_27(i32 %x) {
; X86-LABEL: test_mul_by_27:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_27:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_27:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_27:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $27, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_27:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_27:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_27:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
; X64-SLM-NEXT: leal (%rax,%rax,2), %eax # sched: [1:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_27:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $27, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 27
ret i32 %mul
}
define i32 @test_mul_by_28(i32 %x) {
; X86-LABEL: test_mul_by_28:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_28:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_28:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [2:1.00]
; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_28:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $28, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_28:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_28:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_28:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_28:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $28, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 28
ret i32 %mul
}
define i32 @test_mul_by_29(i32 %x) {
; X86-LABEL: test_mul_by_29:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: leal (%ecx,%ecx,8), %eax
; X86-NEXT: leal (%eax,%eax,2), %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_29:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rax,%rax,2), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: addl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_29:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rax,%rax,2), %eax # sched: [2:1.00]
; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: addl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_29:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $29, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_29:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_29:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_29:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_29:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $29, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 29
ret i32 %mul
}
define i32 @test_mul_by_30(i32 %x) {
; X86-LABEL: test_mul_by_30:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $5, %eax
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_30:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_30:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_30:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $30, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_30:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_30:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_30:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: shll $5, %eax # sched: [1:1.00]
; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_30:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $30, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 30
ret i32 %mul
}
define i32 @test_mul_by_31(i32 %x) {
; X86-LABEL: test_mul_by_31:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $5, %eax
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_31:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: shll $5, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_31:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: shll $5, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_31:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $31, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_31:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_31:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_31:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: shll $5, %eax # sched: [1:1.00]
; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_31:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $31, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 31
ret i32 %mul
}
define i32 @test_mul_by_32(i32 %x) {
; X86-LABEL: test_mul_by_32:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: shll $5, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_32:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: shll $5, %edi # sched: [1:0.50]
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_32:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: shll $5, %edi # sched: [1:0.50]
; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_32:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: shll $5, %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_32:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50]
; HSW-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.25]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_32:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: shll $5, %edi # sched: [1:0.50]
; JAG-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_32:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: shll $5, %edi # sched: [1:1.00]
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_32:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: shll $5, %edi # sched: [1:1.00]
; SLM-NOOPT-NEXT: movl %edi, %eax # sched: [1:0.50]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 32
ret i32 %mul
}
define i32 @test_mul_by_37(i32 %x) {
; X86-LABEL: test_mul_by_37:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%eax,%ecx,4), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_37:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_37:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rdi,%rax,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_37:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $37, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_37:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $37, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_37:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $37, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_37:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $37, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_37:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $37, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 37
ret i32 %mul
}
define i32 @test_mul_by_41(i32 %x) {
; X86-LABEL: test_mul_by_41:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,4), %ecx
; X86-NEXT: leal (%eax,%ecx,8), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_41:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_41:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rdi,%rax,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_41:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $41, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_41:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $41, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_41:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $41, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_41:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $41, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_41:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $41, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 41
ret i32 %mul
}
define i32 @test_mul_by_62(i32 %x) {
; X86-LABEL: test_mul_by_62:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shll $6, %eax
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: subl %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_62:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: movl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: shll $6, %eax # sched: [1:0.50]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: subl %edi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_62:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: shll $6, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_62:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $62, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_62:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $62, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_62:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $62, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_62:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: movl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: shll $6, %eax # sched: [1:1.00]
; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: subl %edi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_62:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $62, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 62
ret i32 %mul
}
define i32 @test_mul_by_73(i32 %x) {
; X86-LABEL: test_mul_by_73:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %ecx
; X86-NEXT: leal (%eax,%ecx,8), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_by_73:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: leal (%rdi,%rax,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_by_73:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: leal (%rdi,%rax,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_by_73:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $73, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_by_73:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $73, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_by_73:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $73, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_by_73:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: imull $73, %edi, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_by_73:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $73, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 73
ret i32 %mul
}
; (x*9+42)*(x*5+2)
define i32 @test_mul_spec(i32 %x) nounwind {
; X86-LABEL: test_mul_spec:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal 42(%eax,%eax,8), %ecx
; X86-NEXT: leal 2(%eax,%eax,4), %eax
; X86-NEXT: imull %ecx, %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: test_mul_spec:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50]
; X64-HSW-NEXT: addl $42, %ecx # sched: [1:0.25]
; X64-HSW-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; X64-HSW-NEXT: addl $2, %eax # sched: [1:0.25]
; X64-HSW-NEXT: imull %ecx, %eax # sched: [3:1.00]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: test_mul_spec:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; X64-JAG-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [2:1.00]
; X64-JAG-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [2:1.00]
; X64-JAG-NEXT: imull %ecx, %eax # sched: [3:1.00]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: test_mul_spec:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: leal 42(%eax,%eax,8), %ecx
; X86-NOOPT-NEXT: leal 2(%eax,%eax,4), %eax
; X86-NOOPT-NEXT: imull %ecx, %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: test_mul_spec:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,8), %ecx # sched: [1:0.50]
; HSW-NOOPT-NEXT: addl $42, %ecx # sched: [1:0.25]
; HSW-NOOPT-NEXT: leal (%rdi,%rdi,4), %eax # sched: [1:0.50]
; HSW-NOOPT-NEXT: addl $2, %eax # sched: [1:0.25]
; HSW-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: test_mul_spec:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
[X86][BtVer2] correctly model the latency/throughput of LEA instructions. This patch fixes the latency/throughput of LEA instructions in the BtVer2 scheduling model. On Jaguar, A 3-operands LEA has a latency of 2cy, and a reciprocal throughput of 1. That is because it uses one cycle of SAGU followed by 1cy of ALU1. An LEA with a "Scale" operand is also slow, and it has the same latency profile as the 3-operands LEA. An LEA16r has a latency of 3cy, and a throughput of 0.5 (i.e. RThrouhgput of 2.0). This patch adds a new TIIPredicate named IsThreeOperandsLEAFn to X86Schedule.td. The tablegen backend (for instruction-info) expands that definition into this (file X86GenInstrInfo.inc): ``` static bool isThreeOperandsLEA(const MachineInstr &MI) { return ( ( MI.getOpcode() == X86::LEA32r || MI.getOpcode() == X86::LEA64r || MI.getOpcode() == X86::LEA64_32r || MI.getOpcode() == X86::LEA16r ) && MI.getOperand(1).isReg() && MI.getOperand(1).getReg() != 0 && MI.getOperand(3).isReg() && MI.getOperand(3).getReg() != 0 && ( ( MI.getOperand(4).isImm() && MI.getOperand(4).getImm() != 0 ) || (MI.getOperand(4).isGlobal()) ) ); } ``` A similar method is generated in the X86_MC namespace, and included into X86MCTargetDesc.cpp (the declaration lives in X86MCTargetDesc.h). Back to the BtVer2 scheduling model: A new scheduling predicate named JSlowLEAPredicate now checks if either the instruction is a three-operands LEA, or it is an LEA with a Scale value different than 1. A variant scheduling class uses that new predicate to correctly select the appropriate latency profile. Differential Revision: https://reviews.llvm.org/D49436 llvm-svn: 337469
2018-07-20 00:42:15 +08:00
; JAG-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [2:1.00]
; JAG-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [2:1.00]
; JAG-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: test_mul_spec:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00]
; X64-SLM-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00]
; X64-SLM-NEXT: imull %ecx, %eax # sched: [3:1.00]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: test_mul_spec:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: # kill: def $edi killed $edi def $rdi
; SLM-NOOPT-NEXT: leal 42(%rdi,%rdi,8), %ecx # sched: [1:1.00]
; SLM-NOOPT-NEXT: leal 2(%rdi,%rdi,4), %eax # sched: [1:1.00]
; SLM-NOOPT-NEXT: imull %ecx, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%mul = mul nsw i32 %x, 9
%add = add nsw i32 %mul, 42
%mul2 = mul nsw i32 %x, 5
%add2 = add nsw i32 %mul2, 2
%mul3 = mul nsw i32 %add, %add2
ret i32 %mul3
}
; This makes sure we are able to fold the negate generated by the mul expansion
; into the next instruction.
; FIXME: We make this work.
define i32 @mul_neg_fold(i32 %a, i32 %b) {
; X86-LABEL: mul_neg_fold:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: leal (%eax,%eax,8), %eax
; X86-NEXT: negl %eax
; X86-NEXT: addl {{[0-9]+}}(%esp), %eax
; X86-NEXT: retl
;
; X64-HSW-LABEL: mul_neg_fold:
; X64-HSW: # %bb.0:
; X64-HSW-NEXT: # kill: def $edi killed $edi def $rdi
; X64-HSW-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:0.50]
; X64-HSW-NEXT: negl %eax # sched: [1:0.25]
; X64-HSW-NEXT: addl %esi, %eax # sched: [1:0.25]
; X64-HSW-NEXT: retq # sched: [7:1.00]
;
; X64-JAG-LABEL: mul_neg_fold:
; X64-JAG: # %bb.0:
; X64-JAG-NEXT: # kill: def $edi killed $edi def $rdi
; X64-JAG-NEXT: leal (%rdi,%rdi,8), %eax # sched: [2:1.00]
; X64-JAG-NEXT: negl %eax # sched: [1:0.50]
; X64-JAG-NEXT: addl %esi, %eax # sched: [1:0.50]
; X64-JAG-NEXT: retq # sched: [4:1.00]
;
; X86-NOOPT-LABEL: mul_neg_fold:
; X86-NOOPT: # %bb.0:
; X86-NOOPT-NEXT: imull $-9, {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: addl {{[0-9]+}}(%esp), %eax
; X86-NOOPT-NEXT: retl
;
; HSW-NOOPT-LABEL: mul_neg_fold:
; HSW-NOOPT: # %bb.0:
; HSW-NOOPT-NEXT: imull $-9, %edi, %eax # sched: [3:1.00]
; HSW-NOOPT-NEXT: addl %esi, %eax # sched: [1:0.25]
; HSW-NOOPT-NEXT: retq # sched: [7:1.00]
;
; JAG-NOOPT-LABEL: mul_neg_fold:
; JAG-NOOPT: # %bb.0:
; JAG-NOOPT-NEXT: imull $-9, %edi, %eax # sched: [3:1.00]
; JAG-NOOPT-NEXT: addl %esi, %eax # sched: [1:0.50]
; JAG-NOOPT-NEXT: retq # sched: [4:1.00]
;
; X64-SLM-LABEL: mul_neg_fold:
; X64-SLM: # %bb.0:
; X64-SLM-NEXT: # kill: def $edi killed $edi def $rdi
; X64-SLM-NEXT: leal (%rdi,%rdi,8), %eax # sched: [1:1.00]
; X64-SLM-NEXT: negl %eax # sched: [1:0.50]
; X64-SLM-NEXT: addl %esi, %eax # sched: [1:0.50]
; X64-SLM-NEXT: retq # sched: [4:1.00]
;
; SLM-NOOPT-LABEL: mul_neg_fold:
; SLM-NOOPT: # %bb.0:
; SLM-NOOPT-NEXT: imull $-9, %edi, %eax # sched: [3:1.00]
; SLM-NOOPT-NEXT: addl %esi, %eax # sched: [1:0.50]
; SLM-NOOPT-NEXT: retq # sched: [4:1.00]
%c = mul i32 %a, -9
%d = add i32 %b, %c
ret i32 %d
}