forked from OSchip/llvm-project
[X86] Use SHLD with both inputs from the same register to implement rotate on Sandy Bridge and later Intel CPUs
Summary: Sandy Bridge and later CPUs have better throughput using a SHLD to implement rotate versus the normal rotate instructions. Additionally it saves one uop and avoids a partial flag update dependency. This patch implements this change on any Sandy Bridge or later processor without BMI2 instructions. With BMI2 we will use RORX as we currently do. Reviewers: zvi Reviewed By: zvi Subscribers: llvm-commits Differential Revision: https://reviews.llvm.org/D30181 llvm-svn: 295697
This commit is contained in:
parent
b4f9625a7b
commit
d88389aa7e
|
@ -263,6 +263,15 @@ def FeatureFastLZCNT
|
|||
"fast-lzcnt", "HasFastLZCNT", "true",
|
||||
"LZCNT instructions are as fast as most simple integer ops">;
|
||||
|
||||
|
||||
// Sandy Bridge and newer processors can use SHLD with the same source on both
|
||||
// inputs to implement rotate to avoid the partial flag update of the normal
|
||||
// rotate instructions.
|
||||
def FeatureFastSHLDRotate
|
||||
: SubtargetFeature<
|
||||
"fast-shld-rotate", "HasFastSHLDRotate", "true",
|
||||
"SHLD can be used as a faster rotate">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// X86 processors supported.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -458,7 +467,8 @@ def SNBFeatures : ProcessorFeatures<[], [
|
|||
FeatureXSAVE,
|
||||
FeatureXSAVEOPT,
|
||||
FeatureLAHFSAHF,
|
||||
FeatureFastScalarFSQRT
|
||||
FeatureFastScalarFSQRT,
|
||||
FeatureFastSHLDRotate
|
||||
]>;
|
||||
|
||||
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
|
||||
|
|
|
@ -897,6 +897,7 @@ def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
|
|||
def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
|
||||
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
|
||||
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
|
||||
def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
|
||||
def HasMFence : Predicate<"Subtarget->hasMFence()">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -846,6 +846,15 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
|
|||
|
||||
} // Defs = [EFLAGS]
|
||||
|
||||
// Sandy Bridge and newer Intel processors support faster rotates using
|
||||
// SHLD to avoid a partial flag update on the normal rotate instructions.
|
||||
let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in {
|
||||
def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
|
||||
(SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
|
||||
def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
|
||||
(SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
|
||||
}
|
||||
|
||||
def ROT32L2R_imm8 : SDNodeXForm<imm, [{
|
||||
// Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
|
||||
return getI8Imm(32 - N->getZExtValue(), SDLoc(N));
|
||||
|
|
|
@ -302,6 +302,7 @@ void X86Subtarget::initializeEnvironment() {
|
|||
HasFastScalarFSQRT = false;
|
||||
HasFastVectorFSQRT = false;
|
||||
HasFastLZCNT = false;
|
||||
HasFastSHLDRotate = false;
|
||||
HasSlowDivide32 = false;
|
||||
HasSlowDivide64 = false;
|
||||
PadShortFunctions = false;
|
||||
|
|
|
@ -229,6 +229,9 @@ protected:
|
|||
/// True if LZCNT instruction is fast.
|
||||
bool HasFastLZCNT;
|
||||
|
||||
/// True if SHLD based rotate is fast.
|
||||
bool HasFastSHLDRotate;
|
||||
|
||||
/// True if the short functions should be padded to prevent
|
||||
/// a stall when returning too early.
|
||||
bool PadShortFunctions;
|
||||
|
@ -466,6 +469,7 @@ public:
|
|||
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
|
||||
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
|
||||
bool hasFastLZCNT() const { return HasFastLZCNT; }
|
||||
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
|
||||
bool hasSlowDivide32() const { return HasSlowDivide32; }
|
||||
bool hasSlowDivide64() const { return HasSlowDivide64; }
|
||||
bool padShortFunctions() const { return PadShortFunctions; }
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
|
||||
; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
|
||||
; RUN: llc < %s -march=x86 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
|
||||
|
||||
define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone {
|
||||
|
@ -49,6 +50,8 @@ define i32 @xfoo(i32 %x, i32 %y, i32 %z) nounwind readnone {
|
|||
entry:
|
||||
; CHECK-LABEL: xfoo:
|
||||
; CHECK: roll $7
|
||||
; SHLD-LABEL: xfoo:
|
||||
; SHLD: shldl $7
|
||||
; BMI2-LABEL: xfoo:
|
||||
; BMI2: rorxl $25
|
||||
%0 = lshr i32 %x, 25
|
||||
|
@ -61,6 +64,8 @@ define i32 @xfoop(i32* %p) nounwind readnone {
|
|||
entry:
|
||||
; CHECK-LABEL: xfoop:
|
||||
; CHECK: roll $7
|
||||
; SHLD-LABEL: xfoop:
|
||||
; SHLD: shldl $7
|
||||
; BMI2-LABEL: xfoop:
|
||||
; BMI2: rorxl $25
|
||||
%x = load i32, i32* %p
|
||||
|
@ -84,6 +89,8 @@ define i32 @xun(i32 %x, i32 %y, i32 %z) nounwind readnone {
|
|||
entry:
|
||||
; CHECK-LABEL: xun:
|
||||
; CHECK: roll $25
|
||||
; SHLD-LABEL: xun:
|
||||
; SHLD: shldl $25
|
||||
; BMI2-LABEL: xun:
|
||||
; BMI2: rorxl $7
|
||||
%0 = lshr i32 %x, 7
|
||||
|
@ -96,6 +103,8 @@ define i32 @xunp(i32* %p) nounwind readnone {
|
|||
entry:
|
||||
; CHECK-LABEL: xunp:
|
||||
; CHECK: roll $25
|
||||
; SHLD-LABEL: xunp:
|
||||
; SHLD: shldl $25
|
||||
; BMI2-LABEL: xunp:
|
||||
; BMI2: rorxl $7
|
||||
%x = load i32, i32* %p
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
|
||||
; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
|
||||
; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
|
||||
|
||||
define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone {
|
||||
|
@ -49,6 +50,8 @@ define i64 @xfoo(i64 %x, i64 %y, i64 %z) nounwind readnone {
|
|||
entry:
|
||||
; CHECK-LABEL: xfoo:
|
||||
; CHECK: rolq $7
|
||||
; SHLD-LABEL: xfoo:
|
||||
; SHLD: shldq $7
|
||||
; BMI2-LABEL: xfoo:
|
||||
; BMI2: rorxq $57
|
||||
%0 = lshr i64 %x, 57
|
||||
|
@ -61,6 +64,8 @@ define i64 @xfoop(i64* %p) nounwind readnone {
|
|||
entry:
|
||||
; CHECK-LABEL: xfoop:
|
||||
; CHECK: rolq $7
|
||||
; SHLD-LABEL: xfoop:
|
||||
; SHLD: shldq $7
|
||||
; BMI2-LABEL: xfoop:
|
||||
; BMI2: rorxq $57
|
||||
%x = load i64, i64* %p
|
||||
|
@ -84,6 +89,8 @@ define i64 @xun(i64 %x, i64 %y, i64 %z) nounwind readnone {
|
|||
entry:
|
||||
; CHECK-LABEL: xun:
|
||||
; CHECK: rolq $57
|
||||
; SHLD-LABEL: xun:
|
||||
; SHLD: shldq $57
|
||||
; BMI2-LABEL: xun:
|
||||
; BMI2: rorxq $7
|
||||
%0 = lshr i64 %x, 7
|
||||
|
@ -96,6 +103,8 @@ define i64 @xunp(i64* %p) nounwind readnone {
|
|||
entry:
|
||||
; CHECK-LABEL: xunp:
|
||||
; CHECK: rolq $57
|
||||
; SHLD-LABEL: xunp:
|
||||
; SHLD: shldq $57
|
||||
; BMI2-LABEL: xunp:
|
||||
; BMI2: rorxq $7
|
||||
%x = load i64, i64* %p
|
||||
|
|
Loading…
Reference in New Issue