[X86] Use SHLD with both inputs from the same register to implement rotate on Sandy Bridge and later Intel CPUs

Summary:
Sandy Bridge and later CPUs achieve better throughput when a rotate is implemented with SHLD than with the normal rotate instructions. Additionally, the SHLD form saves one uop and avoids a partial flag update dependency.

This patch implements this change on any Sandy Bridge or later processor without BMI2 instructions. With BMI2 we will use RORX as we currently do.

Reviewers: zvi

Reviewed By: zvi

Subscribers: llvm-commits

Differential Revision: https://reviews.llvm.org/D30181

llvm-svn: 295697
This commit is contained in:
Craig Topper 2017-02-21 06:39:13 +00:00
parent b4f9625a7b
commit d88389aa7e
7 changed files with 44 additions and 1 deletions

View File

@ -263,6 +263,15 @@ def FeatureFastLZCNT
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
// Sandy Bridge and newer processors can use SHLD with the same register as
// both source inputs to implement a rotate. This avoids the partial flag
// update of the normal rotate instructions.
// The "fast-shld-rotate" feature string sets the subtarget's
// HasFastSHLDRotate field to "true".
def FeatureFastSHLDRotate
: SubtargetFeature<
"fast-shld-rotate", "HasFastSHLDRotate", "true",
"SHLD can be used as a faster rotate">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
//===----------------------------------------------------------------------===//
@ -458,7 +467,8 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureXSAVE,
FeatureXSAVEOPT,
FeatureLAHFSAHF,
FeatureFastScalarFSQRT
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate
]>;
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,

View File

@ -897,6 +897,7 @@ def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
//===----------------------------------------------------------------------===//

View File

@ -846,6 +846,15 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
} // Defs = [EFLAGS]
// Sandy Bridge and newer Intel processors support faster rotates using
// SHLD to avoid a partial flag update on the normal rotate instructions.
// Passing the same register as both SHLD sources turns the double shift
// into a rotate-left by the immediate amount. Only 32-bit and 64-bit
// rotate-left by an 8-bit immediate are matched here.
// NOTE(review): AddedComplexity = 5 presumably lets these patterns outrank
// the plain ROL patterns while still losing to the BMI2 RORX patterns --
// confirm against the complexities of those patterns.
let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in {
def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
(SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
(SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
}
def ROT32L2R_imm8 : SDNodeXForm<imm, [{
// Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
return getI8Imm(32 - N->getZExtValue(), SDLoc(N));

View File

@ -302,6 +302,7 @@ void X86Subtarget::initializeEnvironment() {
HasFastScalarFSQRT = false;
HasFastVectorFSQRT = false;
HasFastLZCNT = false;
HasFastSHLDRotate = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;

View File

@ -229,6 +229,9 @@ protected:
/// True if LZCNT instruction is fast.
bool HasFastLZCNT;
/// True if SHLD-based rotate is fast.
bool HasFastSHLDRotate;
/// True if the short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions;
@ -466,6 +469,7 @@ public:
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasFastLZCNT() const { return HasFastLZCNT; }
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }

View File

@ -1,4 +1,5 @@
; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
; RUN: llc < %s -march=x86 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone {
@ -49,6 +50,8 @@ define i32 @xfoo(i32 %x, i32 %y, i32 %z) nounwind readnone {
entry:
; CHECK-LABEL: xfoo:
; CHECK: roll $7
; SHLD-LABEL: xfoo:
; SHLD: shldl $7
; BMI2-LABEL: xfoo:
; BMI2: rorxl $25
%0 = lshr i32 %x, 25
@ -61,6 +64,8 @@ define i32 @xfoop(i32* %p) nounwind readnone {
entry:
; CHECK-LABEL: xfoop:
; CHECK: roll $7
; SHLD-LABEL: xfoop:
; SHLD: shldl $7
; BMI2-LABEL: xfoop:
; BMI2: rorxl $25
%x = load i32, i32* %p
@ -84,6 +89,8 @@ define i32 @xun(i32 %x, i32 %y, i32 %z) nounwind readnone {
entry:
; CHECK-LABEL: xun:
; CHECK: roll $25
; SHLD-LABEL: xun:
; SHLD: shldl $25
; BMI2-LABEL: xun:
; BMI2: rorxl $7
%0 = lshr i32 %x, 7
@ -96,6 +103,8 @@ define i32 @xunp(i32* %p) nounwind readnone {
entry:
; CHECK-LABEL: xunp:
; CHECK: roll $25
; SHLD-LABEL: xunp:
; SHLD: shldl $25
; BMI2-LABEL: xunp:
; BMI2: rorxl $7
%x = load i32, i32* %p

View File

@ -1,4 +1,5 @@
; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone {
@ -49,6 +50,8 @@ define i64 @xfoo(i64 %x, i64 %y, i64 %z) nounwind readnone {
entry:
; CHECK-LABEL: xfoo:
; CHECK: rolq $7
; SHLD-LABEL: xfoo:
; SHLD: shldq $7
; BMI2-LABEL: xfoo:
; BMI2: rorxq $57
%0 = lshr i64 %x, 57
@ -61,6 +64,8 @@ define i64 @xfoop(i64* %p) nounwind readnone {
entry:
; CHECK-LABEL: xfoop:
; CHECK: rolq $7
; SHLD-LABEL: xfoop:
; SHLD: shldq $7
; BMI2-LABEL: xfoop:
; BMI2: rorxq $57
%x = load i64, i64* %p
@ -84,6 +89,8 @@ define i64 @xun(i64 %x, i64 %y, i64 %z) nounwind readnone {
entry:
; CHECK-LABEL: xun:
; CHECK: rolq $57
; SHLD-LABEL: xun:
; SHLD: shldq $57
; BMI2-LABEL: xun:
; BMI2: rorxq $7
%0 = lshr i64 %x, 7
@ -96,6 +103,8 @@ define i64 @xunp(i64* %p) nounwind readnone {
entry:
; CHECK-LABEL: xunp:
; CHECK: rolq $57
; SHLD-LABEL: xunp:
; SHLD: shldq $57
; BMI2-LABEL: xunp:
; BMI2: rorxq $7
%x = load i64, i64* %p