forked from OSchip/llvm-project
X86 memcpy: use REPMOVSB instead of REPMOVS{Q,D,W} for inline copies
when the subtarget has fast strings. This has two advantages: - Speed is improved. For example, on Haswell, throughput improvements increase linearly with size from 256 to 512 bytes, after which they plateau (e.g. 1% for 260 bytes, 25% for 400 bytes, 40% for 508 bytes). - Code is much smaller (no need to handle boundaries). llvm-svn: 300957
This commit is contained in:
parent
f8a9642526
commit
1ce3b82dea
|
@ -273,6 +273,13 @@ def FeatureFastSHLDRotate
|
|||
"fast-shld-rotate", "HasFastSHLDRotate", "true",
|
||||
"SHLD can be used as a faster rotate">;
|
||||
|
||||
// String operations (e.g. REP MOVS) are fast. See "REP String Enhancement" in
|
||||
// the Intel Software Development Manual.
|
||||
def FeatureFastString
|
||||
: SubtargetFeature<
|
||||
"fast-string", "HasFastString", "true",
|
||||
"REP MOVS/STOS are fast">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// X86 processors supported.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -498,6 +505,7 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
|
|||
FeatureAVX2,
|
||||
FeatureBMI,
|
||||
FeatureBMI2,
|
||||
FeatureFastString,
|
||||
FeatureFMA,
|
||||
FeatureLZCNT,
|
||||
FeatureMOVBE,
|
||||
|
|
|
@ -897,6 +897,7 @@ def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
|
|||
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
|
||||
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
|
||||
def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
|
||||
def HasFastString : Predicate<"Subtarget->hasFastString()">;
|
||||
def HasMFence : Predicate<"Subtarget->hasMFence()">;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -215,7 +215,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
|
|||
return SDValue();
|
||||
|
||||
MVT AVT;
|
||||
if (Align & 1)
|
||||
if (Subtarget.hasFastString())
|
||||
// If the target has fast strings, then it's at least as fast to use
|
||||
// REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle
|
||||
// BytesLeft.
|
||||
AVT = MVT::i8;
|
||||
else if (Align & 1)
|
||||
AVT = MVT::i8;
|
||||
else if (Align & 2)
|
||||
AVT = MVT::i16;
|
||||
|
|
|
@ -303,6 +303,7 @@ void X86Subtarget::initializeEnvironment() {
|
|||
HasFastVectorFSQRT = false;
|
||||
HasFastLZCNT = false;
|
||||
HasFastSHLDRotate = false;
|
||||
HasFastString = false;
|
||||
HasSlowDivide32 = false;
|
||||
HasSlowDivide64 = false;
|
||||
PadShortFunctions = false;
|
||||
|
|
|
@ -232,6 +232,9 @@ protected:
|
|||
/// True if SHLD based rotate is fast.
|
||||
bool HasFastSHLDRotate;
|
||||
|
||||
/// True if the processor has fast string operations (REP MOVS/STOS).
|
||||
bool HasFastString;
|
||||
|
||||
/// True if the short functions should be padded to prevent
|
||||
/// a stall when returning too early.
|
||||
bool PadShortFunctions;
|
||||
|
@ -472,6 +475,7 @@ public:
|
|||
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
|
||||
bool hasFastLZCNT() const { return HasFastLZCNT; }
|
||||
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
|
||||
bool hasFastString() const { return HasFastString; }
|
||||
bool hasSlowDivide32() const { return HasSlowDivide32; }
|
||||
bool hasSlowDivide64() const { return HasSlowDivide64; }
|
||||
bool padShortFunctions() const { return PadShortFunctions; }
|
||||
|
|
|
@ -0,0 +1,15 @@
|
|||
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=-fast-string < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST
|
||||
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+fast-string < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
|
||||
|
||||
%struct.large = type { [4096 x i8] }
|
||||
|
||||
declare void @foo(%struct.large* align 8 byval) nounwind
|
||||
|
||||
define void @test1(%struct.large* nocapture %x) nounwind {
|
||||
call void @foo(%struct.large* align 8 byval %x)
|
||||
ret void
|
||||
|
||||
; ALL-LABEL: test1:
|
||||
; NOFAST: rep;movsq
|
||||
; FAST: rep;movsb
|
||||
}
|
Loading…
Reference in New Issue