X86 memcpy: use REPMOVSB instead of REPMOVS{Q,D,W} for inline copies

when the subtarget has fast strings.

This has two advantages:
  - Speed is improved. For example, on Haswell throughput improvements increase
    linearly with size from 256 to 512 bytes, after which they plateau:
    (e.g. 1% for 260 bytes, 25% for 400 bytes, 40% for 508 bytes).
  - Code is much smaller (no need to handle boundaries).

llvm-svn: 300957
This commit is contained in:
Clement Courbet 2017-04-21 09:20:39 +00:00
parent f8a9642526
commit 1ce3b82dea
6 changed files with 35 additions and 1 deletion

View File

@ -273,6 +273,13 @@ def FeatureFastSHLDRotate
"fast-shld-rotate", "HasFastSHLDRotate", "true",
"SHLD can be used as a faster rotate">;
// String operations (e.g. REP MOVS) are fast. See "REP String Enhancement" in
// the Intel Software Development Manual.
// When this feature is set, inline memcpy lowering prefers REP MOVSB over the
// wider REP MOVS{W,D,Q} forms (see X86SelectionDAGInfo), which avoids the
// trailing-bytes handling and produces smaller code.
def FeatureFastString
: SubtargetFeature<
"fast-string", "HasFastString", "true",
"REP MOVS/STOS are fast">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
//===----------------------------------------------------------------------===//
@ -498,6 +505,7 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureAVX2,
FeatureBMI,
FeatureBMI2,
FeatureFastString,
FeatureFMA,
FeatureLZCNT,
FeatureMOVBE,

View File

@ -897,6 +897,7 @@ def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasFastString : Predicate<"Subtarget->hasFastString()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
//===----------------------------------------------------------------------===//

View File

@ -215,7 +215,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
return SDValue();
MVT AVT;
if (Align & 1)
if (Subtarget.hasFastString())
// If the target has fast strings, then it's at least as fast to use
// REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle
// BytesLeft.
AVT = MVT::i8;
else if (Align & 1)
AVT = MVT::i8;
else if (Align & 2)
AVT = MVT::i16;

View File

@ -303,6 +303,7 @@ void X86Subtarget::initializeEnvironment() {
HasFastVectorFSQRT = false;
HasFastLZCNT = false;
HasFastSHLDRotate = false;
HasFastString = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;

View File

@ -232,6 +232,9 @@ protected:
/// True if SHLD based rotate is fast.
bool HasFastSHLDRotate;
/// True if the processor has fast REP MOVS.
bool HasFastString;
/// True if the short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions;
@ -472,6 +475,7 @@ public:
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasFastLZCNT() const { return HasFastLZCNT; }
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
bool hasFastString() const { return HasFastString; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }

View File

@ -0,0 +1,15 @@
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=-fast-string < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+fast-string < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
; A 4096-byte struct; passing it byval forces an inline memcpy of the argument.
%struct.large = type { [4096 x i8] }
; Callee takes the struct byval, so the caller must emit a copy of it.
declare void @foo(%struct.large* align 8 byval) nounwind
; With 8-byte alignment the byval copy is lowered to rep;movsq by default;
; with +fast-string the lowering uses rep;movsb instead.
define void @test1(%struct.large* nocapture %x) nounwind {
call void @foo(%struct.large* align 8 byval %x)
ret void
; ALL-LABEL: test1:
; NOFAST: rep;movsq
; FAST: rep;movsb
}