X86 memcpy: use REPMOVSB instead of REPMOVS{Q,D,W} for inline copies

when the subtarget has fast strings.

This has two advantages:
  - Speed is improved. For example, on Haswell throughput improvements increase
    linearly with size from 256 to 512 bytes, after which they plateau:
    (e.g. 1% for 260 bytes, 25% for 400 bytes, 40% for 508 bytes).
  - Code is much smaller (no need to handle boundaries).

llvm-svn: 300957
This commit is contained in:
Clement Courbet 2017-04-21 09:20:39 +00:00
parent f8a9642526
commit 1ce3b82dea
6 changed files with 35 additions and 1 deletion

View File

@ -273,6 +273,13 @@ def FeatureFastSHLDRotate
"fast-shld-rotate", "HasFastSHLDRotate", "true",
"SHLD can be used as a faster rotate">;
// String operations (e.g. REP MOVS) are fast. See "REP String Enhancement" in
// the Intel Software Development Manual.
// When this feature is set, inline memcpy lowering prefers REP MOVSB over the
// wider REP MOVS{W,D,Q} forms (see X86SelectionDAGInfo), which avoids the
// trailing-bytes handling and produces smaller code.
def FeatureFastString
: SubtargetFeature<
"fast-string", "HasFastString", "true",
"REP MOVS/STOS are fast">;
//===----------------------------------------------------------------------===//
// X86 processors supported.
//===----------------------------------------------------------------------===//
@ -498,6 +505,7 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureAVX2,
FeatureBMI,
FeatureBMI2,
FeatureFastString,
FeatureFMA,
FeatureLZCNT,
FeatureMOVBE,

View File

@ -897,6 +897,7 @@ def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasFastString : Predicate<"Subtarget->hasFastString()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
//===----------------------------------------------------------------------===//

View File

@ -215,7 +215,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
return SDValue();
MVT AVT;
if (Align & 1)
if (Subtarget.hasFastString())
// If the target has fast strings, then it's at least as fast to use
// REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle
// BytesLeft.
AVT = MVT::i8;
else if (Align & 1)
AVT = MVT::i8;
else if (Align & 2)
AVT = MVT::i16;

View File

@ -303,6 +303,7 @@ void X86Subtarget::initializeEnvironment() {
HasFastVectorFSQRT = false;
HasFastLZCNT = false;
HasFastSHLDRotate = false;
HasFastString = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;

View File

@ -232,6 +232,9 @@ protected:
/// True if SHLD based rotate is fast.
bool HasFastSHLDRotate;
/// True if the processor has fast REP MOVS.
bool HasFastString;
/// True if the short functions should be padded to prevent
/// a stall when returning too early.
bool PadShortFunctions;
@ -472,6 +475,7 @@ public:
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasFastLZCNT() const { return HasFastLZCNT; }
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
bool hasFastString() const { return HasFastString; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }

View File

@ -0,0 +1,15 @@
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=-fast-string < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST
; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+fast-string < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
; A 4096-byte struct; passing it byval forces an inline memcpy of the argument.
%struct.large = type { [4096 x i8] }
; Callee takes the struct byval, so the caller must emit a copy of it.
declare void @foo(%struct.large* align 8 byval) nounwind
; With 8-byte alignment the byval copy is lowered to rep;movsq by default;
; with +fast-string the lowering uses rep;movsb instead.
define void @test1(%struct.large* nocapture %x) nounwind {
call void @foo(%struct.large* align 8 byval %x)
ret void
; ALL-LABEL: test1:
; NOFAST: rep;movsq
; FAST: rep;movsb
}