[AArch64] Fix PR32384: bump up the number of stores per memset and memcpy

As suggested in https://bugs.llvm.org/show_bug.cgi?id=32384#c1, this change
makes the inlining of `memset()` and `memcpy()` more aggressive when
compiling for speed.  The tuning remains the same when optimizing for size.

Patch by: Sebastian Pop <s.pop@samsung.com>
          Evandro Menezes <e.menezes@samsung.com>

Differential revision: https://reviews.llvm.org/D45098

llvm-svn: 333429
This commit is contained in:
Evandro Menezes 2018-05-29 15:58:50 +00:00
parent 69301c9eb9
commit f8425340e4
4 changed files with 54 additions and 20 deletions

View File

@@ -579,11 +579,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::GlobalAddress); setTargetDAGCombine(ISD::GlobalAddress);
MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8; // In case of strict alignment, avoid an excessive number of byte wide stores.
MaxGluedStoresPerMemcpy = 4; MaxStoresPerMemsetOptSize = 8;
MaxStoresPerMemset = Subtarget->requiresStrictAlign()
? MaxStoresPerMemsetOptSize : 32;
MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4; MaxGluedStoresPerMemcpy = 4;
MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4; MaxStoresPerMemcpyOptSize = 4;
MaxStoresPerMemcpy = Subtarget->requiresStrictAlign()
? MaxStoresPerMemcpyOptSize : 16;
MaxStoresPerMemmoveOptSize = MaxStoresPerMemmove = 4;
setStackPointerRegisterToSaveRestore(AArch64::SP); setStackPointerRegisterToSaveRestore(AArch64::SP);

View File

@@ -498,12 +498,12 @@ public:
CallingConv::ID CallConv, CallingConv::ID CallConv,
bool isVarArg) const override; bool isVarArg) const override;
private: private:
bool isExtFreeImpl(const Instruction *Ext) const override;
/// Keep a pointer to the AArch64Subtarget around so that we can /// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets. /// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget; const AArch64Subtarget *Subtarget;
bool isExtFreeImpl(const Instruction *Ext) const override;
void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT); void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
void addDRTypeForNEON(MVT VT); void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT); void addQRTypeForNEON(MVT VT);

View File

@@ -1,14 +1,14 @@
; RUN: llc %s -mtriple=arm64-apple-darwin -o - | \ ; RUN: llc %s -mtriple=arm64-apple-darwin -o - | \
; RUN: FileCheck --check-prefix=CHECK-DARWIN --check-prefix=CHECK %s ; RUN: FileCheck --check-prefixes=CHECK,CHECK-DARWIN %s
; RUN: llc %s -mtriple=arm64-linux-gnu -o - | \ ; RUN: llc %s -mtriple=arm64-linux-gnu -o - | \
; RUN: FileCheck --check-prefix=CHECK-LINUX --check-prefix=CHECK %s ; RUN: FileCheck --check-prefixes=CHECK,CHECK-LINUX %s
; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset() ; <rdar://problem/14199482> ARM64: Calls to bzero() replaced with calls to memset()
; CHECK-LABEL: fct1: ; CHECK-LABEL: fct1:
; For small size (<= 256), we do not change memset to bzero. ; For small size (<= 256), we do not change memset to bzero.
; CHECK-DARWIN: {{b|bl}} _memset ; CHECK-DARWIN: {{b|bl}} _memset
; CHECK-LINUX: {{b|bl}} memset ; CHECK-LINUX: {{b|bl}} memset
define void @fct1(i8* nocapture %ptr) { define void @fct1(i8* nocapture %ptr) minsize {
entry: entry:
tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i1 false) tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 256, i1 false)
ret void ret void
@@ -20,7 +20,7 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
; When the size is bigger than 256, change into bzero. ; When the size is bigger than 256, change into bzero.
; CHECK-DARWIN: {{b|bl}} _bzero ; CHECK-DARWIN: {{b|bl}} _bzero
; CHECK-LINUX: {{b|bl}} memset ; CHECK-LINUX: {{b|bl}} memset
define void @fct2(i8* nocapture %ptr) { define void @fct2(i8* nocapture %ptr) minsize {
entry: entry:
tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i1 false) tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 257, i1 false)
ret void ret void
@@ -30,7 +30,7 @@ entry:
; For unknown size, change to bzero. ; For unknown size, change to bzero.
; CHECK-DARWIN: {{b|bl}} _bzero ; CHECK-DARWIN: {{b|bl}} _bzero
; CHECK-LINUX: {{b|bl}} memset ; CHECK-LINUX: {{b|bl}} memset
define void @fct3(i8* nocapture %ptr, i32 %unknown) { define void @fct3(i8* nocapture %ptr, i32 %unknown) minsize {
entry: entry:
%conv = sext i32 %unknown to i64 %conv = sext i32 %unknown to i64
tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 %conv, i1 false) tail call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 %conv, i1 false)
@@ -41,7 +41,7 @@ entry:
; Size <= 256, no change. ; Size <= 256, no change.
; CHECK-DARWIN: {{b|bl}} _memset ; CHECK-DARWIN: {{b|bl}} _memset
; CHECK-LINUX: {{b|bl}} memset ; CHECK-LINUX: {{b|bl}} memset
define void @fct4(i8* %ptr) { define void @fct4(i8* %ptr) minsize {
entry: entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
%call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 256, i64 %tmp) %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 256, i64 %tmp)
@@ -56,7 +56,7 @@ declare i64 @llvm.objectsize.i64(i8*, i1)
; Size > 256, change. ; Size > 256, change.
; CHECK-DARWIN: {{b|bl}} _bzero ; CHECK-DARWIN: {{b|bl}} _bzero
; CHECK-LINUX: {{b|bl}} memset ; CHECK-LINUX: {{b|bl}} memset
define void @fct5(i8* %ptr) { define void @fct5(i8* %ptr) minsize {
entry: entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
%call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 257, i64 %tmp) %call = tail call i8* @__memset_chk(i8* %ptr, i32 0, i64 257, i64 %tmp)
@@ -67,7 +67,7 @@ entry:
; Size = unknown, change. ; Size = unknown, change.
; CHECK-DARWIN: {{b|bl}} _bzero ; CHECK-DARWIN: {{b|bl}} _bzero
; CHECK-LINUX: {{b|bl}} memset ; CHECK-LINUX: {{b|bl}} memset
define void @fct6(i8* %ptr, i32 %unknown) { define void @fct6(i8* %ptr, i32 %unknown) minsize {
entry: entry:
%conv = sext i32 %unknown to i64 %conv = sext i32 %unknown to i64
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
@@ -82,7 +82,7 @@ entry:
; memset with something that is not a zero, no change. ; memset with something that is not a zero, no change.
; CHECK-DARWIN: {{b|bl}} _memset ; CHECK-DARWIN: {{b|bl}} _memset
; CHECK-LINUX: {{b|bl}} memset ; CHECK-LINUX: {{b|bl}} memset
define void @fct7(i8* %ptr) { define void @fct7(i8* %ptr) minsize {
entry: entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
%call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 256, i64 %tmp) %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 256, i64 %tmp)
@@ -93,7 +93,7 @@ entry:
; memset with something that is not a zero, no change. ; memset with something that is not a zero, no change.
; CHECK-DARWIN: {{b|bl}} _memset ; CHECK-DARWIN: {{b|bl}} _memset
; CHECK-LINUX: {{b|bl}} memset ; CHECK-LINUX: {{b|bl}} memset
define void @fct8(i8* %ptr) { define void @fct8(i8* %ptr) minsize {
entry: entry:
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
%call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 257, i64 %tmp) %call = tail call i8* @__memset_chk(i8* %ptr, i32 1, i64 257, i64 %tmp)
@@ -104,7 +104,7 @@ entry:
; memset with something that is not a zero, no change. ; memset with something that is not a zero, no change.
; CHECK-DARWIN: {{b|bl}} _memset ; CHECK-DARWIN: {{b|bl}} _memset
; CHECK-LINUX: {{b|bl}} memset ; CHECK-LINUX: {{b|bl}} memset
define void @fct9(i8* %ptr, i32 %unknown) { define void @fct9(i8* %ptr, i32 %unknown) minsize {
entry: entry:
%conv = sext i32 %unknown to i64 %conv = sext i32 %unknown to i64
%tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false) %tmp = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)

View File

@@ -1,6 +1,6 @@
; RUN: llc -mtriple=arm64-apple-ios -mattr=+strict-align < %s | FileCheck %s ; RUN: llc -mtriple=arm64-apple-ios -mattr=+strict-align < %s | FileCheck %s
; Small (16-bytes here) unaligned memcpys should stay memcpy calls if ; Small (16 bytes here) unaligned memcpy() should be a function call if
; strict-alignment is turned on. ; strict-alignment is turned on.
define void @t0(i8* %out, i8* %in) { define void @t0(i8* %out, i8* %in) {
; CHECK-LABEL: t0: ; CHECK-LABEL: t0:
@@ -11,4 +11,32 @@ entry:
ret void ret void
} }
; Small (16 bytes here) aligned memcpy() should be inlined even if
; strict-alignment is turned on.
define void @t1(i8* align 8 %out, i8* align 8 %in) {
; The explicit 8-byte alignment on both pointers makes 64-bit paired
; loads/stores legal even under +strict-align, so the 16-byte copy is
; expanded inline as one ldp/stp pair instead of a libcall.
; CHECK-LABEL: t1:
; CHECK: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x1]
; CHECK-NEXT: stp x{{[0-9]+}}, x{{[0-9]+}}, [x0]
entry:
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %out, i8* align 8 %in, i64 16, i1 false)
ret void
}
; Tiny (4 bytes here) unaligned memcpy() should be inlined with byte sized
; loads and stores if strict-alignment is turned on.
define void @t2(i8* %out, i8* %in) {
; No alignment is given, so under +strict-align only byte-wide accesses
; are legal; the 4-byte copy is still small enough to inline as four
; ldrb/strb pairs rather than calling memcpy.
; CHECK-LABEL: t2:
; CHECK: ldrb w{{[0-9]+}}, [x1, #3]
; CHECK-NEXT: ldrb w{{[0-9]+}}, [x1, #2]
; CHECK-NEXT: ldrb w{{[0-9]+}}, [x1, #1]
; CHECK-NEXT: ldrb w{{[0-9]+}}, [x1]
; CHECK-NEXT: strb w{{[0-9]+}}, [x0, #3]
; CHECK-NEXT: strb w{{[0-9]+}}, [x0, #2]
; CHECK-NEXT: strb w{{[0-9]+}}, [x0, #1]
; CHECK-NEXT: strb w{{[0-9]+}}, [x0]
entry:
call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 4, i1 false)
ret void
}
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1) declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1)