forked from OSchip/llvm-project
[ARM] Fix lowering of misaligned memcpy/memset
Currently getOptimalMemOpType returns i32 for large enough sizes without checking for alignment, leading to poor code generation when misaligned accesses aren't permitted, as we generate a word store and then later split it up into byte stores. This means we inadvertently go over the MaxStoresPerMemcpy limit, and for memset we splat the memset value into a word and then immediately split it up again. Fix this by leaving it up to FindOptimalMemOpLowering to figure out which type to use, but also fix a bug there where it wasn't correctly checking if misaligned memory accesses are allowed. Differential Revision: https://reviews.llvm.org/D33442 llvm-svn: 303990
This commit is contained in:
parent
ba9d8ba82a
commit
9009d2905d
|
@ -4779,23 +4779,23 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
|
|||
DAG.getMachineFunction());
|
||||
|
||||
if (VT == MVT::Other) {
|
||||
if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(DstAS) ||
|
||||
TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) {
|
||||
VT = TLI.getPointerTy(DAG.getDataLayout(), DstAS);
|
||||
} else {
|
||||
switch (DstAlign & 7) {
|
||||
case 0: VT = MVT::i64; break;
|
||||
case 4: VT = MVT::i32; break;
|
||||
case 2: VT = MVT::i16; break;
|
||||
default: VT = MVT::i8; break;
|
||||
}
|
||||
}
|
||||
// Use the largest integer type whose alignment constraints are satisfied.
|
||||
// We only need to check DstAlign here as SrcAlign is always greater or
|
||||
// equal to DstAlign (or zero).
|
||||
VT = MVT::i64;
|
||||
while (DstAlign && DstAlign < VT.getSizeInBits() / 8 &&
|
||||
!TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign))
|
||||
VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
|
||||
assert(VT.isInteger());
|
||||
|
||||
// Find the largest legal integer type.
|
||||
MVT LVT = MVT::i64;
|
||||
while (!TLI.isTypeLegal(LVT))
|
||||
LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
|
||||
assert(LVT.isInteger());
|
||||
|
||||
// If the type we've chosen is larger than the largest legal integer type
|
||||
// then use that instead.
|
||||
if (VT.bitsGT(LVT))
|
||||
VT = LVT;
|
||||
}
|
||||
|
|
|
@ -12147,12 +12147,6 @@ EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size,
|
|||
}
|
||||
}
|
||||
|
||||
// Lowering to i32/i16 if the size permits.
|
||||
if (Size >= 4)
|
||||
return MVT::i32;
|
||||
else if (Size >= 2)
|
||||
return MVT::i16;
|
||||
|
||||
// Let the target-independent logic figure it out.
|
||||
return MVT::Other;
|
||||
}
|
||||
|
|
|
@ -95,10 +95,7 @@ entry:
|
|||
; CHECK: movt [[REG7:r[0-9]+]], #22866
|
||||
; CHECK: str [[REG7]]
|
||||
; CHECK-T1-LABEL: t5:
|
||||
; CHECK-T1: movs [[TREG3:r[0-9]]],
|
||||
; CHECK-T1: strb [[TREG3]],
|
||||
; CHECK-T1: movs [[TREG4:r[0-9]]],
|
||||
; CHECK-T1: strb [[TREG4]],
|
||||
; CHECK-T1: bl _memcpy
|
||||
tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
|
|
@ -38,6 +38,56 @@ entry:
|
|||
ret void
|
||||
}
|
||||
|
||||
define void @t3(i8* %p) {
|
||||
entry:
|
||||
; CHECK-7A-LABEL: t3:
|
||||
; CHECK-7A: muls [[REG:r[0-9]+]],
|
||||
; CHECK-7A: str [[REG]],
|
||||
; CHECK-6M-LABEL: t3:
|
||||
; CHECK-6M-NOT: muls
|
||||
; CHECK-6M: strb [[REG:r[0-9]+]],
|
||||
; CHECK-6M: strb [[REG]],
|
||||
; CHECK-6M: strb [[REG]],
|
||||
; CHECK-6M: strb [[REG]],
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
|
||||
%0 = trunc i32 %i to i8
|
||||
call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 1, i1 false)
|
||||
call void @something(i8* %p)
|
||||
%inc = add nuw nsw i32 %i, 1
|
||||
%exitcond = icmp eq i32 %inc, 255
|
||||
br i1 %exitcond, label %for.end, label %for.body
|
||||
|
||||
for.end:
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @t4(i8* %p) {
|
||||
entry:
|
||||
; CHECK-7A-LABEL: t4:
|
||||
; CHECK-7A: muls [[REG:r[0-9]+]],
|
||||
; CHECK-7A: str [[REG]],
|
||||
; CHECK-6M-LABEL: t4:
|
||||
; CHECK-6M: muls [[REG:r[0-9]+]],
|
||||
; CHECK-6M: strh [[REG]],
|
||||
; CHECK-6M: strh [[REG]],
|
||||
br label %for.body
|
||||
|
||||
for.body:
|
||||
%i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
|
||||
%0 = trunc i32 %i to i8
|
||||
call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 2, i1 false)
|
||||
call void @something(i8* %p)
|
||||
%inc = add nuw nsw i32 %i, 1
|
||||
%exitcond = icmp eq i32 %inc, 255
|
||||
br i1 %exitcond, label %for.end, label %for.body
|
||||
|
||||
for.end:
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @something(i8*) nounwind
|
||||
declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
|
||||
declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
|
||||
|
|
Loading…
Reference in New Issue