forked from OSchip/llvm-project
[SLP] Don't vectorize loads of non-packed types (like i1, i2).
Summary: Given an array of i2 elements, 4 consecutive scalar loads will be lowered to i8-sized loads and thus will access 4 consecutive bytes in memory. If we vectorize these loads into a single <4 x i2> load, it'll access only 1 byte in memory. Hence, we should prohibit vectorization in such cases. PS: Initial patch was proposed by Arnold. Reviewers: aschwaighofer, nadav, hfinkel Subscribers: llvm-commits Differential Revision: http://reviews.llvm.org/D13277 llvm-svn: 248943
This commit is contained in:
parent
757908e545
commit
fc783e91e0
|
@ -1158,6 +1158,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
|
|||
return;
|
||||
}
|
||||
case Instruction::Load: {
|
||||
// Check that a vectorized load would load the same memory as a scalar
|
||||
// load.
|
||||
// For example we don't want vectorize loads that are smaller than 8 bit.
|
||||
// Even though we have a packed struct {<i2, i2, i2, i2>} LLVM treats
|
||||
// loading/storing it as an i8 struct. If we vectorize loads/stores from
|
||||
// such a struct we read/write packed bits disagreeing with the
|
||||
// unvectorized version.
|
||||
const DataLayout &DL = F->getParent()->getDataLayout();
|
||||
Type *ScalarTy = VL[0]->getType();
|
||||
|
||||
if (DL.getTypeSizeInBits(ScalarTy) !=
|
||||
DL.getTypeAllocSizeInBits(ScalarTy)) {
|
||||
BS.cancelScheduling(VL);
|
||||
newTreeEntry(VL, false);
|
||||
DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
|
||||
return;
|
||||
}
|
||||
// Check if the loads are consecutive or of we need to swizzle them.
|
||||
for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
|
||||
LoadInst *L = cast<LoadInst>(VL[i]);
|
||||
|
@ -1167,7 +1184,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
|
|||
DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
|
||||
return;
|
||||
}
|
||||
const DataLayout &DL = F->getParent()->getDataLayout();
|
||||
|
||||
if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
|
||||
if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
|
||||
++NumLoadsWantToChangeOrder;
|
||||
|
|
|
@ -47,4 +47,30 @@ exit:
|
|||
ret void
|
||||
}
|
||||
|
||||
define i8 @test3(i8 *%addr) {
|
||||
; Check that we do not vectorize types that are padded to a bigger ones.
|
||||
;
|
||||
; CHECK-LABEL: @test3
|
||||
; CHECK-NOT: <4 x i2>
|
||||
; CHECK: ret i8
|
||||
entry:
|
||||
%a = bitcast i8* %addr to i2*
|
||||
%a0 = getelementptr inbounds i2, i2* %a, i64 0
|
||||
%a1 = getelementptr inbounds i2, i2* %a, i64 1
|
||||
%a2 = getelementptr inbounds i2, i2* %a, i64 2
|
||||
%a3 = getelementptr inbounds i2, i2* %a, i64 3
|
||||
%l0 = load i2, i2* %a0, align 1
|
||||
%l1 = load i2, i2* %a1, align 1
|
||||
%l2 = load i2, i2* %a2, align 1
|
||||
%l3 = load i2, i2* %a3, align 1
|
||||
br label %bb1
|
||||
bb1: ; preds = %entry
|
||||
%p0 = phi i2 [ %l0, %entry ]
|
||||
%p1 = phi i2 [ %l1, %entry ]
|
||||
%p2 = phi i2 [ %l2, %entry ]
|
||||
%p3 = phi i2 [ %l3, %entry ]
|
||||
%r = zext i2 %p2 to i8
|
||||
ret i8 %r
|
||||
}
|
||||
|
||||
declare void @f(i64, i64)
|
||||
|
|
Loading…
Reference in New Issue