From fc783e91e0c0696ec5b3a990a7ac91bd751e370d Mon Sep 17 00:00:00 2001
From: Michael Zolotukhin
Date: Wed, 30 Sep 2015 21:05:43 +0000
Subject: [PATCH] [SLP] Don't vectorize loads of non-packed types (like i1,
 i2).

Summary:
Given an array of i2 elements, 4 consecutive scalar loads will be lowered to
i8-sized loads and thus will access 4 consecutive bytes in memory. If we
vectorize these loads into a single <4 x i2> load, it'll access only 1 byte
in memory. Hence, we should prohibit vectorization in such cases.

PS: Initial patch was proposed by Arnold.

Reviewers: aschwaighofer, nadav, hfinkel

Subscribers: llvm-commits

Differential Revision: http://reviews.llvm.org/D13277

llvm-svn: 248943
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 19 +++++++++++++-
 .../Transforms/SLPVectorizer/X86/bad_types.ll | 26 +++++++++++++++++++
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index fd8818c1ca92..f9dee18af950 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -1158,6 +1158,23 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       return;
     }
     case Instruction::Load: {
+      // Check that a vectorized load would load the same memory as a scalar
+      // load.
+      // For example, we don't want to vectorize loads that are smaller than
+      // 8 bits. Even though we have a packed struct {<i2, i2, i2, i2>} LLVM
+      // treats loading/storing it as an i8 struct. If we vectorize
+      // loads/stores from such a struct we read/write packed bits
+      // disagreeing with the unvectorized version.
+      const DataLayout &DL = F->getParent()->getDataLayout();
+      Type *ScalarTy = VL[0]->getType();
+
+      if (DL.getTypeSizeInBits(ScalarTy) !=
+          DL.getTypeAllocSizeInBits(ScalarTy)) {
+        BS.cancelScheduling(VL);
+        newTreeEntry(VL, false);
+        DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
+        return;
+      }
       // Check if the loads are consecutive or of we need to swizzle them.
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
         LoadInst *L = cast<LoadInst>(VL[i]);
@@ -1167,7 +1184,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
           return;
         }
-        const DataLayout &DL = F->getParent()->getDataLayout();
+
         if (!isConsecutiveAccess(VL[i], VL[i + 1], DL)) {
           if (VL.size() == 2 && isConsecutiveAccess(VL[1], VL[0], DL)) {
             ++NumLoadsWantToChangeOrder;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
index 2d8f3832ee29..98c29068bb96 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/bad_types.ll
@@ -47,4 +47,30 @@ exit:
   ret void
 }
 
+define i8 @test3(i8 *%addr) {
+; Check that we do not vectorize types that are padded to bigger types.
+;
+; CHECK-LABEL: @test3
+; CHECK-NOT: <4 x i2>
+; CHECK: ret i8
+entry:
+  %a = bitcast i8* %addr to i2*
+  %a0 = getelementptr inbounds i2, i2* %a, i64 0
+  %a1 = getelementptr inbounds i2, i2* %a, i64 1
+  %a2 = getelementptr inbounds i2, i2* %a, i64 2
+  %a3 = getelementptr inbounds i2, i2* %a, i64 3
+  %l0 = load i2, i2* %a0, align 1
+  %l1 = load i2, i2* %a1, align 1
+  %l2 = load i2, i2* %a2, align 1
+  %l3 = load i2, i2* %a3, align 1
+  br label %bb1
+bb1:                                              ; preds = %entry
+  %p0 = phi i2 [ %l0, %entry ]
+  %p1 = phi i2 [ %l1, %entry ]
+  %p2 = phi i2 [ %l2, %entry ]
+  %p3 = phi i2 [ %l3, %entry ]
+  %r = zext i2 %p2 to i8
+  ret i8 %r
+}
+
 declare void @f(i64, i64)
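
Note (not part of the patch): the bailout above fires exactly when a type's logical bit width differs from the size it occupies in memory. A minimal standalone sketch of that mismatch for i2, assuming LLVM's public DataLayout API and a default (empty) layout string; the file name and output format are illustrative only:

// check_i2_sizes.cpp (hypothetical name) -- shows why i2 is not a "packed"
// type for the purposes of the new check: it is 2 bits wide logically but
// occupies a full byte in memory, so four scalar i2 loads touch 4 bytes
// while a single <4 x i2> load would touch only 1.
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

int main() {
  llvm::LLVMContext Ctx;
  llvm::DataLayout DL("");                        // default data layout
  llvm::Type *I2 = llvm::Type::getIntNTy(Ctx, 2); // the i2 type

  uint64_t SizeBits  = DL.getTypeSizeInBits(I2);      // 2: logical width
  uint64_t AllocBits = DL.getTypeAllocSizeInBits(I2); // 8: in-memory width

  // The SLP check cancels vectorization whenever these two values differ.
  llvm::outs() << "size=" << SizeBits << " alloc=" << AllocBits << "\n";
  return 0;
}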