diff --git a/llvm/include/llvm/LinkAllPasses.h b/llvm/include/llvm/LinkAllPasses.h index 58bb5ebea483..9e1bb2c5d23f 100644 --- a/llvm/include/llvm/LinkAllPasses.h +++ b/llvm/include/llvm/LinkAllPasses.h @@ -183,7 +183,7 @@ namespace { (void) llvm::createInstructionSimplifierPass(); (void) llvm::createLoopVectorizePass(); (void) llvm::createSLPVectorizerPass(); - (void) llvm::createLoadStoreVectorizerPass(128); + (void) llvm::createLoadStoreVectorizerPass(); (void) llvm::createBBVectorizePass(); (void) llvm::createPartiallyInlineLibCallsPass(); (void) llvm::createScalarizerPass(); diff --git a/llvm/include/llvm/Transforms/Vectorize.h b/llvm/include/llvm/Transforms/Vectorize.h index f132252295f7..f734e299c6e9 100644 --- a/llvm/include/llvm/Transforms/Vectorize.h +++ b/llvm/include/llvm/Transforms/Vectorize.h @@ -144,7 +144,7 @@ bool vectorizeBasicBlock(Pass *P, BasicBlock &BB, // LoadStoreVectorizer - Create vector loads and stores, but leave scalar // operations. // -Pass *createLoadStoreVectorizerPass(unsigned VecRegSize = 128); +Pass *createLoadStoreVectorizerPass(); } // End llvm namespace diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 37679fdf62da..0168d78145a8 100644 --- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -51,17 +51,18 @@ class Vectorizer { AliasAnalysis &AA; DominatorTree &DT; ScalarEvolution &SE; + TargetTransformInfo &TTI; const DataLayout &DL; IRBuilder<> Builder; ValueListMap StoreRefs; ValueListMap LoadRefs; - unsigned VecRegSize; public: Vectorizer(Function &F, AliasAnalysis &AA, DominatorTree &DT, - ScalarEvolution &SE, unsigned VecRegSize) - : F(F), AA(AA), DT(DT), SE(SE), DL(F.getParent()->getDataLayout()), - Builder(SE.getContext()), VecRegSize(VecRegSize) {} + ScalarEvolution &SE, TargetTransformInfo &TTI) + : F(F), AA(AA), DT(DT), SE(SE), TTI(TTI), + DL(F.getParent()->getDataLayout()), + Builder(SE.getContext()) {} bool run(); @@ -116,10 +117,8 @@ private: class LoadStoreVectorizer : public FunctionPass { public: static char ID; - unsigned VecRegSize; - LoadStoreVectorizer(unsigned VecRegSize = 128) : FunctionPass(ID), - VecRegSize(VecRegSize) { + LoadStoreVectorizer() : FunctionPass(ID) { initializeLoadStoreVectorizerPass(*PassRegistry::getPassRegistry()); } @@ -133,6 +132,7 @@ public: AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.setPreservesCFG(); } }; @@ -144,13 +144,14 @@ INITIALIZE_PASS_DEPENDENCY(SCEVAAWrapperPass) INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass) +INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoadStoreVectorizer, DEBUG_TYPE, "Vectorize load and store instructions", false, false); char LoadStoreVectorizer::ID = 0; -Pass *llvm::createLoadStoreVectorizerPass(unsigned VecRegSize) { - return new LoadStoreVectorizer(VecRegSize); +Pass *llvm::createLoadStoreVectorizerPass() { + return new LoadStoreVectorizer(); } bool LoadStoreVectorizer::runOnFunction(Function &F) { @@ -161,8 +162,10 @@ bool LoadStoreVectorizer::runOnFunction(Function &F) { AliasAnalysis &AA = getAnalysis().getAAResults(); DominatorTree &DT = getAnalysis().getDomTree(); ScalarEvolution &SE = getAnalysis().getSE(); + TargetTransformInfo &TTI + = getAnalysis().getTTI(F); - Vectorizer V(F, AA, DT, SE, VecRegSize); + Vectorizer V(F, AA, DT, SE, TTI); return V.run(); } @@ -440,6 +443,10 @@ void Vectorizer::collectInstructions(BasicBlock *BB) { if (TySize < 8) continue; + Value *Ptr = LI->getPointerOperand(); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); + // No point in looking at these if they're too big to vectorize. if (TySize > VecRegSize / 2) continue; @@ -456,8 +463,8 @@ void Vectorizer::collectInstructions(BasicBlock *BB) { // TODO: Target hook to filter types. // Save the load locations. - Value *Ptr = GetUnderlyingObject(LI->getPointerOperand(), DL); - LoadRefs[Ptr].push_back(LI); + Value *ObjPtr = GetUnderlyingObject(Ptr, DL); + LoadRefs[ObjPtr].push_back(LI); } else if (StoreInst *SI = dyn_cast(&I)) { if (!SI->isSimple()) @@ -473,6 +480,9 @@ void Vectorizer::collectInstructions(BasicBlock *BB) { if (TySize < 8) continue; + Value *Ptr = SI->getPointerOperand(); + unsigned AS = Ptr->getType()->getPointerAddressSpace(); + unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); if (TySize > VecRegSize / 2) continue; @@ -485,8 +495,8 @@ void Vectorizer::collectInstructions(BasicBlock *BB) { continue; // Save store location. - Value *Ptr = GetUnderlyingObject(SI->getPointerOperand(), DL); - StoreRefs[Ptr].push_back(SI); + Value *ObjPtr = GetUnderlyingObject(Ptr, DL); + StoreRefs[ObjPtr].push_back(SI); } } } @@ -592,6 +602,8 @@ bool Vectorizer::vectorizeStoreChain(ArrayRef Chain) { } unsigned Sz = DL.getTypeSizeInBits(StoreTy); + unsigned AS = S0->getPointerAddressSpace(); + unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); unsigned VF = VecRegSize / Sz; unsigned ChainSize = Chain.size(); @@ -664,7 +676,6 @@ bool Vectorizer::vectorizeStoreChain(ArrayRef Chain) { // Set insert point. Builder.SetInsertPoint(&*Last); - unsigned AS = S0->getPointerAddressSpace(); Value *Vec = UndefValue::get(VecTy); @@ -728,6 +739,8 @@ bool Vectorizer::vectorizeLoadChain(ArrayRef Chain) { } unsigned Sz = DL.getTypeSizeInBits(LoadTy); + unsigned AS = L0->getPointerAddressSpace(); + unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS); unsigned VF = VecRegSize / Sz; unsigned ChainSize = Chain.size(); @@ -798,7 +811,6 @@ bool Vectorizer::vectorizeLoadChain(ArrayRef Chain) { // Set insert point. Builder.SetInsertPoint(&*Last); - unsigned AS = L0->getPointerAddressSpace(); Value *Bitcast = Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS)); diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll new file mode 100644 index 000000000000..6651cd18867e --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll @@ -0,0 +1,51 @@ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4 -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT4 -check-prefix=ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8 -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT8 -check-prefix=ALL %s +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16 -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT16 -check-prefix=ALL %s + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32 +; ELT4: store i32 +; ELT4: store i32 +; ELT4: store i32 +; ELT4: store i32 + +; ELT8: store <2 x i32> +; ELT8: store <2 x i32> + +; ELT16: store <4 x i32> +define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 { + %out.gep.1 = getelementptr i32, i32* %out, i32 1 + %out.gep.2 = getelementptr i32, i32* %out, i32 2 + %out.gep.3 = getelementptr i32, i32* %out, i32 3 + + store i32 9, i32* %out + store i32 1, i32* %out.gep.1 + store i32 23, i32* %out.gep.2 + store i32 19, i32* %out.gep.3 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8( +; ALL: store <4 x i8> +define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 { + %out.gep.1 = getelementptr i8, i8* %out, i32 1 + %out.gep.2 = getelementptr i8, i8* %out, i32 2 + %out.gep.3 = getelementptr i8, i8* %out, i32 3 + + store i8 9, i8* %out + store i8 1, i8* %out.gep.1 + store i8 23, i8* %out.gep.2 + store i8 19, i8* %out.gep.3 + ret void +} + +; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16( +; ALL: store <2 x i16> +define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 { + %out.gep.1 = getelementptr i16, i16* %out, i32 1 + + store i16 9, i16* %out + store i16 12, i16* %out.gep.1 + ret void +} + +attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll index 57aa5ef6cefd..fefc0856a1aa 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll @@ -502,7 +502,8 @@ define void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) # } ; CHECK-LABEL: @merge_local_store_4_constants_i32 -; CHECK: store <4 x i32> , <4 x i32> addrspace(3)* +; CHECK: store <2 x i32> , <2 x i32> addrspace(3)* +; CHECK: store <2 x i32> , <2 x i32> addrspace(3)* define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 { %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1 %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2