diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 773777ac804f..a634d9ffb157 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -105,6 +105,29 @@ static bool StressIVChain = false;
 
 namespace {
 
+struct MemAccessTy {
+  /// Used in situations where the accessed memory type is unknown.
+  static const unsigned UnknownAddressSpace = ~0u;
+
+  Type *MemTy;
+  unsigned AddrSpace;
+
+  MemAccessTy() : MemTy(nullptr), AddrSpace(UnknownAddressSpace) {}
+
+  MemAccessTy(Type *Ty, unsigned AS) :
+    MemTy(Ty), AddrSpace(AS) {}
+
+  bool operator==(MemAccessTy Other) const {
+    return MemTy == Other.MemTy && AddrSpace == Other.AddrSpace;
+  }
+
+  bool operator!=(MemAccessTy Other) const { return !(*this == Other); }
+
+  static MemAccessTy getUnknown(LLVMContext &Ctx) {
+    return MemAccessTy(Type::getVoidTy(Ctx), UnknownAddressSpace);
+  }
+};
+
 /// RegSortData - This class holds data which is used to order reuse candidates.
 class RegSortData {
 public:
@@ -683,11 +706,14 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
 }
 
 /// getAccessType - Return the type of the memory being accessed.
-static Type *getAccessType(const Instruction *Inst) {
-  Type *AccessTy = Inst->getType();
-  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
-    AccessTy = SI->getOperand(0)->getType();
-  else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
+static MemAccessTy getAccessType(const Instruction *Inst) {
+  MemAccessTy AccessTy(Inst->getType(), MemAccessTy::UnknownAddressSpace);
+  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+    AccessTy.MemTy = SI->getOperand(0)->getType();
+    AccessTy.AddrSpace = SI->getPointerAddressSpace();
+  } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+    AccessTy.AddrSpace = LI->getPointerAddressSpace();
+  } else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
     // Addressing modes can also be folded into prefetches and a variety
     // of intrinsics.
     switch (II->getIntrinsicID()) {
@@ -696,16 +722,16 @@ static Type *getAccessType(const Instruction *Inst) {
     case Intrinsic::x86_sse2_storeu_pd:
     case Intrinsic::x86_sse2_storeu_dq:
     case Intrinsic::x86_sse2_storel_dq:
-      AccessTy = II->getArgOperand(0)->getType();
+      AccessTy.MemTy = II->getArgOperand(0)->getType();
       break;
     }
   }
 
   // All pointers have the same requirements, so canonicalize them to an
   // arbitrary pointer type to minimize variation.
-  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy))
-    AccessTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
-                                PTy->getAddressSpace());
+  if (PointerType *PTy = dyn_cast<PointerType>(AccessTy.MemTy))
+    AccessTy.MemTy = PointerType::get(IntegerType::get(PTy->getContext(), 1),
+                                      PTy->getAddressSpace());
 
   return AccessTy;
 }
@@ -1204,7 +1230,7 @@ public:
   typedef PointerIntPair<const SCEV *, 2, KindType> SCEVUseKindPair;
 
   KindType Kind;
-  Type *AccessTy;
+  MemAccessTy AccessTy;
 
   SmallVector<int64_t, 8> Offsets;
   int64_t MinOffset;
@@ -1236,12 +1262,10 @@ public:
   /// Regs - The set of register candidates used by all formulae in this
   /// LSRUse.
   SmallPtrSet<const SCEV *, 4> Regs;
 
-  LSRUse(KindType K, Type *T) : Kind(K), AccessTy(T),
-                                MinOffset(INT64_MAX),
-                                MaxOffset(INT64_MIN),
-                                AllFixupsOutsideLoop(true),
-                                RigidFormula(false),
-                                WidestFixupType(nullptr) {}
+  LSRUse(KindType K, MemAccessTy AT)
+      : Kind(K), AccessTy(AT), MinOffset(INT64_MAX), MaxOffset(INT64_MIN),
+        AllFixupsOutsideLoop(true), RigidFormula(false),
+        WidestFixupType(nullptr) {}
 
   bool HasFormulaWithSameRegs(const Formula &F) const;
   bool InsertFormula(const Formula &F);
@@ -1331,10 +1355,13 @@ void LSRUse::print(raw_ostream &OS) const {
   case ICmpZero: OS << "ICmpZero"; break;
   case Address:
     OS << "Address of ";
-    if (AccessTy->isPointerTy())
+    if (AccessTy.MemTy->isPointerTy())
      OS << "pointer"; // the full pointer type could be really verbose
-    else
-      OS << *AccessTy;
+    else {
+      OS << *AccessTy.MemTy;
+    }
+
+    OS << " in addrspace(" << AccessTy.AddrSpace << ')';
   }
 
   OS << ", Offsets={";
@@ -1360,12 +1387,13 @@ void LSRUse::dump() const {
 #endif
 
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
-                                 LSRUse::KindType Kind, Type *AccessTy,
+                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
                                  GlobalValue *BaseGV, int64_t BaseOffset,
                                  bool HasBaseReg, int64_t Scale) {
   switch (Kind) {
   case LSRUse::Address:
-    return TTI.isLegalAddressingMode(AccessTy, BaseGV, BaseOffset, HasBaseReg, Scale);
+    return TTI.isLegalAddressingMode(AccessTy.MemTy, BaseGV, BaseOffset,
+                                     HasBaseReg, Scale, AccessTy.AddrSpace);
 
   case LSRUse::ICmpZero:
     // There's not even a target hook for querying whether it would be legal to
@@ -1412,7 +1440,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  int64_t MinOffset, int64_t MaxOffset,
-                                 LSRUse::KindType Kind, Type *AccessTy,
+                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
                                  GlobalValue *BaseGV, int64_t BaseOffset,
                                  bool HasBaseReg, int64_t Scale) {
   // Check for overflow.
@@ -1433,7 +1461,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  int64_t MinOffset, int64_t MaxOffset,
-                                 LSRUse::KindType Kind, Type *AccessTy,
+                                 LSRUse::KindType Kind, MemAccessTy AccessTy,
                                  const Formula &F) {
   // For the purpose of isAMCompletelyFolded either having a canonical formula
   // or a scale not equal to zero is correct.
@@ -1449,9 +1477,9 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 
 /// isLegalUse - Test whether we know how to expand the current formula.
 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
-                       int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
-                       GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg,
-                       int64_t Scale) {
+                       int64_t MaxOffset, LSRUse::KindType Kind,
+                       MemAccessTy AccessTy, GlobalValue *BaseGV,
+                       int64_t BaseOffset, bool HasBaseReg, int64_t Scale) {
   // We know how to expand completely foldable formulae.
   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
                               BaseGV, BaseOffset, HasBaseReg, Scale) ||
@@ -1463,8 +1491,8 @@ static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
 }
 
 static bool isLegalUse(const TargetTransformInfo &TTI, int64_t MinOffset,
-                       int64_t MaxOffset, LSRUse::KindType Kind, Type *AccessTy,
-                       const Formula &F) {
+                       int64_t MaxOffset, LSRUse::KindType Kind,
+                       MemAccessTy AccessTy, const Formula &F) {
   return isLegalUse(TTI, MinOffset, MaxOffset, Kind, AccessTy, F.BaseGV,
                     F.BaseOffset, F.HasBaseReg, F.Scale);
 }
@@ -1490,14 +1518,12 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
   switch (LU.Kind) {
   case LSRUse::Address: {
     // Check the scaling factor cost with both the min and max offsets.
-    int ScaleCostMinOffset =
-      TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV,
-                               F.BaseOffset + LU.MinOffset,
-                               F.HasBaseReg, F.Scale);
-    int ScaleCostMaxOffset =
-      TTI.getScalingFactorCost(LU.AccessTy, F.BaseGV,
-                               F.BaseOffset + LU.MaxOffset,
-                               F.HasBaseReg, F.Scale);
+    int ScaleCostMinOffset = TTI.getScalingFactorCost(
+        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MinOffset, F.HasBaseReg,
+        F.Scale, LU.AccessTy.AddrSpace);
+    int ScaleCostMaxOffset = TTI.getScalingFactorCost(
+        LU.AccessTy.MemTy, F.BaseGV, F.BaseOffset + LU.MaxOffset, F.HasBaseReg,
+        F.Scale, LU.AccessTy.AddrSpace);
 
     assert(ScaleCostMinOffset >= 0 && ScaleCostMaxOffset >= 0 &&
            "Legal addressing mode has an illegal cost!");
@@ -1515,7 +1541,7 @@ static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
 }
 
 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
-                             LSRUse::KindType Kind, Type *AccessTy,
+                             LSRUse::KindType Kind, MemAccessTy AccessTy,
                              GlobalValue *BaseGV, int64_t BaseOffset,
                              bool HasBaseReg) {
   // Fast-path: zero is always foldable.
@@ -1539,7 +1565,8 @@ static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
 static bool isAlwaysFoldable(const TargetTransformInfo &TTI,
                              ScalarEvolution &SE, int64_t MinOffset,
                              int64_t MaxOffset, LSRUse::KindType Kind,
-                             Type *AccessTy, const SCEV *S, bool HasBaseReg) {
+                             MemAccessTy AccessTy, const SCEV *S,
+                             bool HasBaseReg) {
   // Fast-path: zero is always foldable.
   if (S->isZero()) return true;
 
@@ -1696,11 +1723,10 @@ class LSRInstance {
   UseMapTy UseMap;
 
   bool reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
-                          LSRUse::KindType Kind, Type *AccessTy);
+                          LSRUse::KindType Kind, MemAccessTy AccessTy);
 
-  std::pair<size_t, int64_t> getUse(const SCEV *&Expr,
-                                    LSRUse::KindType Kind,
-                                    Type *AccessTy);
+  std::pair<size_t, int64_t> getUse(const SCEV *&Expr, LSRUse::KindType Kind,
+                                    MemAccessTy AccessTy);
 
   void DeleteUse(LSRUse &LU, size_t LUIdx);
 
@@ -2152,16 +2178,18 @@ LSRInstance::OptimizeLoopTermCond() {
                 C->getValue().isMinSignedValue())
               goto decline_post_inc;
             // Check for possible scaled-address reuse.
-            Type *AccessTy = getAccessType(UI->getUser());
+            MemAccessTy AccessTy = getAccessType(UI->getUser());
             int64_t Scale = C->getSExtValue();
-            if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr,
-                                          /*BaseOffset=*/ 0,
-                                          /*HasBaseReg=*/ false, Scale))
+            if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+                                          /*BaseOffset=*/0,
+                                          /*HasBaseReg=*/false, Scale,
+                                          AccessTy.AddrSpace))
               goto decline_post_inc;
             Scale = -Scale;
-            if (TTI.isLegalAddressingMode(AccessTy, /*BaseGV=*/ nullptr,
-                                          /*BaseOffset=*/ 0,
-                                          /*HasBaseReg=*/ false, Scale))
+            if (TTI.isLegalAddressingMode(AccessTy.MemTy, /*BaseGV=*/nullptr,
+                                          /*BaseOffset=*/0,
+                                          /*HasBaseReg=*/false, Scale,
+                                          AccessTy.AddrSpace))
               goto decline_post_inc;
           }
         }
@@ -2216,12 +2244,12 @@ LSRInstance::OptimizeLoopTermCond() {
 /// reconcileNewOffset - Determine if the given use can accommodate a fixup
 /// at the given offset and other details. If so, update the use and
 /// return true.
-bool
-LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
-                                LSRUse::KindType Kind, Type *AccessTy) {
+bool LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset,
+                                     bool HasBaseReg, LSRUse::KindType Kind,
+                                     MemAccessTy AccessTy) {
   int64_t NewMinOffset = LU.MinOffset;
   int64_t NewMaxOffset = LU.MaxOffset;
-  Type *NewAccessTy = AccessTy;
+  MemAccessTy NewAccessTy = AccessTy;
 
   // Check for a mismatched kind. It's tempting to collapse mismatched kinds to
   // something conservative, however this can pessimize in the case that one of
@@ -2232,8 +2260,10 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
   // Check for a mismatched access type, and fall back conservatively as needed.
   // TODO: Be less conservative when the type is similar and can use the same
   // addressing modes.
-  if (Kind == LSRUse::Address && AccessTy != LU.AccessTy)
-    NewAccessTy = Type::getVoidTy(AccessTy->getContext());
+  if (Kind == LSRUse::Address) {
+    if (AccessTy != LU.AccessTy)
+      NewAccessTy = MemAccessTy::getUnknown(AccessTy.MemTy->getContext());
+  }
 
   // Conservatively assume HasBaseReg is true for now.
   if (NewOffset < LU.MinOffset) {
@@ -2260,9 +2290,9 @@ LSRInstance::reconcileNewOffset(LSRUse &LU, int64_t NewOffset, bool HasBaseReg,
 /// getUse - Return an LSRUse index and an offset value for a fixup which
 /// needs the given expression, with the given kind and optional access type.
 /// Either reuse an existing use or create a new one, as needed.
-std::pair<size_t, int64_t>
-LSRInstance::getUse(const SCEV *&Expr,
-                    LSRUse::KindType Kind, Type *AccessTy) {
+std::pair<size_t, int64_t> LSRInstance::getUse(const SCEV *&Expr,
+                                               LSRUse::KindType Kind,
+                                               MemAccessTy AccessTy) {
   const SCEV *Copy = Expr;
   int64_t Offset = ExtractImmediate(Expr, SE);
 
@@ -2831,10 +2861,10 @@ static bool canFoldIVIncExpr(const SCEV *IncExpr, Instruction *UserInst,
   if (IncConst->getValue()->getValue().getMinSignedBits() > 64)
     return false;
 
+  MemAccessTy AccessTy = getAccessType(UserInst);
   int64_t IncOffset = IncConst->getValue()->getSExtValue();
-  if (!isAlwaysFoldable(TTI, LSRUse::Address,
-                        getAccessType(UserInst), /*BaseGV=*/ nullptr,
-                        IncOffset, /*HaseBaseReg=*/ false))
+  if (!isAlwaysFoldable(TTI, LSRUse::Address, AccessTy, /*BaseGV=*/nullptr,
+                        IncOffset, /*HaseBaseReg=*/false))
     return false;
 
   return true;
@@ -2961,7 +2991,7 @@ void LSRInstance::CollectFixupsAndInitialFormulae() {
     LF.PostIncLoops = U.getPostIncLoops();
 
     LSRUse::KindType Kind = LSRUse::Basic;
-    Type *AccessTy = nullptr;
+    MemAccessTy AccessTy;
     if (isAddressUse(LF.UserInst, LF.OperandValToReplace)) {
       Kind = LSRUse::Address;
      AccessTy = getAccessType(LF.UserInst);
@@ -3148,7 +3178,8 @@ LSRInstance::CollectLoopInvariantFixupsAndFormulae() {
         LSRFixup &LF = getNewFixup();
         LF.UserInst = const_cast<Instruction *>(UserInst);
         LF.OperandValToReplace = U;
-        std::pair<size_t, int64_t> P = getUse(S, LSRUse::Basic, nullptr);
+        std::pair<size_t, int64_t> P = getUse(
+            S, LSRUse::Basic, MemAccessTy());
         LF.LUIdx = P.first;
         LF.Offset = P.second;
         LSRUse &LU = Uses[LF.LUIdx];
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
new file mode 100644
index 000000000000..bf61112a3c3e
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
@@ -0,0 +1,156 @@
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=bonaire -loop-reduce < %s | FileCheck -check-prefix=OPT %s
+
+; Test that loops with different maximum offsets for different address
+; spaces are correctly handled.
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_i32(
+; OPT: {{^}}.lr.ph:
+; OPT: %lsr.iv2 = phi i8 addrspace(1)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv2, i64 4095
+; OPT: load i8, i8 addrspace(1)* %scevgep4, align 1
+define void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+  %tmp = icmp sgt i32 %n, 0
+  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %bb
+  br label %.lr.ph
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
+  ret void
+
+.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %tmp1 = add nuw nsw i64 %indvars.iv, 4095
+  %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %tmp1
+  %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
+  %tmp4 = sext i8 %tmp3 to i32
+  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %indvars.iv
+  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
+  %tmp7 = add nsw i32 %tmp6, %tmp4
+  store i32 %tmp7, i32 addrspace(1)* %tmp5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: @test_global_addressing_loop_uniform_index_max_offset_p1_i32(
+; OPT: {{^}}.lr.ph.preheader:
+; OPT: %scevgep2 = getelementptr i8, i8 addrspace(1)* %arg1, i64 4096
+; OPT: br label %.lr.ph
+
+; OPT: {{^}}.lr.ph:
+; OPT: %lsr.iv3 = phi i8 addrspace(1)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv3, i64 1
+define void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+  %tmp = icmp sgt i32 %n, 0
+  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %bb
+  br label %.lr.ph
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
+  ret void
+
+.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %tmp1 = add nuw nsw i64 %indvars.iv, 4096
+  %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg1, i64 %tmp1
+  %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
+  %tmp4 = sext i8 %tmp3 to i32
+  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %indvars.iv
+  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
+  %tmp7 = add nsw i32 %tmp6, %tmp4
+  store i32 %tmp7, i32 addrspace(1)* %tmp5, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_i32(
+; OPT: {{^}}.lr.ph
+; OPT: %lsr.iv2 = phi i8 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv2, i32 65535
+; OPT: %tmp4 = load i8, i8 addrspace(3)* %scevgep4, align 1
+define void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+  %tmp = icmp sgt i32 %n, 0
+  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %bb
+  br label %.lr.ph
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
+  ret void
+
+.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %tmp1 = add nuw nsw i64 %indvars.iv, 65535
+  %tmp2 = trunc i64 %tmp1 to i32
+  %tmp3 = getelementptr inbounds i8, i8 addrspace(3)* %arg1, i32 %tmp2
+  %tmp4 = load i8, i8 addrspace(3)* %tmp3, align 1
+  %tmp5 = sext i8 %tmp4 to i32
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %indvars.iv
+  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp8 = add nsw i32 %tmp7, %tmp5
+  store i32 %tmp8, i32 addrspace(1)* %tmp6, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: @test_local_addressing_loop_uniform_index_max_offset_p1_i32(
+; OPT: {{^}}.lr.ph.preheader:
+; OPT: %scevgep2 = getelementptr i8, i8 addrspace(3)* %arg1, i32 65536
+; OPT: br label %.lr.ph
+
+; OPT: {{^}}.lr.ph:
+; OPT: %lsr.iv3 = phi i8 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv3, i32 1
+define void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+  %tmp = icmp sgt i32 %n, 0
+  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %bb
+  br label %.lr.ph
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
+  ret void
+
+.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %tmp1 = add nuw nsw i64 %indvars.iv, 65536
+  %tmp2 = trunc i64 %tmp1 to i32
+  %tmp3 = getelementptr inbounds i8, i8 addrspace(3)* %arg1, i32 %tmp2
+  %tmp4 = load i8, i8 addrspace(3)* %tmp3, align 1
+  %tmp5 = sext i8 %tmp4 to i32
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %indvars.iv
+  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp8 = add nsw i32 %tmp7, %tmp5
+  store i32 %tmp8, i32 addrspace(1)* %tmp6, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hawaii" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lit.local.cfg b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lit.local.cfg
new file mode 100644
index 000000000000..6baccf05fff0
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
+
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
new file mode 100644
index 000000000000..bd80302a68b8
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
@@ -0,0 +1,113 @@
+; RUN: llc -march=amdgcn -mcpu=bonaire -print-lsr-output < %s 2>&1 | FileCheck %s
+
+; Test various conditions where OptimizeLoopTermCond doesn't look at a
+; memory instruction use and fails to find the address space.
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+; CHECK-LABEL: @local_cmp_user(
+; CHECK: bb11:
+; CHECK: %lsr.iv1 = phi i32 [ %lsr.iv.next2, %bb ], [ -2, %entry ]
+; CHECK: %lsr.iv = phi i32 [ %lsr.iv.next, %bb ], [ undef, %entry ]
+
+; CHECK: bb:
+; CHECK: %lsr.iv.next = add i32 %lsr.iv, -1
+; CHECK: %lsr.iv.next2 = add i32 %lsr.iv1, 2
+; CHECK: %scevgep = getelementptr i8, i8 addrspace(3)* %t, i32 %lsr.iv.next2
+; CHECK: %c1 = icmp ult i8 addrspace(3)* %scevgep, undef
+define void @local_cmp_user() nounwind {
+entry:
+  br label %bb11
+
+bb11:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %bb ]
+  %ii = shl i32 %i, 1
+  %c0 = icmp eq i32 %i, undef
+  br i1 %c0, label %bb13, label %bb
+
+bb:
+  %t = load i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* undef
+  %p = getelementptr i8, i8 addrspace(3)* %t, i32 %ii
+  %c1 = icmp ult i8 addrspace(3)* %p, undef
+  %i.next = add i32 %i, 1
+  br i1 %c1, label %bb11, label %bb13
+
+bb13:
+  unreachable
+}
+
+; CHECK-LABEL: @global_cmp_user(
+; CHECK: %lsr.iv.next = add i64 %lsr.iv, -1
+; CHECK: %lsr.iv.next2 = add i64 %lsr.iv1, 2
+; CHECK: %scevgep = getelementptr i8, i8 addrspace(1)* %t, i64 %lsr.iv.next2
+define void @global_cmp_user() nounwind {
+entry:
+  br label %bb11
+
+bb11:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %bb ]
+  %ii = shl i64 %i, 1
+  %c0 = icmp eq i64 %i, undef
+  br i1 %c0, label %bb13, label %bb
+
+bb:
+  %t = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* undef
+  %p = getelementptr i8, i8 addrspace(1)* %t, i64 %ii
+  %c1 = icmp ult i8 addrspace(1)* %p, undef
+  %i.next = add i64 %i, 1
+  br i1 %c1, label %bb11, label %bb13
+
+bb13:
+  unreachable
+}
+
+; CHECK-LABEL: @global_gep_user(
+; CHECK: %p = getelementptr i8, i8 addrspace(1)* %t, i32 %lsr.iv1
+; CHECK: %lsr.iv.next = add i32 %lsr.iv, -1
+; CHECK: %lsr.iv.next2 = add i32 %lsr.iv1, 2
+define void @global_gep_user() nounwind {
+entry:
+  br label %bb11
+
+bb11:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %bb ]
+  %ii = shl i32 %i, 1
+  %c0 = icmp eq i32 %i, undef
+  br i1 %c0, label %bb13, label %bb
+
+bb:
+  %t = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* undef
+  %p = getelementptr i8, i8 addrspace(1)* %t, i32 %ii
+  %c1 = icmp ult i8 addrspace(1)* %p, undef
+  %i.next = add i32 %i, 1
+  br i1 %c1, label %bb11, label %bb13
+
+bb13:
+  unreachable
+}
+
+; CHECK-LABEL: @global_sext_scale_user(
+; CHECK: %p = getelementptr i8, i8 addrspace(1)* %t, i64 %ii.ext
+; CHECK: %lsr.iv.next = add i32 %lsr.iv, -1
+; CHECK: %lsr.iv.next2 = add i32 %lsr.iv1, 2
+define void @global_sext_scale_user() nounwind {
+entry:
+  br label %bb11
+
+bb11:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %bb ]
+  %ii = shl i32 %i, 1
+  %ii.ext = sext i32 %ii to i64
+  %c0 = icmp eq i32 %i, undef
+  br i1 %c0, label %bb13, label %bb
+
+bb:
+  %t = load i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* undef
+  %p = getelementptr i8, i8 addrspace(1)* %t, i64 %ii.ext
+  %c1 = icmp ult i8 addrspace(1)* %p, undef
+  %i.next = add i32 %i, 1
+  br i1 %c1, label %bb11, label %bb13
+
+bb13:
+  unreachable
+}
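
Illustrative sketch, not part of the patch: with the address space carried in MemAccessTy, a target's isLegalAddressingMode hook can accept different immediate-offset ranges per address space, which is what the AMDGPU tests above exercise (an offset of 4095 folds for addrspace(1) but 4096 does not; 65535 folds for addrspace(3) but 65536 does not). The helper name and the exact limits below are assumptions taken from those tests, not the target's real implementation.

#include <cstdint>

// Per-address-space immediate-offset check of the kind the new AddrSpace
// parameter enables. The address-space numbers and limits mirror the tests
// above and are illustrative only.
static bool isLegalOffsetForAddrSpace(int64_t BaseOffset, unsigned AddrSpace) {
  switch (AddrSpace) {
  case 1:  // Global memory in the tests: offsets up to 4095 fold.
    return BaseOffset >= 0 && BaseOffset <= 4095;
  case 3:  // Local (LDS) memory in the tests: offsets up to 65535 fold.
    return BaseOffset >= 0 && BaseOffset <= 65535;
  default: // MemAccessTy::UnknownAddressSpace (~0u): be conservative.
    return BaseOffset == 0;
  }
}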