Utils: Always set alignment when expanding mem intrinsics

This was creating naturally aligned loads and stores, but the pointers
may not actually be naturally aligned for the access type. The target
could request a wider type load with less alignment.
This commit is contained in:
Matt Arsenault 2020-02-14 15:39:44 -05:00 committed by Matt Arsenault
parent 05e7d8d6ce
commit b0bdb186f5
2 changed files with 80 additions and 68 deletions

View File

@ -14,17 +14,9 @@
using namespace llvm; using namespace llvm;
static unsigned getLoopOperandSizeInBytes(Type *Type) {
if (VectorType *VTy = dyn_cast<VectorType>(Type)) {
return VTy->getBitWidth() / 8;
}
return Type->getPrimitiveSizeInBits() / 8;
}
void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr, void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
Value *DstAddr, ConstantInt *CopyLen, Value *DstAddr, ConstantInt *CopyLen,
unsigned SrcAlign, unsigned DestAlign, unsigned SrcAlign, unsigned DstAlign,
bool SrcIsVolatile, bool DstIsVolatile, bool SrcIsVolatile, bool DstIsVolatile,
const TargetTransformInfo &TTI) { const TargetTransformInfo &TTI) {
// No need to expand zero length copies. // No need to expand zero length copies.
@ -35,15 +27,16 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
BasicBlock *PostLoopBB = nullptr; BasicBlock *PostLoopBB = nullptr;
Function *ParentFunc = PreLoopBB->getParent(); Function *ParentFunc = PreLoopBB->getParent();
LLVMContext &Ctx = PreLoopBB->getContext(); LLVMContext &Ctx = PreLoopBB->getContext();
const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace(); unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
Type *TypeOfCopyLen = CopyLen->getType(); Type *TypeOfCopyLen = CopyLen->getType();
Type *LoopOpType = Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DestAlign); SrcAlign, DstAlign);
unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType); unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize; uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
if (LoopEndCount != 0) { if (LoopEndCount != 0) {
@ -66,16 +59,20 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType); DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
} }
Align PartDstAlign(MinAlign(DstAlign, LoopOpSize));
Align PartSrcAlign(MinAlign(SrcAlign, LoopOpSize));
IRBuilder<> LoopBuilder(LoopBB); IRBuilder<> LoopBuilder(LoopBB);
PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index"); PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB); LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
// Loop Body // Loop Body
Value *SrcGEP = Value *SrcGEP =
LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex); LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
Value *Load = LoopBuilder.CreateLoad(LoopOpType, SrcGEP, SrcIsVolatile); Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
PartSrcAlign, SrcIsVolatile);
Value *DstGEP = Value *DstGEP =
LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex); LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile); LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
Value *NewIndex = Value *NewIndex =
LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U)); LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
@ -93,18 +90,16 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI() IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
: InsertBefore); : InsertBefore);
// Update the alignment based on the copy size used in the loop body.
SrcAlign = std::min(SrcAlign, LoopOpSize);
DestAlign = std::min(DestAlign, LoopOpSize);
SmallVector<Type *, 5> RemainingOps; SmallVector<Type *, 5> RemainingOps;
TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes, TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
SrcAS, DstAS, SrcAS, DstAS, SrcAlign, DstAlign);
SrcAlign, DestAlign);
for (auto OpTy : RemainingOps) { for (auto OpTy : RemainingOps) {
Align PartSrcAlign(MinAlign(SrcAlign, BytesCopied));
Align PartDstAlign(MinAlign(DstAlign, BytesCopied));
// Calaculate the new index // Calaculate the new index
unsigned OperandSize = getLoopOperandSizeInBytes(OpTy); unsigned OperandSize = DL.getTypeStoreSize(OpTy);
uint64_t GepIndex = BytesCopied / OperandSize; uint64_t GepIndex = BytesCopied / OperandSize;
assert(GepIndex * OperandSize == BytesCopied && assert(GepIndex * OperandSize == BytesCopied &&
"Division should have no Remainder!"); "Division should have no Remainder!");
@ -115,7 +110,8 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
: RBuilder.CreateBitCast(SrcAddr, SrcPtrType); : RBuilder.CreateBitCast(SrcAddr, SrcPtrType);
Value *SrcGEP = RBuilder.CreateInBoundsGEP( Value *SrcGEP = RBuilder.CreateInBoundsGEP(
OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex)); OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex));
Value *Load = RBuilder.CreateLoad(OpTy, SrcGEP, SrcIsVolatile); Value *Load =
RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
// Cast destination to operand type and store. // Cast destination to operand type and store.
PointerType *DstPtrType = PointerType::get(OpTy, DstAS); PointerType *DstPtrType = PointerType::get(OpTy, DstAS);
@ -124,7 +120,7 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
: RBuilder.CreateBitCast(DstAddr, DstPtrType); : RBuilder.CreateBitCast(DstAddr, DstPtrType);
Value *DstGEP = RBuilder.CreateInBoundsGEP( Value *DstGEP = RBuilder.CreateInBoundsGEP(
OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex)); OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex));
RBuilder.CreateStore(Load, DstGEP, DstIsVolatile); RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
BytesCopied += OperandSize; BytesCopied += OperandSize;
} }
@ -136,7 +132,7 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore, void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
Value *SrcAddr, Value *DstAddr, Value *SrcAddr, Value *DstAddr,
Value *CopyLen, unsigned SrcAlign, Value *CopyLen, unsigned SrcAlign,
unsigned DestAlign, bool SrcIsVolatile, unsigned DstAlign, bool SrcIsVolatile,
bool DstIsVolatile, bool DstIsVolatile,
const TargetTransformInfo &TTI) { const TargetTransformInfo &TTI) {
BasicBlock *PreLoopBB = InsertBefore->getParent(); BasicBlock *PreLoopBB = InsertBefore->getParent();
@ -144,13 +140,14 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion"); PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
Function *ParentFunc = PreLoopBB->getParent(); Function *ParentFunc = PreLoopBB->getParent();
const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
LLVMContext &Ctx = PreLoopBB->getContext(); LLVMContext &Ctx = PreLoopBB->getContext();
unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace(); unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace(); unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
Type *LoopOpType = Type *LoopOpType = TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS,
TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAS, DstAS, SrcAlign, DestAlign); SrcAlign, DstAlign);
unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType); unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
IRBuilder<> PLBuilder(PreLoopBB->getTerminator()); IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
@ -178,13 +175,17 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB); BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
IRBuilder<> LoopBuilder(LoopBB); IRBuilder<> LoopBuilder(LoopBB);
Align PartSrcAlign(MinAlign(SrcAlign, LoopOpSize));
Align PartDstAlign(MinAlign(DstAlign, LoopOpSize));
PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index"); PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB); LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex); Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
Value *Load = LoopBuilder.CreateLoad(LoopOpType, SrcGEP, SrcIsVolatile); Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, PartSrcAlign,
SrcIsVolatile);
Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex); Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile); LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
Value *NewIndex = Value *NewIndex =
LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U)); LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
@ -235,10 +236,11 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex); Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
Value *SrcGEP = Value *SrcGEP =
ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset); ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
Value *Load = ResBuilder.CreateLoad(Int8Type, SrcGEP, SrcIsVolatile); Value *Load = ResBuilder.CreateAlignedLoad(Int8Type, SrcGEP, PartSrcAlign,
SrcIsVolatile);
Value *DstGEP = Value *DstGEP =
ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset); ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
ResBuilder.CreateStore(Load, DstGEP, DstIsVolatile); ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
Value *ResNewIndex = Value *ResNewIndex =
ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U)); ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
@ -285,13 +287,14 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
// } // }
// return dst; // return dst;
// } // }
static void createMemMoveLoop(Instruction *InsertBefore, static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
Value *SrcAddr, Value *DstAddr, Value *CopyLen, Value *DstAddr, Value *CopyLen, unsigned SrcAlign,
unsigned SrcAlign, unsigned DestAlign, unsigned DstAlign, bool SrcIsVolatile,
bool SrcIsVolatile, bool DstIsVolatile) { bool DstIsVolatile) {
Type *TypeOfCopyLen = CopyLen->getType(); Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent(); BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent(); Function *F = OrigBB->getParent();
const DataLayout &DL = F->getParent()->getDataLayout();
Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType(); Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType();
@ -319,6 +322,10 @@ static void createMemMoveLoop(Instruction *InsertBefore,
BasicBlock *ExitBB = InsertBefore->getParent(); BasicBlock *ExitBB = InsertBefore->getParent();
ExitBB->setName("memmove_done"); ExitBB->setName("memmove_done");
unsigned PartSize = DL.getTypeStoreSize(EltTy);
Align PartSrcAlign(MinAlign(SrcAlign, PartSize));
Align PartDstAlign(MinAlign(DstAlign, PartSize));
// Initial comparison of n == 0 that lets us skip the loops altogether. Shared // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
// between both backwards and forward copy clauses. // between both backwards and forward copy clauses.
ICmpInst *CompareN = ICmpInst *CompareN =
@ -332,11 +339,12 @@ static void createMemMoveLoop(Instruction *InsertBefore,
PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
Value *IndexPtr = LoopBuilder.CreateSub( Value *IndexPtr = LoopBuilder.CreateSub(
LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr"); LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
Value *Element = LoopBuilder.CreateLoad( Value *Element = LoopBuilder.CreateAlignedLoad(
EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr), EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr),
"element"); PartSrcAlign, "element");
LoopBuilder.CreateStore( LoopBuilder.CreateAlignedStore(
Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr)); Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr),
PartDstAlign);
LoopBuilder.CreateCondBr( LoopBuilder.CreateCondBr(
LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)), LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
ExitBB, LoopBB); ExitBB, LoopBB);
@ -350,11 +358,11 @@ static void createMemMoveLoop(Instruction *InsertBefore,
BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB); BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
IRBuilder<> FwdLoopBuilder(FwdLoopBB); IRBuilder<> FwdLoopBuilder(FwdLoopBB);
PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr"); PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
Value *FwdElement = FwdLoopBuilder.CreateLoad( Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi);
EltTy, FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi), Value *FwdElement =
"element"); FwdLoopBuilder.CreateAlignedLoad(EltTy, SrcGEP, PartSrcAlign, "element");
FwdLoopBuilder.CreateStore( Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi);
FwdElement, FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi)); FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign);
Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd( Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment"); FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen), FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
@ -366,12 +374,13 @@ static void createMemMoveLoop(Instruction *InsertBefore,
ElseTerm->eraseFromParent(); ElseTerm->eraseFromParent();
} }
static void createMemSetLoop(Instruction *InsertBefore, static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
Value *DstAddr, Value *CopyLen, Value *SetValue, Value *CopyLen, Value *SetValue, unsigned DstAlign,
unsigned Align, bool IsVolatile) { bool IsVolatile) {
Type *TypeOfCopyLen = CopyLen->getType(); Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent(); BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent(); Function *F = OrigBB->getParent();
const DataLayout &DL = F->getParent()->getDataLayout();
BasicBlock *NewBB = BasicBlock *NewBB =
OrigBB->splitBasicBlock(InsertBefore, "split"); OrigBB->splitBasicBlock(InsertBefore, "split");
BasicBlock *LoopBB BasicBlock *LoopBB
@ -389,14 +398,17 @@ static void createMemSetLoop(Instruction *InsertBefore,
LoopBB); LoopBB);
OrigBB->getTerminator()->eraseFromParent(); OrigBB->getTerminator()->eraseFromParent();
unsigned PartSize = DL.getTypeStoreSize(SetValue->getType());
Align PartAlign(MinAlign(DstAlign, PartSize));
IRBuilder<> LoopBuilder(LoopBB); IRBuilder<> LoopBuilder(LoopBB);
PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0); PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB); LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
LoopBuilder.CreateStore( LoopBuilder.CreateAlignedStore(
SetValue, SetValue,
LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex), LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
IsVolatile); PartAlign, IsVolatile);
Value *NewIndex = Value *NewIndex =
LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1)); LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));

View File

@ -24,9 +24,9 @@ define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)*
; OPT: load-store-loop: ; OPT: load-store-loop:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ] ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP4:%.*]], [[LOAD_STORE_LOOP]] ]
; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP2:%.*]] = load i8, i8 addrspace(1)* [[TMP1]] ; OPT-NEXT: [[TMP2:%.*]] = load i8, i8 addrspace(1)* [[TMP1]], align 1
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store i8 [[TMP2]], i8 addrspace(1)* [[TMP3]] ; OPT-NEXT: store i8 [[TMP2]], i8 addrspace(1)* [[TMP3]], align 1
; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP4]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1025 ; OPT-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 1025
; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]] ; OPT-NEXT: br i1 [[TMP5]], label [[LOAD_STORE_LOOP]], label [[MEMCPY_SPLIT:%.*]]
@ -57,9 +57,9 @@ define amdgpu_kernel void @min_size_large_static_memmove_caller0(i8 addrspace(1)
; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1025, [[COPY_BACKWARDS]] ] ; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ [[INDEX_PTR:%.*]], [[COPY_BACKWARDS_LOOP]] ], [ 1025, [[COPY_BACKWARDS]] ]
; OPT-NEXT: [[INDEX_PTR]] = sub i64 [[TMP1]], 1 ; OPT-NEXT: [[INDEX_PTR]] = sub i64 [[TMP1]], 1
; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR]] ; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR]]
; OPT-NEXT: [[ELEMENT:%.*]] = load i8, i8 addrspace(1)* [[TMP2]] ; OPT-NEXT: [[ELEMENT:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR]] ; OPT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR]]
; OPT-NEXT: store i8 [[ELEMENT]], i8 addrspace(1)* [[TMP3]] ; OPT-NEXT: store i8 [[ELEMENT]], i8 addrspace(1)* [[TMP3]], align 1
; OPT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0 ; OPT-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_PTR]], 0
; OPT-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]] ; OPT-NEXT: br i1 [[TMP4]], label [[MEMMOVE_DONE]], label [[COPY_BACKWARDS_LOOP]]
; OPT: copy_forward: ; OPT: copy_forward:
@ -67,9 +67,9 @@ define amdgpu_kernel void @min_size_large_static_memmove_caller0(i8 addrspace(1)
; OPT: copy_forward_loop: ; OPT: copy_forward_loop:
; OPT-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ] ; OPT-NEXT: [[INDEX_PTR1:%.*]] = phi i64 [ [[INDEX_INCREMENT:%.*]], [[COPY_FORWARD_LOOP]] ], [ 0, [[COPY_FORWARD]] ]
; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR1]] ; OPT-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[INDEX_PTR1]]
; OPT-NEXT: [[ELEMENT2:%.*]] = load i8, i8 addrspace(1)* [[TMP5]] ; OPT-NEXT: [[ELEMENT2:%.*]] = load i8, i8 addrspace(1)* [[TMP5]], align 1
; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR1]] ; OPT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST]], i64 [[INDEX_PTR1]]
; OPT-NEXT: store i8 [[ELEMENT2]], i8 addrspace(1)* [[TMP6]] ; OPT-NEXT: store i8 [[ELEMENT2]], i8 addrspace(1)* [[TMP6]], align 1
; OPT-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1 ; OPT-NEXT: [[INDEX_INCREMENT]] = add i64 [[INDEX_PTR1]], 1
; OPT-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1025 ; OPT-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_INCREMENT]], 1025
; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]] ; OPT-NEXT: br i1 [[TMP7]], label [[MEMMOVE_DONE]], label [[COPY_FORWARD_LOOP]]
@ -95,7 +95,7 @@ define amdgpu_kernel void @min_size_large_static_memset_caller0(i8 addrspace(1)*
; OPT: loadstoreloop: ; OPT: loadstoreloop:
; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ] ; OPT-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP3:%.*]], [[LOADSTORELOOP]] ]
; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]] ; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[TMP1]]
; OPT-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]] ; OPT-NEXT: store i8 [[VAL:%.*]], i8 addrspace(1)* [[TMP2]], align 1
; OPT-NEXT: [[TMP3]] = add i64 [[TMP1]], 1 ; OPT-NEXT: [[TMP3]] = add i64 [[TMP1]], 1
; OPT-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1025 ; OPT-NEXT: [[TMP4:%.*]] = icmp ult i64 [[TMP3]], 1025
; OPT-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]] ; OPT-NEXT: br i1 [[TMP4]], label [[LOADSTORELOOP]], label [[SPLIT]]
@ -113,9 +113,9 @@ define amdgpu_kernel void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 add
; OPT: loop-memcpy-expansion: ; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]] ; OPT-NEXT: [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]] ; OPT-NEXT: store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]], align 1
; OPT-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[N]] ; OPT-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[N]]
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]]
@ -133,9 +133,9 @@ define amdgpu_kernel void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 add
; OPT: loop-memcpy-expansion: ; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]] ; OPT-NEXT: [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]] ; OPT-NEXT: store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]], align 1
; OPT-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[N]] ; OPT-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[N]]
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]]
@ -153,9 +153,9 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0,
; OPT: loop-memcpy-expansion2: ; OPT: loop-memcpy-expansion2:
; OPT-NEXT: [[LOOP_INDEX3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION2]] ] ; OPT-NEXT: [[LOOP_INDEX3:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION2]] ]
; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX3]] ; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX3]]
; OPT-NEXT: [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]] ; OPT-NEXT: [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST0:%.*]], i64 [[LOOP_INDEX3]] ; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST0:%.*]], i64 [[LOOP_INDEX3]]
; OPT-NEXT: store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]] ; OPT-NEXT: store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]], align 1
; OPT-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX3]], 1 ; OPT-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX3]], 1
; OPT-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[N]] ; OPT-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[N]]
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION2]], label [[POST_LOOP_MEMCPY_EXPANSION1]] ; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION2]], label [[POST_LOOP_MEMCPY_EXPANSION1]]
@ -165,9 +165,9 @@ define amdgpu_kernel void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0,
; OPT: loop-memcpy-expansion: ; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION1]] ], [ [[TMP11:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[POST_LOOP_MEMCPY_EXPANSION1]] ], [ [[TMP11:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP9:%.*]] = load i8, i8 addrspace(1)* [[TMP8]] ; OPT-NEXT: [[TMP9:%.*]] = load i8, i8 addrspace(1)* [[TMP8]], align 1
; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST1:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST1:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store i8 [[TMP9]], i8 addrspace(1)* [[TMP10]] ; OPT-NEXT: store i8 [[TMP9]], i8 addrspace(1)* [[TMP10]], align 1
; OPT-NEXT: [[TMP11]] = add i64 [[LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP11]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP12:%.*]] = icmp ult i64 [[TMP11]], [[M]] ; OPT-NEXT: [[TMP12:%.*]] = icmp ult i64 [[TMP11]], [[M]]
; OPT-NEXT: br i1 [[TMP12]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; OPT-NEXT: br i1 [[TMP12]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]]
@ -186,9 +186,9 @@ define amdgpu_kernel void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3
; OPT: loop-memcpy-expansion: ; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i32 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[SRC:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(3)* [[SRC:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP3:%.*]] = load i8, i8 addrspace(3)* [[TMP2]] ; OPT-NEXT: [[TMP3:%.*]] = load i8, i8 addrspace(3)* [[TMP2]], align 1
; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i32 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST:%.*]], i32 [[LOOP_INDEX]]
; OPT-NEXT: store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]] ; OPT-NEXT: store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]], align 1
; OPT-NEXT: [[TMP5]] = add i32 [[LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP5]] = add i32 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], [[N]] ; OPT-NEXT: [[TMP6:%.*]] = icmp ult i32 [[TMP5]], [[N]]
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]]
@ -207,9 +207,9 @@ define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(i8 addrspace
; OPT: loop-memcpy-expansion: ; OPT: loop-memcpy-expansion:
; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION]] ] ; OPT-NEXT: [[LOOP_INDEX:%.*]] = phi i64 [ 0, [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[LOOP_MEMCPY_EXPANSION]] ]
; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[SRC:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]] ; OPT-NEXT: [[TMP3:%.*]] = load i8, i8 addrspace(1)* [[TMP2]], align 1
; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST0:%.*]], i64 [[LOOP_INDEX]] ; OPT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, i8 addrspace(1)* [[DST0:%.*]], i64 [[LOOP_INDEX]]
; OPT-NEXT: store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]] ; OPT-NEXT: store i8 [[TMP3]], i8 addrspace(1)* [[TMP4]], align 1
; OPT-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX]], 1 ; OPT-NEXT: [[TMP5]] = add i64 [[LOOP_INDEX]], 1
; OPT-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[N]] ; OPT-NEXT: [[TMP6:%.*]] = icmp ult i64 [[TMP5]], [[N]]
; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]] ; OPT-NEXT: br i1 [[TMP6]], label [[LOOP_MEMCPY_EXPANSION]], label [[POST_LOOP_MEMCPY_EXPANSION]]