LSV: Fix adjust alloca alignment trick for AMDGPU
This was checking the hardcoded address space 0 for the stack. Additionally, this should be checking for legality with the adjusted alignment, so defer the alignment check.

Also try to split if the unaligned access isn't allowed.

llvm-svn: 342442
parent f1b0b47b2d
commit c640798597
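In outline, the reworked misalignment handling in vectorizeStoreChain ends up as below. This is a condensed, comment-annotated sketch assembled from the hunks that follow, not a verbatim excerpt; vectorizeLoadChain is changed the same way.

// Condensed view of the post-patch store path (paraphrased from the diff).
if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
  // Only an object in the alloca address space is something we control, so
  // compare against DataLayout's alloca address space rather than a
  // hardcoded addrspace 0.
  if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
    // Can't fix the alignment; try the two halves of the chain instead of
    // giving up outright.
    auto Chains = splitOddVectorElts(Chain, Sz);
    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
           vectorizeStoreChain(Chains.second, InstructionsProcessed);
  }

  // Try to raise the underlying alloca's alignment and remember the result.
  unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                 StackAdjustedAlignment,
                                                 DL, S0, nullptr, &DT);
  if (NewAlign != 0)
    Alignment = NewAlign;
}

// The legality check now runs after the adjustment, with the (possibly
// raised) alignment, and splits the chain on failure.
if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
  auto Chains = splitOddVectorElts(Chain, Sz);
  return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
         vectorizeStoreChain(Chains.second, InstructionsProcessed);
}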
@@ -954,11 +954,6 @@ bool Vectorizer::vectorizeStoreChain(
   // try again.
   unsigned EltSzInBytes = Sz / 8;
   unsigned SzInBytes = EltSzInBytes * ChainSize;
-  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
-    auto Chains = splitOddVectorElts(Chain, Sz);
-    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
-           vectorizeStoreChain(Chains.second, InstructionsProcessed);
-  }
 
   VectorType *VecTy;
   VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
@@ -991,14 +986,23 @@ bool Vectorizer::vectorizeStoreChain(
 
   // If the store is going to be misaligned, don't vectorize it.
   if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
-    if (S0->getPointerAddressSpace() != 0)
-      return false;
+    if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
+      auto Chains = splitOddVectorElts(Chain, Sz);
+      return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+             vectorizeStoreChain(Chains.second, InstructionsProcessed);
+    }
 
     unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
                                                    StackAdjustedAlignment,
                                                    DL, S0, nullptr, &DT);
-    if (NewAlign < StackAdjustedAlignment)
-      return false;
+    if (NewAlign != 0)
+      Alignment = NewAlign;
   }
 
+  if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
+    auto Chains = splitOddVectorElts(Chain, Sz);
+    return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
+           vectorizeStoreChain(Chains.second, InstructionsProcessed);
+  }
+
   BasicBlock::iterator First, Last;
@@ -1037,13 +1041,11 @@ bool Vectorizer::vectorizeStoreChain(
     }
   }
 
-  // This cast is safe because Builder.CreateStore() always creates a bona fide
-  // StoreInst.
-  StoreInst *SI = cast<StoreInst>(
-      Builder.CreateStore(Vec, Builder.CreateBitCast(S0->getPointerOperand(),
-                                                     VecTy->getPointerTo(AS))));
+  StoreInst *SI = Builder.CreateAlignedStore(
+      Vec,
+      Builder.CreateBitCast(S0->getPointerOperand(), VecTy->getPointerTo(AS)),
+      Alignment);
   propagateMetadata(SI, Chain);
-  SI->setAlignment(Alignment);
 
   eraseInstructions(Chain);
   ++NumVectorInstructions;
@@ -1102,12 +1104,6 @@ bool Vectorizer::vectorizeLoadChain(
   // try again.
   unsigned EltSzInBytes = Sz / 8;
   unsigned SzInBytes = EltSzInBytes * ChainSize;
-  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
-    auto Chains = splitOddVectorElts(Chain, Sz);
-    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
-           vectorizeLoadChain(Chains.second, InstructionsProcessed);
-  }
-
   VectorType *VecTy;
   VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
   if (VecLoadTy)
@@ -1132,18 +1128,27 @@ bool Vectorizer::vectorizeLoadChain(
 
   // If the load is going to be misaligned, don't vectorize it.
   if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
-    if (L0->getPointerAddressSpace() != 0)
-      return false;
+    if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
+      auto Chains = splitOddVectorElts(Chain, Sz);
+      return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+             vectorizeLoadChain(Chains.second, InstructionsProcessed);
+    }
 
     unsigned NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
                                                    StackAdjustedAlignment,
                                                    DL, L0, nullptr, &DT);
-    if (NewAlign < StackAdjustedAlignment)
-      return false;
+    if (NewAlign != 0)
+      Alignment = NewAlign;
 
     Alignment = NewAlign;
   }
 
+  if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
+    auto Chains = splitOddVectorElts(Chain, Sz);
+    return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
+           vectorizeLoadChain(Chains.second, InstructionsProcessed);
+  }
+
   LLVM_DEBUG({
     dbgs() << "LSV: Loads to vectorize:\n";
     for (Instruction *I : Chain)
@@ -1159,11 +1164,8 @@ bool Vectorizer::vectorizeLoadChain(
 
   Value *Bitcast =
       Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
-  // This cast is safe because Builder.CreateLoad always creates a bona fide
-  // LoadInst.
-  LoadInst *LI = cast<LoadInst>(Builder.CreateLoad(Bitcast));
+  LoadInst *LI = Builder.CreateAlignedLoad(Bitcast, Alignment);
   propagateMetadata(LI, Chain);
-  LI->setAlignment(Alignment);
 
   if (VecLoadTy) {
     SmallVector<Instruction *, 16> InstrsToErase;

@@ -64,10 +64,7 @@ define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noal
 ; ALL: alloca [128 x i32], align 16
 
 ; UNALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 1{{$}}
-
-; FIXME: Should change alignment
-; ALIGNED: load i32
-; ALIGNED: load i32
+; ALIGNED: load <2 x i32>, <2 x i32> addrspace(5)* %{{[0-9]+}}, align 4{{$}}
 define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 16, addrspace(5)
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32] addrspace(5)* %alloca, i32 0, i32 %offset
@@ -128,5 +125,84 @@ define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noa
   ret void
 }
 
-attributes #0 = { nounwind }
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32(
+; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
+; ALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
+; UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32() {
+  %alloca = alloca [8 x i32], align 1, addrspace(5)
+  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+  store i32 9, i32 addrspace(5)* %out, align 1
+  store i32 1, i32 addrspace(5)* %out.gep.1, align 1
+  store i32 23, i32 addrspace(5)* %out.gep.2, align 1
+  store i32 19, i32 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
+; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
+; ALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
+; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8() {
+  %alloca = alloca [8 x i8], align 1, addrspace(5)
+  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
+
+  store i8 9, i8 addrspace(5)* %out, align 1
+  store i8 1, i8 addrspace(5)* %out.gep.1, align 1
+  store i8 23, i8 addrspace(5)* %out.gep.2, align 1
+  store i8 19, i8 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i32(
+; ALIGNED: %alloca = alloca [8 x i32], align 4, addrspace(5)
+; ALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i32], align 1, addrspace(5)
+; UNALIGNED: load <4 x i32>, <4 x i32> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i32() {
+  %alloca = alloca [8 x i32], align 1, addrspace(5)
+  %out = bitcast [8 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
+  %out.gep.1 = getelementptr i32, i32 addrspace(5)* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32 addrspace(5)* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32 addrspace(5)* %out, i32 3
+
+  %load0 = load i32, i32 addrspace(5)* %out, align 1
+  %load1 = load i32, i32 addrspace(5)* %out.gep.1, align 1
+  %load2 = load i32, i32 addrspace(5)* %out.gep.2, align 1
+  %load3 = load i32, i32 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_load_4_vector_elts_loads_v4i8(
+; ALIGNED: %alloca = alloca [8 x i8], align 4, addrspace(5)
+; ALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 4
+
+; UNALIGNED: %alloca = alloca [8 x i8], align 1, addrspace(5)
+; UNALIGNED: load <4 x i8>, <4 x i8> addrspace(5)* %1, align 1
+define amdgpu_kernel void @merge_private_load_4_vector_elts_loads_v4i8() {
+  %alloca = alloca [8 x i8], align 1, addrspace(5)
+  %out = bitcast [8 x i8] addrspace(5)* %alloca to i8 addrspace(5)*
+  %out.gep.1 = getelementptr i8, i8 addrspace(5)* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8 addrspace(5)* %out, i8 2
+  %out.gep.3 = getelementptr i8, i8 addrspace(5)* %out, i8 3
+
+  %load0 = load i8, i8 addrspace(5)* %out, align 1
+  %load1 = load i8, i8 addrspace(5)* %out.gep.1, align 1
+  %load2 = load i8, i8 addrspace(5)* %out.gep.2, align 1
+  %load3 = load i8, i8 addrspace(5)* %out.gep.3, align 1
+  ret void
+}
+
+attributes #0 = { nounwind }
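For reference, these functions are exercised through opt's -load-store-vectorizer pass and checked with the ALL/ALIGNED/UNALIGNED FileCheck prefixes. The test file's actual RUN lines (its -mtriple and -mattr settings in particular) are not part of this hunk, so the invocation below is only an assumed example of how the ALIGNED expectations might be verified:

  opt -S -mtriple=amdgcn-amd-amdhsa -load-store-vectorizer <test>.ll | FileCheck -check-prefixes=ALL,ALIGNED <test>.ll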