forked from OSchip/llvm-project
AMDGPU: Fix offsets for < 4-byte aggregate kernel arguments
We were still using the rounded-down offset and alignment for aggregate arguments smaller than 4 bytes, even though the load-earlier-and-shift optimization does not handle aggregates because you can't trivially bitcast the loaded value. llvm-svn: 348658
This commit is contained in:
parent
cc4b6920b3
commit
b5613ecf17
|
@ -122,14 +122,17 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
|
|||
|
||||
VectorType *VT = dyn_cast<VectorType>(ArgTy);
|
||||
bool IsV3 = VT && VT->getNumElements() == 3;
|
||||
bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
|
||||
|
||||
VectorType *V4Ty = nullptr;
|
||||
|
||||
int64_t AlignDownOffset = alignDown(EltOffset, 4);
|
||||
int64_t OffsetDiff = EltOffset - AlignDownOffset;
|
||||
unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
|
||||
unsigned AdjustedAlign = MinAlign(DoShiftOpt ? AlignDownOffset : EltOffset,
|
||||
KernArgBaseAlign);
|
||||
|
||||
Value *ArgPtr;
|
||||
if (Size < 32 && !ArgTy->isAggregateType()) { // FIXME: Handle aggregate types
|
||||
if (DoShiftOpt) { // FIXME: Handle aggregate types
|
||||
// Since we don't have sub-dword scalar loads, avoid doing an extload by
|
||||
// loading earlier than the argument address, and extracting the relevant
|
||||
// bits.
|
||||
|
@ -147,7 +150,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
|
|||
} else {
|
||||
ArgPtr = Builder.CreateConstInBoundsGEP1_64(
|
||||
KernArgSegment,
|
||||
AlignDownOffset,
|
||||
EltOffset,
|
||||
Arg.getName() + ".kernarg.offset");
|
||||
ArgPtr = Builder.CreateBitCast(ArgPtr, ArgTy->getPointerTo(AS),
|
||||
ArgPtr->getName() + ".cast");
|
||||
|
@ -198,7 +201,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
|
|||
|
||||
// TODO: Convert noalias arg to !noalias
|
||||
|
||||
if (Size < 32 && !ArgTy->isAggregateType()) {
|
||||
if (DoShiftOpt) {
|
||||
Value *ExtractBits = OffsetDiff == 0 ?
|
||||
Load : Builder.CreateLShr(Load, OffsetDiff * 8);
|
||||
|
||||
|
|
|
@ -739,10 +739,10 @@ define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32,
|
|||
; multiple.
|
||||
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
|
||||
; HSA-GFX9: kernarg_segment_byte_size = 28
|
||||
; HSA-GFX9: global_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:13
|
||||
; HSA-GFX9: global_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:17
|
||||
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
|
||||
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
|
||||
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
|
||||
; HSA-GFX9: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
|
||||
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
|
||||
%val0 = extractvalue <{i32, i64}> %arg0, 0
|
||||
%val1 = extractvalue <{i32, i64}> %arg0, 1
|
||||
|
@ -789,10 +789,18 @@ define amdgpu_kernel void @array_3xi32(i16 %arg0, [3 x i32] %arg1) {
|
|||
; FIXME: Why not all scalar loads?
|
||||
; GCN-LABEL: {{^}}array_3xi16:
|
||||
; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:2
|
||||
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
|
||||
; HSA-GFX9: s_load_dword s{{[0-9]+}}, s[4:5], 0x4
|
||||
; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:4
|
||||
; HSA-GFX9: global_load_ushort v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:6
|
||||
define amdgpu_kernel void @array_3xi16(i8 %arg0, [3 x i16] %arg1) {
|
||||
store volatile i8 %arg0, i8 addrspace(1)* undef
|
||||
store volatile [3 x i16] %arg1, [3 x i16] addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}small_array_round_down_offset:
|
||||
; HSA-GFX9: global_load_ubyte v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, off offset:1
|
||||
define amdgpu_kernel void @small_array_round_down_offset(i8, [1 x i8] %arg) {
|
||||
%val = extractvalue [1 x i8] %arg, 0
|
||||
store volatile i8 %val, i8 addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue