AMDGPU/GlobalISel: Implement custom kernel arg lowering

Avoid using allocateKernArg / AssignFn. We do not want any
of the type splitting properties of normal calling convention
lowering.

For now at least this exists alongside the IR argument lowering
pass. This is necessary to handle struct padding correctly while
some arguments are still skipped by the IR argument lowering
pass.

llvm-svn: 336373
This commit is contained in:
Matt Arsenault 2018-07-05 17:01:20 +00:00
parent 5ba7266761
commit 29f303799b
6 changed files with 829 additions and 52 deletions

View File

@ -43,7 +43,7 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
Type *ParamTy,
unsigned Offset) const {
uint64_t Offset) const {
MachineFunction &MF = MIRBuilder.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@ -66,7 +66,8 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
}
void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
Type *ParamTy, unsigned Offset,
Type *ParamTy, uint64_t Offset,
unsigned Align,
unsigned DstReg) const {
MachineFunction &MF = MIRBuilder.getMF();
const Function &F = MF.getFunction();
@ -74,7 +75,6 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
unsigned Align = DL.getABITypeAlignment(ParamTy);
unsigned PtrReg = lowerParameterPtr(MIRBuilder, ParamTy, Offset);
MachineMemOperand *MMO =
@ -95,7 +95,7 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
return false;
MachineFunction &MF = MIRBuilder.getMF();
const SISubtarget *Subtarget = static_cast<const SISubtarget *>(&MF.getSubtarget());
const SISubtarget *Subtarget = &MF.getSubtarget<SISubtarget>();
MachineRegisterInfo &MRI = MF.getRegInfo();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const SIRegisterInfo *TRI = MF.getSubtarget<SISubtarget>().getRegisterInfo();
@ -145,6 +145,36 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
CCInfo.AllocateReg(FlatScratchInitReg);
}
// The infrastructure for normal calling convention lowering is essentially
// useless for kernels. We want to avoid any kind of legalization or argument
// splitting.
if (F.getCallingConv() == CallingConv::AMDGPU_KERNEL) {
unsigned i = 0;
const unsigned KernArgBaseAlign = 16;
const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
uint64_t ExplicitArgOffset = 0;
// TODO: Align down to dword alignment and extract bits for extending loads.
for (auto &Arg : F.args()) {
Type *ArgTy = Arg.getType();
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
if (AllocSize == 0)
continue;
unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
lowerParameter(MIRBuilder, ArgTy, ArgOffset, Align, VRegs[i]);
++i;
}
return true;
}
unsigned NumArgs = F.arg_size();
Function::const_arg_iterator CurOrigArg = F.arg_begin();
const AMDGPUTargetLowering &TLI = *getTLI<AMDGPUTargetLowering>();
@ -216,13 +246,5 @@ bool AMDGPUCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
return true;
}
for (unsigned i = 0; i != ArgLocs.size(); ++i, ++Arg) {
// FIXME: We should be getting DebugInfo from the arguments some how.
CCValAssign &VA = ArgLocs[i];
lowerParameter(MIRBuilder, Arg->getType(),
VA.getLocMemOffset() +
Subtarget->getExplicitKernelArgOffset(F), VRegs[i]);
}
return true;
return false;
}

View File

@ -26,10 +26,11 @@ class AMDGPUCallLowering: public CallLowering {
AMDGPUAS AMDGPUASI;
unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
unsigned Offset) const;
uint64_t Offset) const;
void lowerParameter(MachineIRBuilder &MIRBuilder, Type *ParamTy,
unsigned Offset, unsigned DstReg) const;
uint64_t Offset, unsigned Align,
unsigned DstReg) const;
public:
AMDGPUCallLowering(const AMDGPUTargetLowering &TLI);

View File

@ -85,11 +85,6 @@ def RetCC_SI_Shader : CallingConv<[
]>>
]>;
// Calling convention for compute kernels
def CC_AMDGPU_Kernel : CallingConv<[
CCCustom<"allocateKernArg">
]>;
def CSR_AMDGPU_VGPRs_24_255 : CalleeSavedRegs<
(sequence "VGPR%u", 24, 255)
>;
@ -137,16 +132,6 @@ def RetCC_AMDGPU_Func : CallingConv<[
]>;
def CC_AMDGPU : CallingConv<[
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >="
"AMDGPUSubtarget::SOUTHERN_ISLANDS && "
"!AMDGPU::isShader(State.getCallingConv())",
CCDelegateTo<CC_AMDGPU_Kernel>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() < "
"AMDGPUSubtarget::SOUTHERN_ISLANDS && "
"!AMDGPU::isShader(State.getCallingConv())",
CCDelegateTo<CC_AMDGPU_Kernel>>,
CCIf<"static_cast<const AMDGPUSubtarget&>"
"(State.getMachineFunction().getSubtarget()).getGeneration() >= "
"AMDGPUSubtarget::SOUTHERN_ISLANDS",

View File

@ -843,7 +843,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForCall(CallingConv::ID CC,
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
return CC_AMDGPU_Kernel;
llvm_unreachable("kernels should not be handled here");
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:
@ -866,7 +866,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC,
switch (CC) {
case CallingConv::AMDGPU_KERNEL:
case CallingConv::SPIR_KERNEL:
return CC_AMDGPU_Kernel;
llvm_unreachable("kernels should not be handled here");
case CallingConv::AMDGPU_VS:
case CallingConv::AMDGPU_GS:
case CallingConv::AMDGPU_PS:

View File

@ -0,0 +1,723 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; REQUIRES: global-isel
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -O0 -amdgpu-ir-lower-kernel-arguments=0 -stop-after=irtranslator -global-isel %s -o - | FileCheck -check-prefix=HSA-VI %s
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
; HSA-VI-LABEL: name: i8_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
%ext = zext i8 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
; HSA-VI-LABEL: name: i8_zext_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s8)
; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
%ext = zext i8 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
; HSA-VI-LABEL: name: i8_sext_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s8)
; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
%ext = sext i8 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
; HSA-VI-LABEL: name: i16_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
%ext = zext i16 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
; HSA-VI-LABEL: name: i16_zext_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s16)
; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
%ext = zext i16 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
; HSA-VI-LABEL: name: i16_sext_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s16) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `i16 addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s16)
; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
%ext = sext i16 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
; HSA-VI-LABEL: name: i32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `i32 addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store i32 %in, i32 addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
; HSA-VI-LABEL: name: f32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `float addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `float addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store float %in, float addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
; HSA-VI-LABEL: name: v2i8_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 2 from `<2 x i8> addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<2 x s8>), [[LOAD]](p1) :: (store 2 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <2 x i8> %in, <2 x i8> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
; HSA-VI-LABEL: name: v2i16_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `<2 x i16> addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<2 x s16>), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <2 x i16> %in, <2 x i16> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
; HSA-VI-LABEL: name: v2i32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<2 x i32> addrspace(4)* undef`, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store 8 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
; HSA-VI-LABEL: name: v2f32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<2 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<2 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<2 x float> addrspace(4)* undef`, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<2 x s32>), [[LOAD]](p1) :: (store 8 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
; HSA-VI-LABEL: name: v3i8_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 3 from `<3 x i8> addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<3 x s8>), [[LOAD]](p1) :: (store 3 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
; HSA-VI-LABEL: name: v3i16_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 6 from `<3 x i16> addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<3 x s16>), [[LOAD]](p1) :: (store 6 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
; HSA-VI-LABEL: name: v3i32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 12 from `<3 x i32> addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store 12 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
; HSA-VI-LABEL: name: v3f32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<3 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<3 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 12 from `<3 x float> addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<3 x s32>), [[LOAD]](p1) :: (store 12 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
; HSA-VI-LABEL: name: v4i8_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 4 from `<4 x i8> addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<4 x s8>), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <4 x i8> %in, <4 x i8> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
; HSA-VI-LABEL: name: v4i16_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<4 x i16> addrspace(4)* undef`, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<4 x s16>), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <4 x i16> %in, <4 x i16> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
; HSA-VI-LABEL: name: v4i32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store 16 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
; HSA-VI-LABEL: name: v4f32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<4 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<4 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<4 x float> addrspace(4)* undef`, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<4 x s32>), [[LOAD]](p1) :: (store 16 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
; HSA-VI-LABEL: name: v8i8_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `<8 x i8> addrspace(4)* undef`, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<8 x s8>), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <8 x i8> %in, <8 x i8> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
; HSA-VI-LABEL: name: v8i16_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<8 x i16> addrspace(4)* undef`, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<8 x s16>), [[LOAD]](p1) :: (store 16 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <8 x i16> %in, <8 x i16> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
; HSA-VI-LABEL: name: v8i32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 32 from `<8 x i32> addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store 32 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
; HSA-VI-LABEL: name: v8f32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<8 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<8 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 32 from `<8 x float> addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<8 x s32>), [[LOAD]](p1) :: (store 32 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
; HSA-VI-LABEL: name: v16i8_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x i8> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s8>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 16 from `<16 x i8> addrspace(4)* undef`, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<16 x s8>), [[LOAD]](p1) :: (store 16 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <16 x i8> %in, <16 x i8> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
; HSA-VI-LABEL: name: v16i16_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x i16> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s16>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 32 from `<16 x i16> addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<16 x s16>), [[LOAD]](p1) :: (store 32 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <16 x i16> %in, <16 x i16> addrspace(1)* %out
ret void
}
define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
; HSA-VI-LABEL: name: v16i32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x i32> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 64 from `<16 x i32> addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store 64 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
; HSA-VI-LABEL: name: v16f32_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `<16 x float> addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 64
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(<16 x s32>) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 64 from `<16 x float> addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](<16 x s32>), [[LOAD]](p1) :: (store 64 into %ir.out, align 4, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
; HSA-VI-LABEL: name: kernel_arg_i64
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(4)* undef`, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
store i64 %a, i64 addrspace(1)* %out, align 8
ret void
}
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
; HSA-VI-LABEL: name: f64_kernel_arg
; HSA-VI: bb.1.entry:
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `double addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 8 from `double addrspace(4)* undef`, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
entry:
store double %in, double addrspace(1)* %out
ret void
}
define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
; HSA-VI-LABEL: name: i1_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i1 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: G_STORE [[LOAD1]](s1), [[LOAD]](p1) :: (store 1 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
store i1 %x, i1 addrspace(1)* %out, align 1
ret void
}
define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
; HSA-VI-LABEL: name: i1_arg_zext_i32
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD1]](s1)
; HSA-VI: G_STORE [[ZEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
%ext = zext i1 %x to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
; HSA-VI-LABEL: name: i1_arg_zext_i64
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[LOAD1]](s1)
; HSA-VI: G_STORE [[ZEXT]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
%ext = zext i1 %x to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
ret void
}
define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
; HSA-VI-LABEL: name: i1_arg_sext_i32
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i32 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD1]](s1)
; HSA-VI: G_STORE [[SEXT]](s32), [[LOAD]](p1) :: (store 4 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
%ext = sext i1 %x to i32
store i32 %ext, i32addrspace(1)* %out, align 4
ret void
}
define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
; HSA-VI-LABEL: name: i1_arg_sext_i64
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(p1) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 8 from `i64 addrspace(1)* addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s1) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i1 addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[LOAD1]](s1)
; HSA-VI: G_STORE [[SEXT]](s64), [[LOAD]](p1) :: (store 8 into %ir.out, addrspace 1)
; HSA-VI: S_ENDPGM
%ext = sext i1 %x to i64
store i64 %ext, i64 addrspace(1)* %out, align 8
ret void
}
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
; HSA-VI-LABEL: name: empty_struct_arg
; HSA-VI: bb.1 (%ir-block.0):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: S_ENDPGM
ret void
}
; The correct load offsets for these:
; load 4 from 0,
; load 8 from 8
; load 4 from 24
; load 8 from 32
; With the SelectionDAG argument lowering, the alignments for the
; struct members is not properly considered, making these wrong.
; FIXME: GlobalISel extractvalue emission broken
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
; %val0 = extractvalue {i32, i64} %arg0, 0
; %val1 = extractvalue {i32, i64} %arg0, 1
; %val2 = extractvalue {i32, i64} %arg1, 0
; %val3 = extractvalue {i32, i64} %arg1, 1
; store volatile i32 %val0, i32 addrspace(1)* null
; store volatile i64 %val1, i64 addrspace(1)* null
; store volatile i32 %val2, i32 addrspace(1)* null
; store volatile i64 %val3, i64 addrspace(1)* null
; HSA-VI-LABEL: name: struct_argument_alignment
; HSA-VI: bb.1 (%ir-block.1):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 16 from `{ i32, i64 } addrspace(4)* undef`, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 16
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 24
; HSA-VI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64)
; HSA-VI: [[LOAD2:%[0-9]+]]:_(s128) = G_LOAD [[GEP2]](p4) :: (non-temporal invariant load 16 from `{ i32, i64 } addrspace(4)* undef`, align 8, addrspace 4)
; HSA-VI: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](s128), 0
; HSA-VI: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD]](s128), 64
; HSA-VI: [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](s128), 0
; HSA-VI: [[EXTRACT3:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD2]](s128), 64
; HSA-VI: S_ENDPGM
ret void
}
; No padding between i8 and next struct, but round up at end to 4 byte
; multiple.
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
; %val0 = extractvalue <{i32, i64}> %arg0, 0
; %val1 = extractvalue <{i32, i64}> %arg0, 1
; %val2 = extractvalue <{i32, i64}> %arg1, 0
; %val3 = extractvalue <{i32, i64}> %arg1, 1
; store volatile i32 %val0, i32 addrspace(1)* null
; store volatile i64 %val1, i64 addrspace(1)* null
; store volatile i32 %val2, i32 addrspace(1)* null
; store volatile i64 %val3, i64 addrspace(1)* null
; HSA-VI-LABEL: name: packed_struct_argument_alignment
; HSA-VI: bb.1 (%ir-block.1):
; HSA-VI: liveins: $sgpr4_sgpr5
; HSA-VI: [[COPY:%[0-9]+]]:_(p4) = COPY $sgpr4_sgpr5
; HSA-VI: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
; HSA-VI: [[GEP:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C]](s64)
; HSA-VI: [[LOAD:%[0-9]+]]:_(s96) = G_LOAD [[GEP]](p4) :: (non-temporal invariant load 12 from `<{ i32, i64 }> addrspace(4)* undef`, align 16, addrspace 4)
; HSA-VI: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 12
; HSA-VI: [[GEP1:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C1]](s64)
; HSA-VI: [[LOAD1:%[0-9]+]]:_(s8) = G_LOAD [[GEP1]](p4) :: (non-temporal invariant load 1 from `i8 addrspace(4)* undef`, align 4, addrspace 4)
; HSA-VI: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 13
; HSA-VI: [[GEP2:%[0-9]+]]:_(p4) = G_GEP [[COPY]], [[C2]](s64)
; HSA-VI: [[LOAD2:%[0-9]+]]:_(s96) = G_LOAD [[GEP2]](p4) :: (non-temporal invariant load 12 from `<{ i32, i64 }> addrspace(4)* undef`, align 1, addrspace 4)
; HSA-VI: [[EXTRACT:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD]](s96), 0
; HSA-VI: [[EXTRACT1:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD]](s96), 32
; HSA-VI: [[EXTRACT2:%[0-9]+]]:_(s32) = G_EXTRACT [[LOAD2]](s96), 0
; HSA-VI: [[EXTRACT3:%[0-9]+]]:_(s64) = G_EXTRACT [[LOAD2]](s96), 32
; HSA-VI: S_ENDPGM
ret void
}

View File

@ -14,12 +14,9 @@
; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
entry:
%0 = zext i8 %in to i32
store i32 %0, i32 addrspace(1)* %out, align 4
%ext = zext i8 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
@ -33,9 +30,8 @@ entry:
; HSA-VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x8
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xff
define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
entry:
%0 = zext i8 %in to i32
store i32 %0, i32 addrspace(1)* %out, align 4
%ext = zext i8 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
@ -51,9 +47,8 @@ entry:
; HSA-VI: s_sext_i32_i8 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
entry:
%0 = sext i8 %in to i32
store i32 %0, i32 addrspace(1)* %out, align 4
%ext = sext i8 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
@ -71,9 +66,8 @@ entry:
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
entry:
%0 = zext i16 %in to i32
store i32 %0, i32 addrspace(1)* %out, align 4
%ext = zext i16 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
@ -89,9 +83,8 @@ entry:
; HSA-VI: s_and_b32 s{{[0-9]+}}, [[VAL]], 0xffff{{$}}
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
entry:
%0 = zext i16 %in to i32
store i32 %0, i32 addrspace(1)* %out, align 4
%ext = zext i16 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
@ -108,9 +101,8 @@ entry:
; HSA-VI: s_sext_i32_i16 s{{[0-9]+}}, [[VAL]]
; HSA-VI: flat_store_dword
define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
entry:
%0 = sext i16 %in to i32
store i32 %0, i32 addrspace(1)* %out, align 4
%ext = sext i16 %in to i32
store i32 %ext, i32 addrspace(1)* %out, align 4
ret void
}
@ -657,3 +649,57 @@ define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwi
store i64 %ext, i64 addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}empty_struct_arg:
; HSA: kernarg_segment_byte_size = 0
define amdgpu_kernel void @empty_struct_arg({} %in) nounwind {
ret void
}
; The correct load offsets for these:
; load 4 from 0,
; load 8 from 8
; load 4 from 24
; load 8 from 32
; With the SelectionDAG argument lowering, the alignments for the
; struct members is not properly considered, making these wrong.
; FIXME: Total argument size is computed wrong
; FUNC-LABEL: {{^}}struct_argument_alignment:
; HSA: kernarg_segment_byte_size = 40
; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x18
; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @struct_argument_alignment({i32, i64} %arg0, i8, {i32, i64} %arg1) {
%val0 = extractvalue {i32, i64} %arg0, 0
%val1 = extractvalue {i32, i64} %arg0, 1
%val2 = extractvalue {i32, i64} %arg1, 0
%val3 = extractvalue {i32, i64} %arg1, 1
store volatile i32 %val0, i32 addrspace(1)* null
store volatile i64 %val1, i64 addrspace(1)* null
store volatile i32 %val2, i32 addrspace(1)* null
store volatile i64 %val3, i64 addrspace(1)* null
ret void
}
; No padding between i8 and next struct, but round up at end to 4 byte
; multiple.
; FUNC-LABEL: {{^}}packed_struct_argument_alignment:
; HSA: kernarg_segment_byte_size = 28
; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x4
; HSA: s_load_dword s{{[0-9]+}}, s[4:5], 0xc
; HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @packed_struct_argument_alignment(<{i32, i64}> %arg0, i8, <{i32, i64}> %arg1) {
%val0 = extractvalue <{i32, i64}> %arg0, 0
%val1 = extractvalue <{i32, i64}> %arg0, 1
%val2 = extractvalue <{i32, i64}> %arg1, 0
%val3 = extractvalue <{i32, i64}> %arg1, 1
store volatile i32 %val0, i32 addrspace(1)* null
store volatile i64 %val1, i64 addrspace(1)* null
store volatile i32 %val2, i32 addrspace(1)* null
store volatile i64 %val3, i64 addrspace(1)* null
ret void
}