[AMDGPU] Generate range metadata for workitem id

If the workgroup size is known, inform LLVM about the range returned by the local id and local size queries.

Differential Revision: https://reviews.llvm.org/D31804

llvm-svn: 300102
parent 04aee46779
commit c90347d760
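To make the intent concrete, here is a minimal LLVM IR sketch (a hypothetical input, not taken from the commit) of what the change enables; the attribute string and the resulting !range values mirror the new zext-lid test added at the end of this commit:

; Hypothetical kernel whose flat workgroup size is bounded at 128 work items.
define amdgpu_kernel void @example(i32 addrspace(1)* %out) #0 {
  %id = tail call i32 @llvm.amdgcn.workitem.id.x()
  %masked = and i32 %id, 127   ; redundant once the range below is attached
  store i32 %masked, i32 addrspace(1)* %out, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()

attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,128" }

; After -amdgpu-lower-intrinsics the call becomes
;   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
; with !0 = !{i32 0, i32 128}, so known-bits analysis can delete the 'and'.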
@@ -55,7 +55,7 @@ ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nul
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
-ModulePass *createAMDGPULowerIntrinsicsPass();
+ModulePass *createAMDGPULowerIntrinsicsPass(const TargetMachine *TM = nullptr);
 void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
 extern char &AMDGPULowerIntrinsicsID;
 
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"

@@ -23,10 +24,16 @@ namespace {
 const unsigned MaxStaticSize = 1024;
 
 class AMDGPULowerIntrinsics : public ModulePass {
+private:
+  const TargetMachine *TM;
+
+  bool makeLIDRangeMetadata(Function &F) const;
+
 public:
   static char ID;
 
-  AMDGPULowerIntrinsics() : ModulePass(ID) { }
+  AMDGPULowerIntrinsics(const TargetMachine *TM = nullptr)
+    : ModulePass(ID), TM(TM) { }
   bool runOnModule(Module &M) override;
   StringRef getPassName() const override {
     return "AMDGPU Lower Intrinsics";

@@ -39,8 +46,8 @@ char AMDGPULowerIntrinsics::ID = 0;
 
 char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
 
-INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
-                "Lower intrinsics", false, false)
+INITIALIZE_TM_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
+                   "Lower intrinsics", false, false)
 
 // TODO: Should refine based on estimated number of accesses (e.g. does it
 // require splitting based on alignment)

@@ -96,6 +103,23 @@ static bool expandMemIntrinsicUses(Function &F) {
   return Changed;
 }
 
+bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
+  if (!TM)
+    return false;
+
+  bool Changed = false;
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
+
+  for (auto *U : F.users()) {
+    auto *CI = dyn_cast<CallInst>(U);
+    if (!CI)
+      continue;
+
+    Changed |= ST.makeLIDRangeMetadata(CI);
+  }
+  return Changed;
+}
+
 bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
   bool Changed = false;
 

@@ -110,6 +134,19 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
       if (expandMemIntrinsicUses(F))
         Changed = true;
       break;
+
+    case Intrinsic::amdgcn_workitem_id_x:
+    case Intrinsic::r600_read_tidig_x:
+    case Intrinsic::amdgcn_workitem_id_y:
+    case Intrinsic::r600_read_tidig_y:
+    case Intrinsic::amdgcn_workitem_id_z:
+    case Intrinsic::r600_read_tidig_z:
+    case Intrinsic::r600_read_local_size_x:
+    case Intrinsic::r600_read_local_size_y:
+    case Intrinsic::r600_read_local_size_z:
+      Changed |= makeLIDRangeMetadata(F);
+      break;
+
     default:
       break;
     }

@@ -118,6 +155,6 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
   return Changed;
 }
 
-ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
-  return new AMDGPULowerIntrinsics();
+ModulePass *llvm::createAMDGPULowerIntrinsicsPass(const TargetMachine *TM) {
+  return new AMDGPULowerIntrinsics(TM);
 }
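The intrinsic cases added to runOnModule above cover both id queries and size queries. A small sketch of the reqd_work_group_size path (hypothetical input; the ranges match the checks in the new zext-lid test, and metadata numbering in real output may differ):

; Kernel with a required workgroup size of 32x4x1.
define amdgpu_kernel void @grp_32x4x1(i32 addrspace(1)* %out) !reqd_work_group_size !0 {
  %idx = tail call i32 @llvm.amdgcn.workitem.id.x()
  %idy = tail call i32 @llvm.amdgcn.workitem.id.y()
  store i32 %idx, i32 addrspace(1)* %out, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.amdgcn.workitem.id.y()

!0 = !{i32 32, i32 4, i32 1}

; After opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics (the same invocation
; as the new test's RUN line), each call is annotated per dimension:
;   %idx = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1  ; !1 = !{i32 0, i32 32}
;   %idy = tail call i32 @llvm.amdgcn.workitem.id.y(), !range !2  ; !2 = !{i32 0, i32 4}
; Note the pass needs a TargetMachine; makeLIDRangeMetadata() returns false
; without one, leaving the IR untouched.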
@@ -38,7 +38,6 @@
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Type.h"

@@ -71,7 +70,6 @@ private:
   const TargetMachine *TM;
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
-  MDNode *MaxWorkGroupSizeRange = nullptr;
   AMDGPUAS AS;
 
   // FIXME: This should be per-kernel.

@@ -133,13 +131,6 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
   Mod = &M;
   DL = &Mod->getDataLayout();
 
-  // The maximum workitem id.
-  //
-  // FIXME: Should get as subtarget property. Usually runtime enforced max is
-  // 256.
-  MDBuilder MDB(Mod->getContext());
-  MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
-
   const Triple &TT = TM->getTargetTriple();
 
   IsAMDGCN = TT.getArch() == Triple::amdgcn;

@@ -258,6 +249,9 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
 
 std::pair<Value *, Value *>
 AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
+      *Builder.GetInsertBlock()->getParent());
+
   if (!IsAMDHSA) {
     Function *LocalSizeYFn
       = Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);

@@ -267,8 +261,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
     CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
     CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
 
-    LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
-    LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+    ST.makeLIDRangeMetadata(LocalSizeY);
+    ST.makeLIDRangeMetadata(LocalSizeZ);
 
     return std::make_pair(LocalSizeY, LocalSizeZ);
   }

@@ -333,7 +327,7 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
   MDNode *MD = MDNode::get(Mod->getContext(), None);
   LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
   LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
-  LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+  ST.makeLIDRangeMetadata(LoadZU);
 
   // Extract y component. Upper half of LoadZU should be zero already.
   Value *Y = Builder.CreateLShr(LoadXY, 16);

@@ -342,6 +336,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
 }
 
 Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
+  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
+      *Builder.GetInsertBlock()->getParent());
   Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
 
   switch (N) {

@@ -364,7 +360,7 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
 
   Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
   CallInst *CI = Builder.CreateCall(WorkitemIdFn);
-  CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+  ST.makeLIDRangeMetadata(CI);
 
   return CI;
 }

@@ -690,8 +686,6 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   const AMDGPUSubtarget &ST =
     TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
-  // FIXME: We should also try to get this value from the reqd_work_group_size
-  // function attribute if it is available.
   unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
 
   const DataLayout &DL = Mod->getDataLayout();
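The net effect in the promote-alloca pass: the blanket [0, 2048) node built in doInitialization is gone, and ranges now come from the subtarget per query. A self-contained sketch of the metadata it now emits (values taken from the updated HSAOPT/NOHSAOPT checks below, which assume a 256 work-item bound):

define amdgpu_kernel void @sketch(i32 addrspace(1)* %out) {
  %size.y = call i32 @llvm.r600.read.local.size.y(), !range !0
  %id.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
  %sum = add i32 %size.y, %id.x
  store i32 %sum, i32 addrspace(1)* %out, align 4
  ret void
}

declare i32 @llvm.r600.read.local.size.y()
declare i32 @llvm.amdgcn.workitem.id.x()

!0 = !{i32 0, i32 257} ; size query: the value can equal the 256 maximum, so Hi = 257
!1 = !{i32 0, i32 256} ; id query: ids are strictly below the maximum, so Hi = 256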
@@ -16,6 +16,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include <algorithm>
 

@@ -240,6 +241,65 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
   return Requested;
 }
 
+bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
+  Function *Kernel = I->getParent()->getParent();
+  unsigned MinSize = 0;
+  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
+  bool IdQuery = false;
+
+  // If reqd_work_group_size is present it narrows value down.
+  if (auto *CI = dyn_cast<CallInst>(I)) {
+    const Function *F = CI->getCalledFunction();
+    if (F) {
+      unsigned Dim = UINT_MAX;
+      switch (F->getIntrinsicID()) {
+      case Intrinsic::amdgcn_workitem_id_x:
+      case Intrinsic::r600_read_tidig_x:
+        IdQuery = true;
+      case Intrinsic::r600_read_local_size_x:
+        Dim = 0;
+        break;
+      case Intrinsic::amdgcn_workitem_id_y:
+      case Intrinsic::r600_read_tidig_y:
+        IdQuery = true;
+      case Intrinsic::r600_read_local_size_y:
+        Dim = 1;
+        break;
+      case Intrinsic::amdgcn_workitem_id_z:
+      case Intrinsic::r600_read_tidig_z:
+        IdQuery = true;
+      case Intrinsic::r600_read_local_size_z:
+        Dim = 2;
+        break;
+      default:
+        break;
+      }
+      if (Dim <= 3) {
+        if (auto Node = Kernel->getMetadata("reqd_work_group_size"))
+          if (Node->getNumOperands() == 3)
+            MinSize = MaxSize = mdconst::extract<ConstantInt>(
+                                  Node->getOperand(Dim))->getZExtValue();
+      }
+    }
+  }
+
+  if (!MaxSize)
+    return false;
+
+  // Range metadata is [Lo, Hi). For ID query we need to pass max size
+  // as Hi. For size query we need to pass Hi + 1.
+  if (IdQuery)
+    MinSize = 0;
+  else
+    ++MaxSize;
+
+  MDBuilder MDB(I->getContext());
+  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
+                                                  APInt(32, MaxSize));
+  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
+  return true;
+}
+
 R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
                              const TargetMachine &TM) :
   AMDGPUSubtarget(TT, GPU, FS, TM),
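A worked reading of the [Lo, Hi) rule above, for a kernel carrying !reqd_work_group_size !{i32 32, i32 4, i32 1} (the same shape the new zext-lid test checks); the size-query node here is derived from the code rather than from a test:

define amdgpu_kernel void @pinned(i32 addrspace(1)* %out) !reqd_work_group_size !0 {
  ; With a required size of 32 in x, an id query gets [0, 32) ...
  %id.x = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
  ; ... while a size query gets [32, 33), i.e. the value is exactly 32.
  %size.x = call i32 @llvm.r600.read.local.size.x(), !range !2
  %sum = add i32 %id.x, %size.x
  store i32 %sum, i32 addrspace(1)* %out, align 4
  ret void
}

declare i32 @llvm.amdgcn.workitem.id.x()
declare i32 @llvm.r600.read.local.size.x()

!0 = !{i32 32, i32 4, i32 1}
!1 = !{i32 0, i32 32}  ; id query: Hi = required size
!2 = !{i32 32, i32 33} ; size query: Hi = required size + 1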
@@ -512,6 +512,9 @@ public:
   /// compatible with minimum/maximum number of waves limited by flat work group
   /// size, register usage, and/or lds usage.
   std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
+
+  /// Creates value range metadata on an workitemid.* inrinsic call or load.
+  bool makeLIDRangeMetadata(Instruction *I) const;
 };
 
 class R600Subtarget final : public AMDGPUSubtarget {
@@ -555,12 +555,14 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
 }
 
 void AMDGPUPassConfig::addIRPasses() {
+  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+
   // There is no reason to run these.
   disablePass(&StackMapLivenessID);
   disablePass(&FuncletLayoutID);
   disablePass(&PatchableFunctionID);
 
-  addPass(createAMDGPULowerIntrinsicsPass());
+  addPass(createAMDGPULowerIntrinsicsPass(&TM));
 
   // Function calls are not supported, so make sure we inline everything.
   addPass(createAMDGPUAlwaysInlinePass());

@@ -572,8 +574,6 @@ void AMDGPUPassConfig::addIRPasses() {
   // without ever running any passes on the second.
   addPass(createBarrierNoopPass());
 
-  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
-
   if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
     // TODO: May want to move later or split into an early and late one.
 
@@ -84,10 +84,10 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
+; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -27,8 +27,6 @@
 ; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120
 ; HSA-PROMOTE: .end_amd_kernel_code_t
 
-; FIXME: These should be merged
-; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x1
 ; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x2
 
 ; SI-PROMOTE: ds_write_b32

@@ -58,9 +56,9 @@
 ; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0
 ; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16
 
-; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
-; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !1
-; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !1
+; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !2
+; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !2
+; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !2
 
 ; HSAOPT: [[Y_SIZE_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[EXTRACTY]], [[LDZU]]
 ; HSAOPT: [[YZ_X_XID:%[0-9]+]] = mul i32 [[Y_SIZE_X_Z_SIZE]], [[WORKITEM_ID_X]]

@@ -77,9 +75,9 @@
 
 ; NOHSAOPT: call i32 @llvm.r600.read.local.size.y(), !range !0
 ; NOHSAOPT: call i32 @llvm.r600.read.local.size.z(), !range !0
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
-; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !1
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !1
+; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !1
 define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4

@@ -557,6 +555,8 @@ entry:
 attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
 
 ; HSAOPT: !0 = !{}
-; HSAOPT: !1 = !{i32 0, i32 2048}
+; HSAOPT: !1 = !{i32 0, i32 257}
+; HSAOPT: !2 = !{i32 0, i32 256}
 
-; NOHSAOPT: !0 = !{i32 0, i32 2048}
+; NOHSAOPT: !0 = !{i32 0, i32 257}
+; NOHSAOPT: !1 = !{i32 0, i32 256}
@@ -50,7 +50,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
 ; GCN-LABEL: {{^}}s_ubfe_sub_i32:
 ; GCN: s_load_dword [[SRC:s[0-9]+]]
 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
 ; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
 define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()

@@ -128,7 +128,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
 ; GCN-LABEL: {{^}}s_sbfe_sub_i32:
 ; GCN: s_load_dword [[SRC:s[0-9]+]]
 ; GCN: s_load_dword [[WIDTH:s[0-9]+]]
-; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
 ; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
 define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@@ -150,7 +150,7 @@ define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1
 ; Do scalar loads into the super register we need.
 ; CI-LABEL: {{^}}simple_read2_v2f32_superreg_scalar_loads_align4:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
-; CI-NOT: v_mov
+; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}}
 ; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}}
 ; CI: s_endpgm
 define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {

@@ -173,7 +173,7 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x
 ; CI-LABEL: {{^}}simple_read2_v4f32_superreg_scalar_loads_align4:
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
 ; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
-; CI-NOT: v_mov
+; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}}
 ; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}}
 ; CI: s_endpgm
 define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
@@ -234,8 +234,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %p
 }
 
 ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
 define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()

@@ -248,8 +248,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4
 }
 
 ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
 define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()

@@ -355,8 +355,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)*
 }
 
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
 ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
 define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {

@@ -370,8 +370,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
 }
 
 ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
 ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
 define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
@@ -206,8 +206,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)*
 }
 
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
 ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
 define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {

@@ -221,8 +221,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
 }
 
 ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
 ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
 define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {

@@ -348,8 +348,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %p
 }
 
 ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
 define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()

@@ -362,8 +362,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4
 }
 
 ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset_addr64:
-; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
-; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
+; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
 define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
@@ -45,11 +45,7 @@ entry:
 ; GCN-LABEL: {{^}}local_memory_two_objects:
 ; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
 ; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
-
-; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
-
-; SI-DAG: ds_write_b32 [[ADDRW]],
-; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
+; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
 
 ; GCN: s_barrier
 
@@ -14,8 +14,8 @@ entry:
 }
 
 ; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range:
-; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0
-; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
+; CHECK-NOT: v_and_b32
+; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
 define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0

@@ -26,8 +26,8 @@ entry:
 
 ; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range_m1:
 ; CHECK-NOT: v0
-; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0
-; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
+; CHECK-NOT: v_and_b32
+; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
 define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1
@@ -12,9 +12,9 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
 ; OPT: call i32 @llvm.r600.read.local.size.y(), !range !0
 ; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0
-; OPT: call i32 @llvm.r600.read.tidig.x(), !range !0
-; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0
-; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0
+; OPT: call i32 @llvm.r600.read.tidig.x(), !range !1
+; OPT: call i32 @llvm.r600.read.tidig.y(), !range !1
+; OPT: call i32 @llvm.r600.read.tidig.z(), !range !1
 
 define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:

@@ -295,6 +295,7 @@ define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   ret void
 }
 
-; OPT: !0 = !{i32 0, i32 2048}
+; OPT: !0 = !{i32 0, i32 257}
+; OPT: !1 = !{i32 0, i32 256}
 
 attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
@@ -12,7 +12,7 @@
 ; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i128, i128 addrspace(1)* %in.gep

@@ -56,7 +56,7 @@ define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
 define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i128, i128 addrspace(1)* %in.gep

@@ -113,5 +113,7 @@ define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
@@ -9,7 +9,7 @@
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep

@@ -42,7 +42,7 @@ define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 add
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep

@@ -58,7 +58,7 @@ define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addr
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep

@@ -106,7 +106,7 @@ define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 add
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep

@@ -122,7 +122,7 @@ define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep

@@ -138,7 +138,7 @@ define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 a
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep

@@ -156,7 +156,7 @@ define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 a
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
 define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
-  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
   %ld.64 = load i64, i64 addrspace(1)* %in.gep

@@ -383,5 +383,7 @@ define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 add
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
+declare i32 @llvm.amdgcn.workgroup.id.x() #0
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
@@ -4,6 +4,8 @@
 
 declare i32 @llvm.r600.read.tidig.x() #0
 
+declare i32 @llvm.r600.read.tgid.x() #0
+
 
 ;EG: {{^}}shl_v2i32:
 ;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}

@@ -288,7 +290,7 @@ define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}}
 define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
-  %tid = call i32 @llvm.r600.read.tidig.x() #0
+  %tid = call i32 @llvm.r600.read.tgid.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
   %a = load i64, i64 addrspace(1)* %gep.in
@@ -85,10 +85,10 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
-; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
+; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
 define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -0,0 +1,83 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s
+
+; CHECK-NOT: and_b32
+
+; OPT-LABEL: @zext_grp_size_128
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !0
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !0
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !0
+define amdgpu_kernel void @zext_grp_size_128(i32 addrspace(1)* nocapture %arg) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %tmp1 = and i32 %tmp, 127
+  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+  %tmp3 = and i32 %tmp2, 127
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+  %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
+  %tmp6 = and i32 %tmp5, 127
+  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+  ret void
+}
+
+; OPT-LABEL: @zext_grp_size_32x4x1
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !2
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !3
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !4
+define amdgpu_kernel void @zext_grp_size_32x4x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %tmp1 = and i32 %tmp, 31
+  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+  %tmp3 = and i32 %tmp2, 3
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+  %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
+  %tmp6 = and i32 %tmp5, 1
+  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+  ret void
+}
+
+; OPT-LABEL: @zext_grp_size_512
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !5
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !5
+; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !5
+define amdgpu_kernel void @zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
+  %tmp1 = and i32 %tmp, 65535
+  store i32 %tmp1, i32 addrspace(1)* %arg, align 4
+  %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
+  %tmp3 = and i32 %tmp2, 65535
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
+  %tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
+  %tmp6 = and i32 %tmp5, 65535
+  %tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #2
+
+declare i32 @llvm.amdgcn.workitem.id.y() #2
+
+declare i32 @llvm.amdgcn.workitem.id.z() #2
+
+attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,128" }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
+attributes #2 = { nounwind readnone }
+
+!0 = !{i32 32, i32 4, i32 1}
+
+; OPT: !0 = !{i32 0, i32 128}
+; OPT: !1 = !{i32 32, i32 4, i32 1}
+; OPT: !2 = !{i32 0, i32 32}
+; OPT: !3 = !{i32 0, i32 4}
+; OPT: !4 = !{i32 0, i32 1}
+; OPT: !5 = !{i32 0, i32 512}