[AMDGPU] Generate range metadata for workitem id

If the workgroup size is known, inform LLVM about the range returned by
local id and local size queries.

Differential Revision: https://reviews.llvm.org/D31804

llvm-svn: 300102
This commit is contained in:
Stanislav Mekhanoshin 2017-04-12 20:48:56 +00:00
parent 04aee46779
commit c90347d760
20 changed files with 259 additions and 79 deletions

View File

@ -55,7 +55,7 @@ ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nul
void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
extern char &AMDGPUAnnotateKernelFeaturesID;
ModulePass *createAMDGPULowerIntrinsicsPass();
ModulePass *createAMDGPULowerIntrinsicsPass(const TargetMachine *TM = nullptr);
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;

View File

@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@ -23,10 +24,16 @@ namespace {
const unsigned MaxStaticSize = 1024;
class AMDGPULowerIntrinsics : public ModulePass {
private:
const TargetMachine *TM;
bool makeLIDRangeMetadata(Function &F) const;
public:
static char ID;
AMDGPULowerIntrinsics() : ModulePass(ID) { }
AMDGPULowerIntrinsics(const TargetMachine *TM = nullptr)
: ModulePass(ID), TM(TM) { }
bool runOnModule(Module &M) override;
StringRef getPassName() const override {
return "AMDGPU Lower Intrinsics";
@ -39,8 +46,8 @@ char AMDGPULowerIntrinsics::ID = 0;
char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
"Lower intrinsics", false, false)
INITIALIZE_TM_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
"Lower intrinsics", false, false)
// TODO: Should refine based on estimated number of accesses (e.g. does it
// require splitting based on alignment)
@ -96,6 +103,23 @@ static bool expandMemIntrinsicUses(Function &F) {
return Changed;
}
/// Attach !range metadata to every direct call of the intrinsic \p F.
///
/// \p F is the declaration of one of the workitem id / local size intrinsics
/// dispatched from runOnModule. Each CallInst user of \p F is handed to
/// AMDGPUSubtarget::makeLIDRangeMetadata, which computes and sets the range.
///
/// \returns true if metadata was set on at least one call; false when no
/// TargetMachine is available or nothing was annotated.
bool AMDGPULowerIntrinsics::makeLIDRangeMetadata(Function &F) const {
  if (!TM)
    return false;

  bool Changed = false;

  for (auto *U : F.users()) {
    auto *CI = dyn_cast<CallInst>(U);
    if (!CI)
      continue;

    // Look the subtarget up for the function containing the call rather than
    // for F: F is only the intrinsic declaration, while the subtarget used to
    // derive the work group size limits should be the caller's.
    Function *Caller = CI->getParent()->getParent();
    const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(*Caller);
    Changed |= ST.makeLIDRangeMetadata(CI);
  }
  return Changed;
}
bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
bool Changed = false;
@ -110,6 +134,19 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
if (expandMemIntrinsicUses(F))
Changed = true;
break;
case Intrinsic::amdgcn_workitem_id_x:
case Intrinsic::r600_read_tidig_x:
case Intrinsic::amdgcn_workitem_id_y:
case Intrinsic::r600_read_tidig_y:
case Intrinsic::amdgcn_workitem_id_z:
case Intrinsic::r600_read_tidig_z:
case Intrinsic::r600_read_local_size_x:
case Intrinsic::r600_read_local_size_y:
case Intrinsic::r600_read_local_size_z:
Changed |= makeLIDRangeMetadata(F);
break;
default:
break;
}
@ -118,6 +155,6 @@ bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
return Changed;
}
ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
return new AMDGPULowerIntrinsics();
ModulePass *llvm::createAMDGPULowerIntrinsicsPass(const TargetMachine *TM) {
return new AMDGPULowerIntrinsics(TM);
}

View File

@ -38,7 +38,6 @@
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
@ -71,7 +70,6 @@ private:
const TargetMachine *TM;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
MDNode *MaxWorkGroupSizeRange = nullptr;
AMDGPUAS AS;
// FIXME: This should be per-kernel.
@ -133,13 +131,6 @@ bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
Mod = &M;
DL = &Mod->getDataLayout();
// The maximum workitem id.
//
// FIXME: Should get as subtarget property. Usually runtime enforced max is
// 256.
MDBuilder MDB(Mod->getContext());
MaxWorkGroupSizeRange = MDB.createRange(APInt(32, 0), APInt(32, 2048));
const Triple &TT = TM->getTargetTriple();
IsAMDGCN = TT.getArch() == Triple::amdgcn;
@ -258,6 +249,9 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
std::pair<Value *, Value *>
AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
*Builder.GetInsertBlock()->getParent());
if (!IsAMDHSA) {
Function *LocalSizeYFn
= Intrinsic::getDeclaration(Mod, Intrinsic::r600_read_local_size_y);
@ -267,8 +261,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
LocalSizeY->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
LocalSizeZ->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
ST.makeLIDRangeMetadata(LocalSizeY);
ST.makeLIDRangeMetadata(LocalSizeZ);
return std::make_pair(LocalSizeY, LocalSizeZ);
}
@ -333,7 +327,7 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
MDNode *MD = MDNode::get(Mod->getContext(), None);
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
LoadZU->setMetadata(LLVMContext::MD_invariant_load, MD);
LoadZU->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
ST.makeLIDRangeMetadata(LoadZU);
// Extract y component. Upper half of LoadZU should be zero already.
Value *Y = Builder.CreateLShr(LoadXY, 16);
@ -342,6 +336,8 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
}
Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(
*Builder.GetInsertBlock()->getParent());
Intrinsic::ID IntrID = Intrinsic::ID::not_intrinsic;
switch (N) {
@ -364,7 +360,7 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
Function *WorkitemIdFn = Intrinsic::getDeclaration(Mod, IntrID);
CallInst *CI = Builder.CreateCall(WorkitemIdFn);
CI->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
ST.makeLIDRangeMetadata(CI);
return CI;
}
@ -690,8 +686,6 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
const AMDGPUSubtarget &ST =
TM->getSubtarget<AMDGPUSubtarget>(ContainingFunction);
// FIXME: We should also try to get this value from the reqd_work_group_size
// function attribute if it is available.
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
const DataLayout &DL = Mod->getDataLayout();

View File

@ -16,6 +16,7 @@
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/Target/TargetFrameLowering.h"
#include <algorithm>
@ -240,6 +241,65 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
return Requested;
}
/// Set !range metadata on \p I, a workitem id / local size intrinsic call or
/// a load producing such a value.
///
/// The upper bound defaults to the maximum flat work group size of the
/// function containing \p I. When \p I is a call to one of the per-dimension
/// id or size intrinsics and the containing function carries a three-operand
/// "reqd_work_group_size" metadata node, the bound is narrowed to the size
/// required for that dimension.
///
/// \returns true if metadata was attached, false if the computed maximum size
/// is zero and no range is set.
bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const {
  Function *Kernel = I->getParent()->getParent();
  unsigned MinSize = 0;
  unsigned MaxSize = getFlatWorkGroupSizes(*Kernel).second;
  bool IdQuery = false;

  // If reqd_work_group_size is present it narrows value down.
  if (auto *CI = dyn_cast<CallInst>(I)) {
    if (const Function *F = CI->getCalledFunction()) {
      // Dimension queried by the intrinsic; stays UINT_MAX for anything that
      // is not a recognized id/size query.
      unsigned Dim = UINT_MAX;

      switch (F->getIntrinsicID()) {
      case Intrinsic::amdgcn_workitem_id_x:
      case Intrinsic::r600_read_tidig_x:
        IdQuery = true;
        Dim = 0;
        break;
      case Intrinsic::r600_read_local_size_x:
        Dim = 0;
        break;
      case Intrinsic::amdgcn_workitem_id_y:
      case Intrinsic::r600_read_tidig_y:
        IdQuery = true;
        Dim = 1;
        break;
      case Intrinsic::r600_read_local_size_y:
        Dim = 1;
        break;
      case Intrinsic::amdgcn_workitem_id_z:
      case Intrinsic::r600_read_tidig_z:
        IdQuery = true;
        Dim = 2;
        break;
      case Intrinsic::r600_read_local_size_z:
        Dim = 2;
        break;
      default:
        break;
      }

      // Only indices 0..2 are valid operands of the three-element node.
      if (Dim < 3) {
        if (auto *Node = Kernel->getMetadata("reqd_work_group_size")) {
          if (Node->getNumOperands() == 3) {
            // Ignore an operand that is not a constant integer instead of
            // asserting; the flat work group size bound is then kept.
            if (auto *Size = mdconst::dyn_extract_or_null<ConstantInt>(
                    Node->getOperand(Dim)))
              MinSize = MaxSize = Size->getZExtValue();
          }
        }
      }
    }
  }

  if (!MaxSize)
    return false;

  // Range metadata is [Lo, Hi). An id query yields values in [0, MaxSize),
  // so MaxSize is passed as Hi. A size query can yield MaxSize itself, so Hi
  // must be MaxSize + 1.
  if (IdQuery)
    MinSize = 0;
  else
    ++MaxSize;

  MDBuilder MDB(I->getContext());
  MDNode *MaxWorkGroupSizeRange = MDB.createRange(APInt(32, MinSize),
                                                  APInt(32, MaxSize));
  I->setMetadata(LLVMContext::MD_range, MaxWorkGroupSizeRange);
  return true;
}
R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS,
const TargetMachine &TM) :
AMDGPUSubtarget(TT, GPU, FS, TM),

View File

@ -512,6 +512,9 @@ public:
/// compatible with minimum/maximum number of waves limited by flat work group
/// size, register usage, and/or lds usage.
std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
/// Creates value range metadata on a workitemid.* intrinsic call or load.
bool makeLIDRangeMetadata(Instruction *I) const;
};
class R600Subtarget final : public AMDGPUSubtarget {

View File

@ -555,12 +555,14 @@ void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() {
}
void AMDGPUPassConfig::addIRPasses() {
const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
// There is no reason to run these.
disablePass(&StackMapLivenessID);
disablePass(&FuncletLayoutID);
disablePass(&PatchableFunctionID);
addPass(createAMDGPULowerIntrinsicsPass());
addPass(createAMDGPULowerIntrinsicsPass(&TM));
// Function calls are not supported, so make sure we inline everything.
addPass(createAMDGPUAlwaysInlinePass());
@ -572,8 +574,6 @@ void AMDGPUPassConfig::addIRPasses() {
// without ever running any passes on the second.
addPass(createBarrierNoopPass());
const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
if (TM.getTargetTriple().getArch() == Triple::amdgcn) {
// TODO: May want to move later or split into an early and late one.

View File

@ -84,10 +84,10 @@ define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i1
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_add_i16_zext_to_i64:
; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -27,8 +27,6 @@
; HSA-PROMOTE: workgroup_group_segment_byte_size = 5120
; HSA-PROMOTE: .end_amd_kernel_code_t
; FIXME: These should be merged
; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x1
; HSA-PROMOTE: s_load_dword s{{[0-9]+}}, s[4:5], 0x2
; SI-PROMOTE: ds_write_b32
@ -58,9 +56,9 @@
; HSAOPT: [[LDZU:%[0-9]+]] = load i32, i32 addrspace(2)* [[GEP1]], align 4, !range !1, !invariant.load !0
; HSAOPT: [[EXTRACTY:%[0-9]+]] = lshr i32 [[LDXY]], 16
; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !1
; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !1
; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !1
; HSAOPT: [[WORKITEM_ID_X:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.x(), !range !2
; HSAOPT: [[WORKITEM_ID_Y:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.y(), !range !2
; HSAOPT: [[WORKITEM_ID_Z:%[0-9]+]] = call i32 @llvm.amdgcn.workitem.id.z(), !range !2
; HSAOPT: [[Y_SIZE_X_Z_SIZE:%[0-9]+]] = mul nuw nsw i32 [[EXTRACTY]], [[LDZU]]
; HSAOPT: [[YZ_X_XID:%[0-9]+]] = mul i32 [[Y_SIZE_X_Z_SIZE]], [[WORKITEM_ID_X]]
@ -77,9 +75,9 @@
; NOHSAOPT: call i32 @llvm.r600.read.local.size.y(), !range !0
; NOHSAOPT: call i32 @llvm.r600.read.local.size.z(), !range !0
; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !1
; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !1
; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !1
define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
%stack = alloca [5 x i32], align 4
@ -557,6 +555,8 @@ entry:
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }
; HSAOPT: !0 = !{}
; HSAOPT: !1 = !{i32 0, i32 2048}
; HSAOPT: !1 = !{i32 0, i32 257}
; HSAOPT: !2 = !{i32 0, i32 256}
; NOHSAOPT: !0 = !{i32 0, i32 2048}
; NOHSAOPT: !0 = !{i32 0, i32 257}
; NOHSAOPT: !1 = !{i32 0, i32 256}

View File

@ -50,7 +50,7 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
; GCN-LABEL: {{^}}s_ubfe_sub_i32:
; GCN: s_load_dword [[SRC:s[0-9]+]]
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
@ -128,7 +128,7 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
; GCN-LABEL: {{^}}s_sbfe_sub_i32:
; GCN: s_load_dword [[SRC:s[0-9]+]]
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], {{s[0-9]+}}
; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -150,7 +150,7 @@ define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1
; Do scalar loads into the super register we need.
; CI-LABEL: {{^}}simple_read2_v2f32_superreg_scalar_loads_align4:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
; CI-NOT: v_mov
; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}}
; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}}
; CI: s_endpgm
define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
@ -173,7 +173,7 @@ define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x
; CI-LABEL: {{^}}simple_read2_v4f32_superreg_scalar_loads_align4:
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT0:[0-9]+]]:[[REG_ELT1:[0-9]+]]{{\]}}, v{{[0-9]+}} offset1:1{{$}}
; CI-DAG: ds_read2_b32 v{{\[}}[[REG_ELT2:[0-9]+]]:[[REG_ELT3:[0-9]+]]{{\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
; CI-NOT: v_mov
; CI-NOT: v_mov {{v[0-9]+}}, {{[sv][0-9]+}}
; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}}
; CI: s_endpgm
define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {

View File

@ -234,8 +234,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %p
}
; GCN-LABEL: {{^}}flat_atomic_dec_ret_i64_offset_addr64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
@ -248,8 +248,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4
}
; GCN-LABEL: {{^}}flat_atomic_dec_noret_i64_offset_addr64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
@ -355,8 +355,8 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)*
}
; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
@ -370,8 +370,8 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace
}
; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {

View File

@ -206,8 +206,8 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)*
}
; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
@ -221,8 +221,8 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace
}
; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
@ -348,8 +348,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %p
}
; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64_offset_addr64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()
@ -362,8 +362,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4
}
; GCN-LABEL: {{^}}flat_atomic_inc_noret_i64_offset_addr64:
; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
%id = call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -45,11 +45,7 @@ entry:
; GCN-LABEL: {{^}}local_memory_two_objects:
; GCN: v_lshlrev_b32_e32 [[ADDRW:v[0-9]+]], 2, v0
; CI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
; SI: v_add_i32_e32 [[ADDRW_OFF:v[0-9]+]], vcc, 16, [[ADDRW]]
; SI-DAG: ds_write_b32 [[ADDRW]],
; SI-DAG: ds_write_b32 [[ADDRW_OFF]],
; SI-DAG: ds_write2_b32 [[ADDRW]], {{v[0-9]+}}, {{v[0-9]+}} offset1:4
; GCN: s_barrier

View File

@ -14,8 +14,8 @@ entry:
}
; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range:
; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0
; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
; CHECK-NOT: v_and_b32
; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
entry:
%id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
@ -26,8 +26,8 @@ entry:
; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range_m1:
; CHECK-NOT: v0
; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0
; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
; CHECK-NOT: v_and_b32
; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
entry:
%id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1

View File

@ -12,9 +12,9 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
; OPT: call i32 @llvm.r600.read.local.size.y(), !range !0
; OPT: call i32 @llvm.r600.read.local.size.z(), !range !0
; OPT: call i32 @llvm.r600.read.tidig.x(), !range !0
; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0
; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0
; OPT: call i32 @llvm.r600.read.tidig.x(), !range !1
; OPT: call i32 @llvm.r600.read.tidig.y(), !range !1
; OPT: call i32 @llvm.r600.read.tidig.z(), !range !1
define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
entry:
@ -295,6 +295,7 @@ define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
ret void
}
; OPT: !0 = !{i32 0, i32 2048}
; OPT: !0 = !{i32 0, i32 257}
; OPT: !1 = !{i32 0, i32 256}
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,2" }

View File

@ -12,7 +12,7 @@
; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: s_endpgm
define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
%ld.64 = load i128, i128 addrspace(1)* %in.gep
@ -56,7 +56,7 @@ define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128
; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
; GCN: s_endpgm
define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
%ld.64 = load i128, i128 addrspace(1)* %in.gep
@ -113,5 +113,7 @@ define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workgroup.id.x() #0
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

View File

@ -9,7 +9,7 @@
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@ -42,7 +42,7 @@ define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 add
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@ -58,7 +58,7 @@ define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addr
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@ -106,7 +106,7 @@ define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 add
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@ -122,7 +122,7 @@ define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@ -138,7 +138,7 @@ define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 a
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@ -156,7 +156,7 @@ define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 a
; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%id.x = tail call i32 @llvm.amdgcn.workgroup.id.x()
%in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
%out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
%ld.64 = load i64, i64 addrspace(1)* %in.gep
@ -383,5 +383,7 @@ define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 add
declare i32 @llvm.amdgcn.workitem.id.x() #0
declare i32 @llvm.amdgcn.workgroup.id.x() #0
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }

View File

@ -4,6 +4,8 @@
declare i32 @llvm.r600.read.tidig.x() #0
declare i32 @llvm.r600.read.tgid.x() #0
;EG: {{^}}shl_v2i32:
;EG: LSHL {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
@ -288,7 +290,7 @@ define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}}
define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
%tid = call i32 @llvm.r600.read.tidig.x() #0
%tid = call i32 @llvm.r600.read.tgid.x() #0
%gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
%gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
%a = load i64, i64 addrspace(1)* %gep.in

View File

@ -85,10 +85,10 @@ define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i1
; FIXME: Need to handle non-uniform case for function below (load without gep).
; GCN-LABEL: {{^}}v_test_sub_i16_zext_to_i64:
; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: flat_load_ushort [[A:v[0-9]+]]
; VI: flat_load_ushort [[B:v[0-9]+]]
; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
%tid = call i32 @llvm.amdgcn.workitem.id.x()

View File

@ -0,0 +1,83 @@
; RUN: llc -march=amdgcn < %s | FileCheck %s
; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-intrinsics < %s | FileCheck -check-prefix=OPT %s
; CHECK-NOT: and_b32
; OPT-LABEL: @zext_grp_size_128
; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !0
; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !0
; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !0
; Kernel using attribute group #0 (flat work group size "64,128") and no
; reqd_work_group_size metadata. All three workitem id calls are expected to
; receive the same range [0, 128), which makes the masks with 127 redundant
; in the generated code.
define amdgpu_kernel void @zext_grp_size_128(i32 addrspace(1)* nocapture %arg) #0 {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
%tmp1 = and i32 %tmp, 127
store i32 %tmp1, i32 addrspace(1)* %arg, align 4
%tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
%tmp3 = and i32 %tmp2, 127
%tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
%tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
%tmp6 = and i32 %tmp5, 127
%tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
ret void
}
; OPT-LABEL: @zext_grp_size_32x4x1
; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !2
; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !3
; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !4
; Kernel carrying reqd_work_group_size metadata !0 = {32, 4, 1}. Each
; workitem id call is expected to receive a per-dimension range: x in
; [0, 32), y in [0, 4) and z in [0, 1). The masks (31, 3, 1) never clear a
; bit that can be set within those ranges.
define amdgpu_kernel void @zext_grp_size_32x4x1(i32 addrspace(1)* nocapture %arg) #0 !reqd_work_group_size !0 {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
%tmp1 = and i32 %tmp, 31
store i32 %tmp1, i32 addrspace(1)* %arg, align 4
%tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
%tmp3 = and i32 %tmp2, 3
%tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
%tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
%tmp6 = and i32 %tmp5, 1
%tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
ret void
}
; OPT-LABEL: @zext_grp_size_512
; OPT: tail call i32 @llvm.amdgcn.workitem.id.x() #2, !range !5
; OPT: tail call i32 @llvm.amdgcn.workitem.id.y() #2, !range !5
; OPT: tail call i32 @llvm.amdgcn.workitem.id.z() #2, !range !5
; Kernel using attribute group #1 (flat work group size "512,512") and no
; reqd_work_group_size metadata. All three workitem id calls are expected to
; receive the range [0, 512); the masks with 65535 keep every bit such a
; value can have set.
define amdgpu_kernel void @zext_grp_size_512(i32 addrspace(1)* nocapture %arg) #1 {
bb:
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #2
%tmp1 = and i32 %tmp, 65535
store i32 %tmp1, i32 addrspace(1)* %arg, align 4
%tmp2 = tail call i32 @llvm.amdgcn.workitem.id.y() #2
%tmp3 = and i32 %tmp2, 65535
%tmp4 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
store i32 %tmp3, i32 addrspace(1)* %tmp4, align 4
%tmp5 = tail call i32 @llvm.amdgcn.workitem.id.z() #2
%tmp6 = and i32 %tmp5, 65535
%tmp7 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
store i32 %tmp6, i32 addrspace(1)* %tmp7, align 4
ret void
}
declare i32 @llvm.amdgcn.workitem.id.x() #2
declare i32 @llvm.amdgcn.workitem.id.y() #2
declare i32 @llvm.amdgcn.workitem.id.z() #2
attributes #0 = { nounwind "amdgpu-flat-work-group-size"="64,128" }
attributes #1 = { nounwind "amdgpu-flat-work-group-size"="512,512" }
attributes #2 = { nounwind readnone }
!0 = !{i32 32, i32 4, i32 1}
; OPT: !0 = !{i32 0, i32 128}
; OPT: !1 = !{i32 32, i32 4, i32 1}
; OPT: !2 = !{i32 0, i32 32}
; OPT: !3 = !{i32 0, i32 4}
; OPT: !4 = !{i32 0, i32 1}
; OPT: !5 = !{i32 0, i32 512}