AMDGPU: Add pass to lower kernel arguments to loads

This replaces most argument uses with loads, but for
now not all.

The code in SelectionDAG for calling convention lowering
is actively harmful for amdgpu_kernel. It attempts to
split the argument types into register legal types, which
results in low quality code for arbitary types. Since
all kernel arguments are passed in memory, we just want the
raw types.

I've tried a couple of methods of mitigating this in SelectionDAG,
but it's easier to just bypass this problem alltogether. It's
possible to hack around the problem in the initial lowering,
but the real problem is the DAG then expects to be able to use
CopyToReg/CopyFromReg for uses of the arguments outside the block.

Exposing the argument loads in the IR also has the advantage
that the LoadStoreVectorizer can merge them.

I'm not sure the best approach to dealing with the IR
argument list is. The patch as-is just leaves the IR arguments
in place, so all the existing code will still compute the same
kernarg size and pointlessly lowers the arguments.

Arguably the frontend should emit kernels with an empty argument
list in the first place. Alternatively a dummy array could be
inserted as a single argument just to reserve space.

This does have some disadvantages. Local pointer kernel arguments can
no longer have AssertZext placed  on them as the equivalent !range
metadata is not valid on pointer  typed loads. This is mostly bad
for SI which needs to know about the known bits in order to use the
DS instruction offset, so in this case this is not done.

More importantly, this skips noalias arguments since this pass
does not yet convert this to the equivalent !alias.scope and !noalias
metadata. Producing this metadata correctly seems to be tricky,
although this logically is the same as inlining into a function which
doesn't exist. Additionally, exposing these loads to the vectorizer
may result in degraded aliasing information if a pointer load is
merged with another argument load.

I'm also not entirely sure this is preserving the current clover
ABI, although I would greatly prefer if it would stop widening
arguments and match the HSA ABI. As-is I think it is extending
< 4-byte arguments to 4-bytes but doesn't align them to 4-bytes.

llvm-svn: 335650
This commit is contained in:
Matt Arsenault 2018-06-26 19:10:00 +00:00
parent 7e991d30c0
commit 8c4a35237a
130 changed files with 3107 additions and 1535 deletions

View File

@ -73,6 +73,10 @@ ModulePass *createAMDGPULowerIntrinsicsPass();
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
extern char &AMDGPULowerIntrinsicsID;
FunctionPass *createAMDGPULowerKernelArgumentsPass();
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
extern char &AMDGPULowerKernelArgumentsID;
ModulePass *createAMDGPULowerKernelAttributesPass();
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
extern char &AMDGPULowerKernelAttributesID;

View File

@ -0,0 +1,267 @@
//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
/// \file This pass replaces accesses to kernel arguments with loads from
/// offsets from the kernarg base pointer.
//
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/DivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
using namespace llvm;
namespace {
class AMDGPULowerKernelArguments : public FunctionPass{
public:
static char ID;
AMDGPULowerKernelArguments() : FunctionPass(ID) {}
bool runOnFunction(Function &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetPassConfig>();
AU.setPreservesAll();
}
};
} // end anonymous namespace
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
return false;
auto &TPC = getAnalysis<TargetPassConfig>();
const TargetMachine &TM = TPC.getTM<TargetMachine>();
const SISubtarget &ST = TM.getSubtarget<SISubtarget>(F);
LLVMContext &Ctx = F.getParent()->getContext();
const DataLayout &DL = F.getParent()->getDataLayout();
BasicBlock &EntryBlock = *F.begin();
IRBuilder<> Builder(&*EntryBlock.begin());
SmallVector<Type *, 16> ArgTypes;
for (Argument &Arg : F.args()) {
Type *ArgTy = Arg.getType();
unsigned Size = DL.getTypeStoreSizeInBits(ArgTy);
bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
!ST.isAmdHsaOS();
// Clover seems to always pad i8/i16 to i32, but doesn't properly align
// them?
// Make sure the struct elements have correct size and alignment for ext
// args. These seem to be padded up to 4-bytes but not correctly aligned.
ArgTypes.push_back(
IsExtArg ? ArrayType::get(ArgTy, 32 / Size) : Arg.getType());
}
StructType *ArgStructTy = StructType::create(Ctx, ArgTypes, F.getName());
const StructLayout *Layout = DL.getStructLayout(ArgStructTy);
// Minimum alignment for kern segment is 16.
unsigned KernArgBaseAlign = std::max(16u, DL.getABITypeAlignment(ArgStructTy));
const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
// FIXME: Alignment is broken broken with explicit arg offset.;
const uint64_t TotalKernArgSize = BaseOffset +
ST.getKernArgSegmentSize(F, DL.getTypeAllocSize(ArgStructTy));
CallInst *KernArgSegment =
Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr,
F.getName() + ".kernarg.segment");
KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
KernArgSegment->addAttribute(AttributeList::ReturnIndex,
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
KernArgSegment->addAttribute(AttributeList::ReturnIndex,
Attribute::getWithAlignment(Ctx, KernArgBaseAlign));
Value *KernArgBase = KernArgSegment;
if (BaseOffset != 0) {
KernArgBase = Builder.CreateConstInBoundsGEP1_64(KernArgBase, BaseOffset);
KernArgBaseAlign = MinAlign(KernArgBaseAlign, BaseOffset);
}
unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
Value *CastStruct = Builder.CreateBitCast(KernArgBase,
ArgStructTy->getPointerTo(AS));
for (Argument &Arg : F.args()) {
if (Arg.use_empty())
continue;
Type *ArgTy = Arg.getType();
if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
// FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
// modes on SI to know the high bits are 0 so pointer adds don't wrap. We
// can't represent this with range metadata because it's only allowed for
// integer types.
if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
continue;
// FIXME: We can replace this with equivalent alias.scope/noalias
// metadata, but this appears to be a lot of work.
if (Arg.hasNoAliasAttr())
continue;
}
VectorType *VT = dyn_cast<VectorType>(ArgTy);
bool IsV3 = VT && VT->getNumElements() == 3;
VectorType *V4Ty = nullptr;
unsigned Size = DL.getTypeSizeInBits(ArgTy);
bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
!ST.isAmdHsaOS();
int64_t EltOffset = Layout->getElementOffset(Arg.getArgNo());
int64_t AlignDownOffset = alignDown(EltOffset, 4);
int64_t OffsetDiff = EltOffset - AlignDownOffset;
unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
Value *ArgPtr;
if (Size < 32) {
// Since we don't have sub-dword scalar loads, avoid doing an extload by
// loading earlier than the argument address, and extracting the relevant
// bits.
//
// Additionally widen any sub-dword load to i32 even if suitably aligned,
// so that CSE between different argument loads works easily.
ArgPtr = Builder.CreateConstGEP1_64(KernArgBase, AlignDownOffset);
ArgPtr = Builder.CreateBitCast(
ArgPtr,
Builder.getInt32Ty()->getPointerTo(AS),
Arg.getName() + ".kernarg.offset.align.down");
} else {
ArgPtr = Builder.CreateStructGEP(CastStruct, Arg.getArgNo(),
Arg.getName() + ".kernarg.offset");
}
assert((!IsExtArg || !IsV3) && "incompatible situation");
if (IsV3 && Size >= 32) {
V4Ty = VectorType::get(VT->getVectorElementType(), 4);
// Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
}
LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
MDBuilder MDB(Ctx);
if (isa<PointerType>(ArgTy)) {
if (Arg.hasNonNullAttr())
Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
uint64_t DerefBytes = Arg.getDereferenceableBytes();
if (DerefBytes != 0) {
Load->setMetadata(
LLVMContext::MD_dereferenceable,
MDNode::get(Ctx,
MDB.createConstant(
ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
}
uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
if (DerefOrNullBytes != 0) {
Load->setMetadata(
LLVMContext::MD_dereferenceable_or_null,
MDNode::get(Ctx,
MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
DerefOrNullBytes))));
}
unsigned ParamAlign = Arg.getParamAlignment();
if (ParamAlign != 0) {
Load->setMetadata(
LLVMContext::MD_align,
MDNode::get(Ctx,
MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
ParamAlign))));
}
}
// TODO: Convert noalias arg to !noalias
if (Size < 32) {
if (IsExtArg && OffsetDiff == 0) {
Type *I32Ty = Builder.getInt32Ty();
bool IsSext = Arg.hasSExtAttr();
Metadata *LowAndHigh[] = {
ConstantAsMetadata::get(
ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)),
ConstantAsMetadata::get(
ConstantInt::get(I32Ty,
IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1))
};
Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh));
}
Value *ExtractBits = OffsetDiff == 0 ?
Load : Builder.CreateLShr(Load, OffsetDiff * 8);
IntegerType *ArgIntTy = Builder.getIntNTy(Size);
Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
Arg.getName() + ".load");
Arg.replaceAllUsesWith(NewVal);
} else if (IsV3) {
Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
{0, 1, 2},
Arg.getName() + ".load");
Arg.replaceAllUsesWith(Shuf);
} else {
Load->setName(Arg.getName() + ".load");
Arg.replaceAllUsesWith(Load);
}
}
return true;
}
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
"AMDGPU Lower Kernel Arguments", false, false)
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
false, false)
char AMDGPULowerKernelArguments::ID = 0;
FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
return new AMDGPULowerKernelArguments();
}

View File

@ -130,6 +130,12 @@ static cl::opt<bool> EnableLibCallSimplify(
cl::init(true),
cl::Hidden);
static cl::opt<bool> EnableLowerKernelArguments(
"amdgpu-ir-lower-kernel-arguments",
cl::desc("Lower kernel argument loads in IR pass"),
cl::init(true),
cl::Hidden);
extern "C" void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@ -155,6 +161,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
initializeAMDGPULowerKernelArgumentsPass(*PR);
initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
@ -669,6 +676,10 @@ void AMDGPUPassConfig::addIRPasses() {
}
void AMDGPUPassConfig::addCodeGenPrepare() {
if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
TargetPassConfig::addCodeGenPrepare();
if (EnableLoadStoreVectorizer)

View File

@ -40,6 +40,7 @@ add_llvm_target(AMDGPUCodeGen
AMDGPULibCalls.cpp
AMDGPULibFunc.cpp
AMDGPULowerIntrinsics.cpp
AMDGPULowerKernelArguments.cpp
AMDGPULowerKernelAttributes.cpp
AMDGPUMachineCFGStructurizer.cpp
AMDGPUMachineFunction.cpp

View File

@ -9,11 +9,11 @@
; GCN-LABEL: {{^}}smrd0:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
define amdgpu_kernel void @smrd0(i32 addrspace(4)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 1
%1 = load i32, i32 addrspace(4)* %0
store i32 %1, i32 addrspace(1)* %out
store i32 %1, i32 addrspace(1)* undef
ret void
}
@ -21,11 +21,11 @@ entry:
; GCN-LABEL: {{^}}smrd1:
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
define amdgpu_kernel void @smrd1(i32 addrspace(4)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 255
%1 = load i32, i32 addrspace(4)* %0
store i32 %1, i32 addrspace(1)* %out
store i32 %1, i32 addrspace(1)* undef
ret void
}
@ -36,11 +36,11 @@ entry:
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
; GCN: s_endpgm
define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
define amdgpu_kernel void @smrd2(i32 addrspace(4)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 256
%1 = load i32, i32 addrspace(4)* %0
store i32 %1, i32 addrspace(1)* %out
store i32 %1, i32 addrspace(1)* undef
ret void
}
@ -51,11 +51,11 @@ entry:
; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
; TODO: Add VI checks
; XGCN: s_endpgm
define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
define amdgpu_kernel void @smrd3(i32 addrspace(4)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 ; 2 ^ 32
%1 = load i32, i32 addrspace(4)* %0
store i32 %1, i32 addrspace(1)* %out
store i32 %1, i32 addrspace(1)* undef
ret void
}
@ -65,11 +65,11 @@ entry:
; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
define amdgpu_kernel void @smrd4(i32 addrspace(4)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143
%1 = load i32, i32 addrspace(4)* %0
store i32 %1, i32 addrspace(1)* %out
store i32 %1, i32 addrspace(1)* undef
ret void
}
@ -79,11 +79,11 @@ entry:
; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
; GCN: s_endpgm
define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
define amdgpu_kernel void @smrd5(i32 addrspace(4)* %ptr) {
entry:
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144
%1 = load i32, i32 addrspace(4)* %0
store i32 %1, i32 addrspace(1)* %out
store i32 %1, i32 addrspace(1)* undef
ret void
}

View File

@ -76,7 +76,7 @@ define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out,
; SI-NOT: addc
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i32, i64 %a, i32, i64 %b) {
%add = add i64 %b, %a
%trunc = trunc i64 %add to i32
store i32 %trunc, i32 addrspace(1)* %out, align 8

View File

@ -3,7 +3,7 @@
; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
; TRAP-HANDLER-ENABLE: NumSgprs: 60
; TRAP-HANDLER-DISABLE: NumSgprs: 76
; TRAP-HANDLER-DISABLE: NumSgprs: 78
define amdgpu_kernel void @amdhsa_trap_num_sgprs(
i32 addrspace(1)* %out0, i32 %in0,
i32 addrspace(1)* %out1, i32 %in1,

View File

@ -217,7 +217,7 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out,
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i32, i64 %a) {
%and = and i64 %a, 1234567
store i64 %and, i64 addrspace(1)* %out, align 8
ret void
@ -235,7 +235,7 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64
; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
; SI-NOT: and
; SI: buffer_store_dwordx2
define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
%shl.a = shl i64 %a, 1
%shl.b = shl i64 %b, 1
%and0 = and i64 %shl.a, 62
@ -381,7 +381,7 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 a
; SI-NOT: and
; SI: s_add_u32
; SI-NEXT: s_addc_u32
define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i32, i64 %b) {
%shl = shl i64 %a, 1
%and = and i64 %shl, 64
%add = add i64 %and, %b

View File

@ -12,25 +12,17 @@
; CIVI: s_load_dword [[LHS:s[0-9]+]]
; CIVI: s_load_dword [[RHS:s[0-9]+]]
; VI: s_ashr_i32
; VI: s_ashr_i32
; VI: s_sext_i32_i16
; VI: s_sext_i32_i16
; VI: s_ashr_i32
; VI: s_ashr_i32
; VI: s_lshl_b32
; VI: s_and_b32
; VI: s_or_b32
; CIVI-DAG: s_ashr_i32
; CIVI-DAG: s_ashr_i32
; CIVI-DAG: s_sext_i32_i16
; CIVI-DAG: s_sext_i32_i16
; CIVI-DAG: s_ashr_i32
; CIVI-DAG: s_ashr_i32
; CIVI-DAG: s_lshl_b32
; CIVI: s_and_b32
; CIVI: s_or_b32
; CI: s_ashr_i32
; CI: s_and_b32
; CI: s_lshr_b32
; CI: s_sext_i32_i16
; CI: s_ashr_i32
; CI: s_ashr_i32
; CI: s_lshl_b32
; CI: s_and_b32
define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x i16> %lhs, i32, <2 x i16> %rhs) #0 {
%result = ashr <2 x i16> %lhs, %rhs
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
ret void

View File

@ -7,16 +7,16 @@
; GFX9-NOT: m0
; SICIVI-DAG: s_mov_b32 m0
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, [8 x i32], i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
%result = extractvalue { i32, i1 } %pair, 0
@ -70,15 +70,15 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspac
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x12
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48
; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
; GCN: s_endpgm
define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind {
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
%pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
%result = extractvalue { i32, i1 } %pair, 0

View File

@ -0,0 +1,33 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s
; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0
; ALL-LABEL: {{^}}max_9_sgprs:
; ALL: SGPRBlocks: 1
; ALL: NumSGPRsForWavesPerEU: 9
define amdgpu_kernel void @max_9_sgprs() #0 {
%one = load volatile i32, i32 addrspace(4)* undef
%two = load volatile i32, i32 addrspace(4)* undef
%three = load volatile i32, i32 addrspace(4)* undef
%four = load volatile i32, i32 addrspace(4)* undef
%five = load volatile i32, i32 addrspace(4)* undef
%six = load volatile i32, i32 addrspace(4)* undef
%seven = load volatile i32, i32 addrspace(4)* undef
%eight = load volatile i32, i32 addrspace(4)* undef
%nine = load volatile i32, i32 addrspace(4)* undef
%ten = load volatile i32, i32 addrspace(4)* undef
call void asm sideeffect "", "s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight)
store volatile i32 %one, i32 addrspace(1)* undef
store volatile i32 %two, i32 addrspace(1)* undef
store volatile i32 %three, i32 addrspace(1)* undef
store volatile i32 %four, i32 addrspace(1)* undef
store volatile i32 %five, i32 addrspace(1)* undef
store volatile i32 %six, i32 addrspace(1)* undef
store volatile i32 %seven, i32 addrspace(1)* undef
store volatile i32 %eight, i32 addrspace(1)* undef
store volatile i32 %nine, i32 addrspace(1)* undef
store volatile i32 %ten, i32 addrspace(1)* undef
ret void
}
attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }

View File

@ -1,25 +1,37 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s
; If spilling to smem, additional registers are used for the resource
; descriptor.
; FIXME: Vectorization can increase required SGPR count beyond limit.
; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0
; ALL-LABEL: {{^}}max_9_sgprs:
; ALL: SGPRBlocks: 1
; ALL: NumSGPRsForWavesPerEU: 9
define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1,
i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3,
i32 addrspace(1)* %out4,
i32 addrspace(1)* %out5,
i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 {
store i32 %one, i32 addrspace(1)* %out1
store i32 %two, i32 addrspace(1)* %out2
store i32 %three, i32 addrspace(1)* %out3
store i32 %four, i32 addrspace(1)* %out4
store i32 %five, i32 addrspace(1)* %out5
define amdgpu_kernel void @max_9_sgprs() #0 {
%one = load volatile i32, i32 addrspace(4)* undef
%two = load volatile i32, i32 addrspace(4)* undef
%three = load volatile i32, i32 addrspace(4)* undef
%four = load volatile i32, i32 addrspace(4)* undef
%five = load volatile i32, i32 addrspace(4)* undef
%six = load volatile i32, i32 addrspace(4)* undef
%seven = load volatile i32, i32 addrspace(4)* undef
%eight = load volatile i32, i32 addrspace(4)* undef
%nine = load volatile i32, i32 addrspace(4)* undef
%ten = load volatile i32, i32 addrspace(4)* undef
call void asm sideeffect "", "s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine)
store volatile i32 %one, i32 addrspace(1)* undef
store volatile i32 %two, i32 addrspace(1)* undef
store volatile i32 %three, i32 addrspace(1)* undef
store volatile i32 %four, i32 addrspace(1)* undef
store volatile i32 %five, i32 addrspace(1)* undef
store volatile i32 %six, i32 addrspace(1)* undef
store volatile i32 %seven, i32 addrspace(1)* undef
store volatile i32 %eight, i32 addrspace(1)* undef
store volatile i32 %nine, i32 addrspace(1)* undef
store volatile i32 %ten, i32 addrspace(1)* undef
ret void
}

View File

@ -29,7 +29,8 @@ end:
; GCN-LABEL: {{^}}test_brcc_i1:
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCNNOOPT: s_and_b32 s{{[0-9]+}}, 1, [[VAL]]
; GCNNOOPT: s_mov_b32 [[ONE:s[0-9]+]], 1{{$}}
; GCNNOOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], [[ONE]]
; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1
; GCN: s_cmp_eq_u32
; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]

View File

@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}v_ubfe_sub_i32:
; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
@ -48,10 +48,9 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
}
; GCN-LABEL: {{^}}s_ubfe_sub_i32:
; GCN: s_load_dword [[SRC:s[0-9]+]]
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]]
; GCN: v_bfe_u32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]]
define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@ -63,11 +62,10 @@ define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32
}
; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32:
; GCN: s_load_dword [[SRC:s[0-9]+]]
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]]
; GCN: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@ -126,10 +124,9 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
}
; GCN-LABEL: {{^}}s_sbfe_sub_i32:
; GCN: s_load_dword [[SRC:s[0-9]+]]
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]]
; GCN: v_bfe_i32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]]
define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@ -141,11 +138,10 @@ define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32
}
; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32:
; GCN: s_load_dword [[SRC:s[0-9]+]]
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]]
; GCN: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x

View File

@ -1,6 +1,6 @@
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck -check-prefixes=R600,FUNC %s
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FUNC %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FUNC %s
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s
; BFI_INT Definition pattern from ISA docs
; (y & x) | (z & ~x)
@ -119,10 +119,10 @@ entry:
; FUNC-LABEL: {{^}}s_bitselect_i64_pat_0:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_bfi_b32
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_bfi_b32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_bfi_b32
; GCN-DAG: v_bfi_b32
define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
%and0 = and i64 %a, %b
%not.a = xor i64 %a, -1
@ -136,10 +136,10 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
; FUNC-LABEL: {{^}}s_bitselect_i64_pat_1:
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_bfi_b32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_bfi_b32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_bfi_b32
; GCN-DAG: v_bfi_b32
define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b
@ -155,8 +155,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_bfi_b32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN: v_bfi_b32
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
; GCN-DAG: v_bfi_b32
define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
%xor.0 = xor i64 %a, %mask
%and = and i64 %xor.0, %b

View File

@ -11,22 +11,32 @@
; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
; GCN: s_cbranch_vccnz
; GCN: one{{$}}
; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
; GCN: buffer_store_short
; GCN: s_endpgm
; SI: one{{$}}
; SI: v_cvt_f16_f32_e32 v[[CVT:[0-9]+]], v[[A_F32]]
; GCN: two{{$}}
; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
; GCN: buffer_store_short v[[B_F16]]
; GCN: s_endpgm
; SI: two{{$}}
; SI: v_cvt_f16_f32_e32 v[[CVT]], v[[B_F32]]
; SI: one{{$}}
; SI: buffer_store_short v[[CVT]]
; SI: s_endpgm
; VI: one{{$}}
; VI: buffer_store_short v[[A_F16]]
; VI: s_endpgm
; VI: two{{$}}
; VI: buffer_store_short v[[B_F16]]
; VI: s_endpgm
define amdgpu_kernel void @br_cc_f16(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%fcmp = fcmp olt half %a.val, %b.val
br i1 %fcmp, label %one, label %two

View File

@ -490,7 +490,7 @@ ret:
; GCN-LABEL: {{^}}long_branch_hang:
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
; GCN: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:

View File

@ -9,7 +9,7 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
@ -23,7 +23,7 @@
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel

View File

@ -123,7 +123,7 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
}
; FUNC-LABEL: {{^}}s_ctlz_i64:
; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
@ -133,7 +133,7 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
store i64 %ctlz, i64 addrspace(1)* %out
ret void

View File

@ -98,7 +98,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i
}
; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
@ -108,7 +108,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i
; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
store i64 %ctlz, i64 addrspace(1)* %out
ret void

View File

@ -305,14 +305,14 @@ define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %o
; but there are some cases when the should be allowed.
; FUNC-LABEL: {{^}}ctpop_i32_in_br:
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x16
; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x58
; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
; EG: BCNT_INT
define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, [8 x i32], i32 %cond) {
entry:
%tmp0 = icmp eq i32 %cond, 0
br i1 %tmp0, label %if, label %else

View File

@ -308,7 +308,9 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(i16 addrspace(1)* noalias %o
; FUNC-LABEL: {{^}}ctpop_i16_in_br:
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
; GCN: s_and_b32 [[CTPOP_ARG:s[0-9]+]], [[VAL]], 0xffff
; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[CTPOP_ARG]]
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_short [[RESULT]],
; GCN: s_endpgm

View File

@ -13,13 +13,13 @@ declare i65 @llvm.ctpop.i65(i65) nounwind readnone
declare i128 @llvm.ctpop.i128(i128) nounwind readnone
; FUNC-LABEL: {{^}}s_ctpop_i64:
; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_dword [[VRESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
%truncctpop = trunc i64 %ctpop to i32
store i32 %truncctpop, i32 addrspace(1)* %out, align 4

View File

@ -58,11 +58,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(
}
; GCN-LABEL: {{^}}extract_vector_elt_v3f16:
; SI: s_load_dword s
; SI: s_load_dword s
; GFX89: s_load_dwordx2
; GFX89: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: buffer_store_short
; GCN: buffer_store_short
@ -78,8 +75,8 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3
; FIXME: Why sometimes vector shift?
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dwordx2 s
; SI: s_load_dwordx2 s
; GFX89: s_load_dwordx2 s
; GFX89: s_load_dwordx2 s
@ -87,9 +84,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3
; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v
; SI: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
; GCN: {{buffer|global}}_store_short
define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {

View File

@ -27,7 +27,7 @@ define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x
; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
; GCN: buffer_store_short [[VELT1]]
; GCN: ScratchSize: 0
define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %idx) #0 {
define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
%elt = extractelement <2 x i16> %vec, i32 %idx
store i16 %elt, i16 addrspace(1)* %out, align 2
@ -58,12 +58,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1
}
; GCN-LABEL: {{^}}extract_vector_elt_v3i16:
; SI: s_load_dword s
; SI: s_load_dwordx2 s
; SI: s_load_dword s
; GFX89: s_load_dwordx2
; GFX89: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN-NOT: {{buffer|flat|global}}_load
@ -79,8 +75,7 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x
}
; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dwordx2
; SI: buffer_store_short
; SI: buffer_store_short
@ -100,12 +95,12 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dwordx2 s
; SI: s_load_dwordx2 s
; GFX89-DAG: s_load_dwordx2
; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x2c
; GFX89-DAG: s_load_dword s
; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x24
; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x4c
; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[0:1], 0x54
; GCN-NOT: {{buffer|flat|global}}
@ -113,17 +108,13 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x
; SICI: buffer_store_short
; SICI: buffer_store_short
; SICI: buffer_load_ushort
; SICI: buffer_store_short
; GFX9-NOT: s_pack_ll_b32_b16
; GFX9-NOT: s_pack_lh_b32_b16
; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
; GFX89: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LOAD0]]:[[LOAD1]]{{\]}}, s{{[0-9]+}}
; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s
; GCN: {{buffer|global}}_store_short
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, [8 x i32], <3 x i16> %foo, i32 %idx) #0 {
%p0 = extractelement <3 x i16> %foo, i32 %idx
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
store i16 %p0, i16 addrspace(1)* %out

View File

@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
; GCN-LABEL: {{^}}extract_vector_elt_v1i8:
; GCN: s_load_dword [[LOAD:s[0-9]+]]
@ -14,7 +14,8 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i
; GCN-LABEL: {{^}}extract_vector_elt_v2i8:
; GCN: s_load_dword s
; GCN-NOT: {{flat|buffer|global}}
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
; VI: v_lshrrev_b16_e64 v{{[0-9]+}}, 8, s{{[0-9]+}}
; GCN-NOT: {{flat|buffer|global}}
; GCN: buffer_store_byte
; GCN: buffer_store_byte
@ -22,8 +23,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i
%p0 = extractelement <2 x i8> %foo, i32 0
%p1 = extractelement <2 x i8> %foo, i32 1
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p1, i8 addrspace(1)* %out
store i8 %p0, i8 addrspace(1)* %out1
store volatile i8 %p1, i8 addrspace(1)* %out
store volatile i8 %p0, i8 addrspace(1)* %out1
ret void
}
@ -38,8 +39,8 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i
%p0 = extractelement <3 x i8> %foo, i32 0
%p1 = extractelement <3 x i8> %foo, i32 2
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p1, i8 addrspace(1)* %out
store i8 %p0, i8 addrspace(1)* %out1
store volatile i8 %p1, i8 addrspace(1)* %out
store volatile i8 %p0, i8 addrspace(1)* %out1
ret void
}
@ -54,24 +55,24 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i
%p0 = extractelement <4 x i8> %foo, i32 0
%p1 = extractelement <4 x i8> %foo, i32 2
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p1, i8 addrspace(1)* %out
store i8 %p0, i8 addrspace(1)* %out1
store volatile i8 %p1, i8 addrspace(1)* %out
store volatile i8 %p0, i8 addrspace(1)* %out1
ret void
}
; GCN-LABEL: {{^}}extract_vector_elt_v8i8:
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN-NOT: {{flat|buffer|global}}
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16
; GCN-NOT: {{flat|buffer|global}}
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: buffer_store_byte
; GCN: buffer_store_byte
define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
%p0 = extractelement <8 x i8> %foo, i32 0
%p1 = extractelement <8 x i8> %foo, i32 2
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p1, i8 addrspace(1)* %out
store i8 %p0, i8 addrspace(1)* %out1
store volatile i8 %p1, i8 addrspace(1)* null
store volatile i8 %p0, i8 addrspace(1)* null
ret void
}
@ -87,25 +88,25 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x
%p0 = extractelement <16 x i8> %foo, i32 0
%p1 = extractelement <16 x i8> %foo, i32 2
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p1, i8 addrspace(1)* %out
store i8 %p0, i8 addrspace(1)* %out1
store volatile i8 %p1, i8 addrspace(1)* %out
store volatile i8 %p0, i8 addrspace(1)* %out1
ret void
}
; GCN-LABEL: {{^}}extract_vector_elt_v32i8:
; GCN: s_load_dword [[LOAD0:s[0-9]+]]
; GCN-NOT: {{flat|buffer|global}}
; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: s_load_dword [[VAL:s[0-9]+]]
; GCN-NOT: {{s|flat|buffer|global}}_load
; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[VAL]], 16
; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], s{{[0-9]+}}
; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
; GCN: buffer_store_byte [[V_ELT2]]
; GCN: buffer_store_byte [[V_LOAD0]]
define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
%p0 = extractelement <32 x i8> %foo, i32 0
%p1 = extractelement <32 x i8> %foo, i32 2
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p1, i8 addrspace(1)* %out
store i8 %p0, i8 addrspace(1)* %out1
store volatile i8 %p1, i8 addrspace(1)* null
store volatile i8 %p0, i8 addrspace(1)* null
ret void
}
@ -121,8 +122,8 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x
%p0 = extractelement <64 x i8> %foo, i32 0
%p1 = extractelement <64 x i8> %foo, i32 2
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p1, i8 addrspace(1)* %out
store i8 %p0, i8 addrspace(1)* %out1
store volatile i8 %p1, i8 addrspace(1)* %out
store volatile i8 %p0, i8 addrspace(1)* %out1
ret void
}
@ -132,42 +133,36 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x
; isTypeDesirableForOp in SimplifyDemandedBits
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c
; VI-NOT: {{flat|buffer|global}}
; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[ELT0]], [[ELT2]]
; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[BUILD_VEC]]
; VI: buffer_store_byte [[EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo, i32 %idx) #0 {
; VI: v_lshrrev_b16_e32 [[ELT:v[0-9]+]], [[SCALED_IDX]], [[V_LOAD]]
; VI: buffer_store_byte [[ELT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
%elt = extractelement <2 x i8> %foo, i32 %idx
store i8 %elt, i8 addrspace(1)* %out
store volatile i8 %elt, i8 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c
; VI-NOT: {{flat|buffer|global}}
; VI: s_lshr_b32 [[ELT12:s[0-9]+]], [[LOAD]], 8
; VI: v_lshlrev_b16_e64 [[ELT1:v[0-9]+]], 8, [[ELT12]]
; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[ELT0]], [[ELT1]]
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: v_lshrrev_b32_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[VEC3]]
; VI: buffer_store_byte [[EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]]
; VI: v_mov_b32_e32 [[V_ELT:v[0-9]+]], [[ELT]]
; VI: buffer_store_byte [[V_ELT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
%p0 = extractelement <3 x i8> %foo, i32 %idx
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p0, i8 addrspace(1)* %out
store volatile i8 %p0, i8 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x30
; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
@ -175,16 +170,16 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out
; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
; VI: buffer_store_byte [[V_EXTRACT]]
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 {
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
%vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr
%p0 = extractelement <4 x i8> %vec, i32 %idx
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p0, i8 addrspace(1)* %out
store volatile i8 %p0, i8 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8:
; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x10
; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
@ -195,7 +190,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out
%vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr
%p0 = extractelement <8 x i8> %vec, i32 %idx
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
store i8 %p0, i8 addrspace(1)* %out
store volatile i8 %p0, i8 addrspace(1)* %out
ret void
}

View File

@ -39,9 +39,9 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half
}
; GCN-LABEL: {{^}}s_fabs_v4f16:
; CI: s_load_dword s[[LO:[0-9]+]]
; CI: s_load_dword s[[HI:[0-9]+]]
; CI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2
; GFX89: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]]
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]]
@ -54,7 +54,7 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half
; GCN-LABEL: {{^}}fabs_fold_f16:
; GCN: s_load_dword [[IN0:s[0-9]+]]
; GCN: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16
; GCN-DAG: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16
; CI-DAG: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], |[[IN0]]|
; CI-DAG: v_cvt_f32_f16_e32 [[ABS_CVT1:v[0-9]+]], [[IN1]]
@ -62,6 +62,7 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half
; CI-DAG: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
; GFX89-NOT: and
; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]]
; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]]
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]

View File

@ -53,11 +53,11 @@ define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x doub
}
; SI-LABEL: {{^}}fabs_fold_f64:
; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; SI-NOT: and
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
; SI: s_endpgm
define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, [8 x i32], double %in0, [8 x i32], double %in1) {
%fabs = call double @llvm.fabs.f64(double %in0)
%fmul = fmul double %fabs, %in1
store double %fmul, double addrspace(1)* %out
@ -65,11 +65,11 @@ define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0,
}
; SI-LABEL: {{^}}fabs_fn_fold_f64:
; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; SI-NOT: and
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
; SI: s_endpgm
define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, [8 x i32], double %in0, [8 x i32], double %in1) {
%fabs = call double @fabs(double %in0)
%fmul = fmul double %fabs, %in1
store double %fmul, double addrspace(1)* %out

View File

@ -70,10 +70,11 @@ define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float
}
; GCN-LABEL: {{^}}fabs_fn_fold:
; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
; GCN-NOT: and
; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]]
; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]]
define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
%fabs = call float @fabs(float %in0)
%fmul = fmul float %fabs, %in1
@ -82,10 +83,11 @@ define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, fl
}
; FUNC-LABEL: {{^}}fabs_fold:
; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
; GCN-NOT: and
; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]]
; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]]
define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
%fabs = call float @llvm.fabs.f32(float %in0)
%fmul = fmul float %fabs, %in1

View File

@ -16,8 +16,8 @@ define amdgpu_kernel void @fadd_f16(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fadd half %a.val, %b.val
store half %r.val, half addrspace(1)* %r
ret void
@ -65,10 +65,10 @@ entry:
; VI: flat_load_dword v[[B_V2_F16:[0-9]+]]
; VI: flat_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
@ -102,13 +102,13 @@ entry:
; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
@ -133,13 +133,13 @@ entry:
; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

View File

@ -16,8 +16,8 @@ define amdgpu_kernel void @fcmp_f16_lt(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp olt half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -42,8 +42,8 @@ define amdgpu_kernel void @fcmp_f16_lt_abs(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%a.abs = call half @llvm.fabs.f16(half %a.val)
%b.abs = call half @llvm.fabs.f16(half %b.val)
%r.val = fcmp olt half %a.abs, %b.abs
@ -67,8 +67,8 @@ define amdgpu_kernel void @fcmp_f16_eq(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp oeq half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -90,8 +90,8 @@ define amdgpu_kernel void @fcmp_f16_le(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp ole half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -113,8 +113,8 @@ define amdgpu_kernel void @fcmp_f16_gt(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp ogt half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -136,8 +136,8 @@ define amdgpu_kernel void @fcmp_f16_lg(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp one half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -159,8 +159,8 @@ define amdgpu_kernel void @fcmp_f16_ge(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp oge half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -182,8 +182,8 @@ define amdgpu_kernel void @fcmp_f16_o(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp ord half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -205,8 +205,8 @@ define amdgpu_kernel void @fcmp_f16_u(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp uno half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -228,8 +228,8 @@ define amdgpu_kernel void @fcmp_f16_nge(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp ult half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -251,8 +251,8 @@ define amdgpu_kernel void @fcmp_f16_nlg(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp ueq half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -274,8 +274,8 @@ define amdgpu_kernel void @fcmp_f16_ngt(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp ule half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -297,8 +297,8 @@ define amdgpu_kernel void @fcmp_f16_nle(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp ugt half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -320,8 +320,8 @@ define amdgpu_kernel void @fcmp_f16_neq(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp une half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r
@ -343,8 +343,8 @@ define amdgpu_kernel void @fcmp_f16_nlt(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fcmp uge half %a.val, %b.val
%r.val.sext = sext i1 %r.val to i32
store i32 %r.val.sext, i32 addrspace(1)* %r

View File

@ -30,8 +30,8 @@ define amdgpu_kernel void @test_copysign_f16(
half addrspace(1)* %arg_mag,
half addrspace(1)* %arg_sign) {
entry:
%mag = load half, half addrspace(1)* %arg_mag
%sign = load half, half addrspace(1)* %arg_sign
%mag = load volatile half, half addrspace(1)* %arg_mag
%sign = load volatile half, half addrspace(1)* %arg_sign
%out = call half @llvm.copysign.f16(half %mag, half %sign)
store half %out, half addrspace(1)* %arg_out
ret void

View File

@ -8,12 +8,11 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read
; Try to identify arg based on higher address.
; FUNC-LABEL: {{^}}test_copysign_f32:
; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb
; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc
; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c
; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30
; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
; SI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0xb
; VI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0x2c
; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], s[[SSIGN]]
; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], s[[SMAG]]
; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2
; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
; GCN: buffer_store_dword [[RESULT]],

View File

@ -6,10 +6,10 @@ declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind r
declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone
; FUNC-LABEL: {{^}}test_copysign_f64:
; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x74
; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2
@ -17,15 +17,15 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r
; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
; GCN: s_endpgm
define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, [8 x i32], double %mag, [8 x i32], double %sign) nounwind {
%result = call double @llvm.copysign.f64(double %mag, double %sign)
store double %result, double addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}test_copysign_f64_f32:
; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13
; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN-DAG: s_load_dword s[[SSIGN:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2{{$}}
; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
@ -33,7 +33,7 @@ define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %
; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]]
; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind {
define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, [8 x i32], double %mag, float %sign) nounwind {
%c = fpext float %sign to double
%result = call double @llvm.copysign.f64(double %mag, double %c)
store double %result, double addrspace(1)* %out, align 8

View File

@ -64,9 +64,9 @@ define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float>
; SI: v_fma_f32
; SI: v_fma_f32
; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+$}}
; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}},
; EG-DAG: FMA {{\*? *}}[[RES]].X

View File

@ -25,14 +25,12 @@ define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addr
}
; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-SAFE-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
; SI-NONAN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]]
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]]
; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]]
; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
%cmp = fcmp ule float %a, %b

View File

@ -1,6 +1,6 @@
; XUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
; XUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't
@ -44,7 +44,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out,
; GCN-DAG: buffer_store_dword [[MUL2]]
; GCN-DAG: buffer_store_dword [[MAD]]
; GCN: s_endpgm
define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, [8 x i32], float %y) #0 {
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
%mul2 = fmul fast float %x, 2.0
%mad = fadd fast float %mul2, %y

View File

@ -17,8 +17,8 @@ define amdgpu_kernel void @fmul_f16(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fmul half %a.val, %b.val
store half %r.val, half addrspace(1)* %r
ret void
@ -36,7 +36,7 @@ define amdgpu_kernel void @fmul_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b) {
entry:
%b.val = load half, half addrspace(1)* %b
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fmul half 3.0, %b.val
store half %r.val, half addrspace(1)* %r
ret void
@ -55,24 +55,24 @@ define amdgpu_kernel void @fmul_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
%a.val = load half, half addrspace(1)* %a
%a.val = load volatile half, half addrspace(1)* %a
%r.val = fmul half %a.val, 4.0
store half %r.val, half addrspace(1)* %r
ret void
}
; GCN-LABEL: {{^}}fmul_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SIVI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SIVI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
@ -82,6 +82,8 @@ entry:
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; GCN: buffer_store_dword v[[R_V2_F16]]
@ -100,13 +102,13 @@ entry:
; GCN-LABEL: {{^}}fmul_v2f16_imm_a:
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400
@ -133,13 +135,13 @@ entry:
; GCN-LABEL: {{^}}fmul_v2f16_imm_b:
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@ -164,13 +166,15 @@ entry:
}
; GCN-LABEL: {{^}}fmul_v4f16:
; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
; GFX9: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
; GFX9: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}}
; VI: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; VI: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
; VI: v_mul_f16_sdwa
; VI: v_mul_f16_e32
; VI: v_mul_f16_sdwa

View File

@ -109,8 +109,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x h
}
; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16:
; CI: s_load_dword s
; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
; CI: s_load_dword [[IN:s[0-9]+]]
; CI: s_or_b32 [[FNEG_FABS:s[0-9]+]], [[IN]], 0x80008000
; CI: s_lshr_b32
; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}

View File

@ -55,14 +55,13 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64
}
; GCN-LABEL: {{^}}fneg_fabs_f64:
; GCN-DAG: s_load_dwordx2
; GCN-DAG: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x13
; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x4c
; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, [8 x i32], double %in) {
%fabs = call double @llvm.fabs.f64(double %in)
%fsub = fsub double -0.000000e+00, %fabs
store double %fsub, double addrspace(1)* %out, align 8

View File

@ -4,7 +4,7 @@
; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
; SI-NOT: and
; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
; SI: v_sub_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{s[0-9]+}}|
define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
%fabs = call float @llvm.fabs.f32(float %x)
%fsub = fsub float -0.000000e+00, %fabs
@ -15,7 +15,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x
; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32:
; SI-NOT: and
; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}|
; SI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{s[0-9]+}}|
; SI-NOT: and
define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
%fabs = call float @llvm.fabs.f32(float %x)

View File

@ -48,11 +48,11 @@ define amdgpu_kernel void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
}
; GCN-LABEL: {{^}}fneg_fold_f64:
; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN-NOT: xor
; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, [8 x i32], double %in) {
%fsub = fsub double -0.0, %in
%fmul = fmul double %fsub, %in
store double %fmul, double addrspace(1)* %out

View File

@ -13,18 +13,17 @@ target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:
define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 {
entry:
; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
; CHECK: s_load_dword s2, s[0:1], 0xb
; CHECK: s_load_dword s0, s[0:1], 0xc
; CHECK: s_load_dwordx2 s[0:1], s[0:1], 0xb
; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
; CHECK: s_mov_b32 s10, -1
; CHECK: s_waitcnt lgkmcnt(0)
; CHECK: s_lshl_b32 s1, s2, 2
; CHECK: v_mov_b32_e32 v0, 4
; CHECK: s_mov_b32 s11, 0xe8f000
; CHECK: v_add_i32_e32 v1, vcc, s1, v0
; CHECK: v_mov_b32_e32 v2, 7
; CHECK: s_waitcnt lgkmcnt(0)
; CHECK: s_lshl_b32 s0, s0, 2
; CHECK: v_add_i32_e32 v1, vcc, s0, v0
; CHECK: s_lshl_b32 s0, s1, 2
; CHECK: s_mov_b32 s11, 0xe8f000
; CHECK: v_mov_b32_e32 v2, 7
; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen
; CHECK: v_add_i32_e32 v0, vcc, s0, v0
; CHECK: s_mov_b32 s7, 0xf000
@ -35,14 +34,14 @@ entry:
; CHECK: s_endpgm
%x = alloca [100 x i32], align 4, addrspace(5)
%0 = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)*
call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0
%alloca.bc = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)*
call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %alloca.bc) #0
%arrayidx = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %i
store i32 7, i32 addrspace(5)* %arrayidx, align 4
%arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %j
%1 = load i32, i32 addrspace(5)* %arrayidx2, align 4
store i32 %1, i32 addrspace(1)* %a, align 4
call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0
%ld = load i32, i32 addrspace(5)* %arrayidx2, align 4
store i32 %ld, i32 addrspace(1)* %a, align 4
call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %alloca.bc) #0
ret void
}

View File

@ -17,8 +17,8 @@ define amdgpu_kernel void @fsub_f16(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fsub half %a.val, %b.val
store half %r.val, half addrspace(1)* %r
ret void
@ -36,7 +36,7 @@ define amdgpu_kernel void @fsub_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b) {
entry:
%b.val = load half, half addrspace(1)* %b
%b.val = load volatile half, half addrspace(1)* %b
%r.val = fsub half 1.0, %b.val
store half %r.val, half addrspace(1)* %r
ret void
@ -54,33 +54,41 @@ define amdgpu_kernel void @fsub_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a) {
entry:
%a.val = load half, half addrspace(1)* %a
%a.val = load volatile half, half addrspace(1)* %a
%r.val = fsub half %a.val, 2.0
store half %r.val, half addrspace(1)* %r
ret void
}
; GCN-LABEL: {{^}}fsub_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
; GCN: buffer_store_dword v[[R_V2_F16]]
@ -101,13 +109,13 @@ entry:
; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
@ -135,13 +143,13 @@ entry:
; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

View File

@ -5,7 +5,7 @@
; CHECK: s_load_dwordx4
; CHECK-NOT: flat_load_dword
define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) {
define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) {
bb:
%tmp2 = load float, float addrspace(1)* %arg, align 4, !tbaa !8
%tmp3 = fadd float %tmp2, 0.000000e+00
@ -28,7 +28,7 @@ bb:
; CHECK: flat_load_dword
; CHECK-NOT: s_load_dwordx4
define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) #0 {
define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) #0 {
bb:
%tmp = call i32 @llvm.amdgcn.workitem.id.x() #1
%tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp
@ -59,7 +59,7 @@ bb:
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) {
store i32 0, i32 addrspace(1)* %out0
%val = load i32, i32 addrspace(1)* %in
store i32 %val, i32 addrspace(1)* %out1
@ -71,7 +71,7 @@ define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, i3
; CHECK: flat_store_dword
; CHECK: flat_load_dword [[VVAL:v[0-9]+]]
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) {
store i32 0, i32 addrspace(1)* %out0
%val = load i32, i32 addrspace(1)* %in
store i32 %val, i32 addrspace(1)* %out1
@ -80,19 +80,20 @@ define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out0
; uniform load from global array
; CHECK-LABEL: @global_array
; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]]
; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]]
; CHECK: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0
; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0
; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0
; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
@A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4
define amdgpu_kernel void @global_array(i32 addrspace(1)* nocapture %out) {
entry:
%0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
%1 = load i32, i32 addrspace(1)* %0, align 4
store i32 %1, i32 addrspace(1)* %out, align 4
%load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
%load1 = load i32, i32 addrspace(1)* %load0, align 4
store i32 %load1, i32 addrspace(1)* %out, align 4
ret void
}
@ -105,13 +106,13 @@ entry:
; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
; CHECK: flat_load_dword [[VVAL:v[0-9]+]], [[A_ADDR]]
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, i32 %n) {
define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, [8 x i32], i32 %n) {
entry:
%gep = getelementptr i32, i32 addrspace(1) * %out, i32 %n
store i32 12, i32 addrspace(1) * %gep
%0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
%1 = load i32, i32 addrspace(1)* %0, align 4
store i32 %1, i32 addrspace(1)* %out, align 4
%load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
%load1 = load i32, i32 addrspace(1)* %load0, align 4
store i32 %load1, i32 addrspace(1)* %out, align 4
ret void
}

View File

@ -22,13 +22,8 @@ define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x ha
}
; GCN-LABEL: {{^}}load_v3f16_arg:
; SI: s_load_dwordx2
; SI: s_load_dword s
; SI: s_load_dword s
; VI: s_load_dwordx2
; VI: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN-NOT: {buffer|flat|global}}_load_
@ -45,11 +40,7 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha
; FIXME: Why not one load?
; GCN-LABEL: {{^}}load_v4f16_arg:
; SI-DAG: s_load_dword s[[ARG0_LO:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2
; SI-DAG: s_load_dword s[[ARG0_HI:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x3
; VI: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x2|0x8}}
; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], s[[ARG0_LO]]
; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], s[[ARG0_HI]]
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
@ -86,14 +77,8 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)*
}
; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
; SI: s_load_dwordx2 s
; SI: s_load_dword s
; SI: s_load_dword s
; VI: s_load_dwordx2
; VI: s_load_dwordx2
; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN: s_load_dwordx2 s
; GCN: s_load_dwordx2 s
; GCN-NOT: _load
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
@ -116,14 +101,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)*
}
; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; VI: s_load_dwordx2 s
; VI: s_load_dwordx2 s
; VI: s_load_dwordx2 s
; GCN: s_load_dwordx4
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
@ -154,7 +132,7 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, hal
}
; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
; GCN: s_load_dword
; GCN-DAG: s_load_dword s
; GCN: s_lshr_b32
; GCN-DAG: v_cvt_f32_f16_e32
@ -169,14 +147,8 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)*
}
; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
; SI: s_load_dword
; SI: s_load_dword
; VI: s_load_dwordx2
; VI: s_load_dwordx2
; GCN: s_lshr_b32
; GCN: s_load_dwordx2 s
; GCN: s_load_dwordx2 s
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
@ -191,19 +163,17 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)*
}
; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
; SI: s_load_dword s
; SI: s_load_dword s
; GCN: s_load_dwordx2 s
; GCN: s_load_dwordx2 s
; VI: s_load_dwordx2 s
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN-DAG: v_cvt_f64_f32_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
%ext = fpext <4 x half> %arg to <4 x double>
@ -212,14 +182,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)*
}
; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
; SI: s_load_dword s
; SI-NEXT: s_load_dword s
; SI-NEXT: s_load_dword s
; SI-NEXT: s_load_dword s
; SI-NOT: _load_
; VI: s_load_dwordx2 s
; VI: s_load_dwordx2 s
; GCN: s_load_dwordx2 s
; GCN: s_load_dwordx4 s
; GCN-DAG: v_cvt_f32_f16_e32
; GCN-DAG: v_cvt_f32_f16_e32
@ -299,12 +263,13 @@ define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, h
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
; GCN: flat_load_dword [[LOAD:v[0-9]+]],
; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; SI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
; GCN: s_endpgm
@ -348,6 +313,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; GCN: flat_store_dwordx4
@ -361,7 +327,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; VI: v_cvt_f32_f16_e32
; VI: v_cvt_f32_f16_sdwa
@ -430,19 +395,19 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(
; XVI-NOT: v_cvt_f32_f16
; GCN: flat_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
; SI-DAG: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
; SI-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
; VI-DAG: v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GCN: v_cvt_f32_f16_e32
; GCN: v_cvt_f32_f16_e32
; SI: v_cvt_f32_f16_e32
; VI: v_cvt_f32_f16_sdwa
; GCN-NOT: v_cvt_f32_f16
; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN: v_cvt_f64_f32_e32
; GCN-NOT: v_cvt_f64_f32_e32
; GCN-DAG: flat_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[XLO]]:[[YHI]]{{\]}}
; GCN-DAG: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[Z]]
; GCN-DAG: flat_store_dwordx4
; GCN-DAG: flat_store_dwordx2
; GCN: s_endpgm
define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
%val = load <3 x half>, <3 x half> addrspace(1)* %in

View File

@ -8,7 +8,7 @@
; CHECK: Version: [ 1, 0 ]
; CHECK: Kernels:
; CHECK: - Name: test
; CHECK-LABEL: - Name: test
; CHECK: SymbolName: 'test@kd'
; CHECK: CodeProps:
; CHECK: KernargSegmentSize: 24
@ -16,8 +16,8 @@
; CHECK: PrivateSegmentFixedSize: 0
; CHECK: KernargSegmentAlign: 8
; CHECK: WavefrontSize: 64
; CHECK: NumSGPRs: 6
; CHECK: NumVGPRs: 3
; CHECK: NumSGPRs: 8
; CHECK: NumVGPRs: 6
; CHECK: MaxFlatWorkGroupSize: 256
define amdgpu_kernel void @test(
half addrspace(1)* %r,
@ -31,18 +31,24 @@ entry:
ret void
}
; CHECK: - Name: num_spilled_sgprs
; CHECK-LABEL: - Name: num_spilled_sgprs
; CHECK: SymbolName: 'num_spilled_sgprs@kd'
; CHECK: CodeProps:
; CHECK: NumSpilledSGPRs: 41
; GFX700: NumSpilledSGPRs: 40
; GFX803: NumSpilledSGPRs: 24
; GFX900: NumSpilledSGPRs: 24
define amdgpu_kernel void @num_spilled_sgprs(
i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %out2,
i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, i32 addrspace(1)* %out5,
i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, i32 addrspace(1)* %out8,
i32 addrspace(1)* %out9, i32 addrspace(1)* %outa, i32 addrspace(1)* %outb,
i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, i32 addrspace(1)* %oute,
i32 addrspace(1)* %outf, i32 %in0, i32 %in1, i32 %in2, i32 %in3, i32 %in4,
i32 %in5, i32 %in6, i32 %in7, i32 %in8, i32 %in9, i32 %ina, i32 %inb,
i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32],
i32 addrspace(1)* %out4, i32 addrspace(1)* %out5, [8 x i32],
i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, [8 x i32],
i32 addrspace(1)* %out8, i32 addrspace(1)* %out9, [8 x i32],
i32 addrspace(1)* %outa, i32 addrspace(1)* %outb, [8 x i32],
i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, [8 x i32],
i32 addrspace(1)* %oute, i32 addrspace(1)* %outf, [8 x i32],
i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32],
i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32],
i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32],
i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 {
entry:
store i32 %in0, i32 addrspace(1)* %out0
@ -64,7 +70,7 @@ entry:
ret void
}
; CHECK: - Name: num_spilled_vgprs
; CHECK-LABEL: - Name: num_spilled_vgprs
; CHECK: SymbolName: 'num_spilled_vgprs@kd'
; CHECK: CodeProps:
; CHECK: NumSpilledVGPRs: 14

View File

@ -344,114 +344,114 @@ define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float
; GCN-LABEL: {{^}}add_inline_imm_0.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 0.0
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_0.5_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 0.5
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, -0.5
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_1.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 1.0
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, -1.0
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_2.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 2.0
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, -2.0
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_4.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 4.0
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, -4.0
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_inv_2pi_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
; SI-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30
; SI: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}}
; VI: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 0x3fc45f306dc9c882
store double %y, double addrspace(1)* %out
ret void
@ -461,40 +461,40 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out,
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30
; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 0xbfc45f306dc9c882
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_1_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}}
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 0x0000000000000001
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_2_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}}
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 0x0000000000000002
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_16_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 0x0000000000000010
store double %y, double addrspace(1)* %out
ret void
@ -504,7 +504,7 @@ define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, doub
; GCN: v_mov_b32_e32 v0, -1
; GCN: v_mov_b32_e32 v1, v0
; GCN: buffer_store_dwordx2 v[0:1]
define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 0xffffffffffffffff
store double %y, double addrspace(1)* %out
ret void
@ -514,7 +514,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, d
; GCN: v_mov_b32_e32 v0, -2
; GCN: v_mov_b32_e32 v1, -1
; GCN: buffer_store_dwordx2 v[0:1]
define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 0xfffffffffffffffe
store double %y, double addrspace(1)* %out
ret void
@ -524,29 +524,29 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, d
; GCN: v_mov_b32_e32 v0, -16
; GCN: v_mov_b32_e32 v1, -1
; GCN: buffer_store_dwordx2 v[0:1]
define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 0xfffffffffffffff0
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_63_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 0x000000000000003F
store double %y, double addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}add_inline_imm_64_f64:
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64
; GCN: buffer_store_dwordx2 [[REG]]
define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, [8 x i32], double %x) {
%y = fadd double %x, 0x0000000000000040
store double %y, double addrspace(1)* %out
ret void

View File

@ -310,9 +310,9 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)*
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
; GFX9: buffer_store_dword [[REG]]
; VI: buffer_load_dword
; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: buffer_load_dword
; VI-NOT: and
; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
; VI: v_or_b32

View File

@ -1,5 +1,5 @@
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
; FIXME: Broken on evergreen
; FIXME: For some reason the 8 and 16 vectors are being stored as
@ -75,8 +75,9 @@ define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out,
; GCN-LABEL: {{^}}insertelement_to_sgpr:
; GCN-NOT: v_readfirstlane
define amdgpu_ps <4 x float> @insertelement_to_sgpr(<4 x i32> inreg %samp) nounwind {
%tmp1 = insertelement <4 x i32> %samp, i32 0, i32 0
define <4 x float> @insertelement_to_sgpr() nounwind {
%tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
%tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
%tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
ret <4 x float> %tmp2
}
@ -154,11 +155,11 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %
}
; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]]
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind {
define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
%vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
ret void
@ -201,23 +202,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
}
; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-NOT: _load
; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: v_lshlrev_b16_e64 [[ELT1_SHIFT:v[0-9]+]], 8, [[ELT1]]
; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1
; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[MASK]]
; VI: v_or_b32_e32 [[BUILD_VECTOR:v[0-9]+]], [[ELT0]], [[ELT1_SHIFT]]
; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[BUILD_VECTOR]]
; VI-DAG: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[BUILD_VECTOR]]
; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]]
; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]]
; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]]
; VI: buffer_store_short [[OR]]
define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
%vecins = insertelement <2 x i8> %a, i8 5, i32 %b
store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
ret void
@ -227,68 +222,51 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
; isTypeDesirableForOp in SimplifyDemandedBits
; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-NOT: _load
; VI: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[LOAD]], 8
; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[VEC_HI]]
; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[VEC_HI]], [[ELT2]]
; VI: s_and_b32 [[ELT2:s[0-9]+]], [[LOAD]], 0xff0000{{$}}
; VI: s_mov_b32 [[MASK16:s[0-9]+]], 0xffff{{$}}
; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], [[MASK16]], [[SCALED_IDX]]
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
; VI: s_not_b32 [[NOT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
; VI: s_and_b32 [[AND_NOT_MASK:s[0-9]+]], [[NOT_MASK]], [[LOAD]]
; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16
; VI: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
; VI: v_or_b32_sdwa [[SDWA:v[0-9]+]], [[BUILD_VEC]], [[V_ELT2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI: s_not_b32 [[NOT_SHIFT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[NOT_SHIFT_MASK]], [[SDWA]]
; VI: v_lshrrev_b32_e32 [[HI2:v[0-9]+]], 16, [[AND_NOT_MASK]]
; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SCALED_IDX]], 5, [[SDWA]]
; VI: buffer_store_short [[BFI]]
; VI: buffer_store_byte [[HI2]]
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
; VI-DAG: buffer_store_short [[BFI]]
; VI-DAG: v_mov_b32_e32 [[V_HI2:v[0-9]+]], [[HI2]]
; VI: buffer_store_byte [[V_HI2]]
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
%vecins = insertelement <3 x i8> %a, i8 5, i32 %b
store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
; VI: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-NOT: _load
; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 8
; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff{{$}}
; VI: s_lshr_b32 [[ELT3:s[0-9]+]], [[VEC]], 24
; VI: s_lshr_b32 [[ELT2:s[0-9]+]], [[VEC]], 16
; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, [[ELT3]]
; VI: v_or_b32_e32
; VI: v_or_b32_sdwa
; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI: v_or_b32_sdwa
; VI: s_lshl_b32
; VI: v_bfi_b32
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
; VI: buffer_store_dword [[BFI]]
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
%vecins = insertelement <4 x i8> %a, i8 5, i32 %b
store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8:
; VI-NOT: {{buffer|flat|global}}
; VI: s_load_dword [[IDX:s[0-9]]]
; VI-NOT: {{buffer|flat|global}}
; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; VI-NOT: {{buffer|flat|global}}
; VI-NOT: {{buffer|flat|global}}_load
; VI-DAG: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; VI-DAG: s_load_dword [[IDX:s[0-9]]], s[4:5], 0x10
; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0{{$}}
; VI-DAG: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
@ -307,13 +285,8 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
; GCN: s_load_dwordx2
; GCN: s_load_dwordx4
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN: s_load_dword s
; GCN-NOT: _load_
; GCN: buffer_store_byte
; GCN: buffer_store_byte
@ -368,7 +341,7 @@ endif:
; GCN-LABEL: {{^}}dynamic_insertelement_v2f64:
; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}}
; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
@ -390,7 +363,7 @@ endif:
; GCN: buffer_store_dwordx4
; GCN: s_endpgm
define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
%vecins = insertelement <2 x double> %a, double 8.0, i32 %b
store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
ret void
@ -420,19 +393,18 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
; space is also 2x what should be required.
; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
; GCN: SCRATCH_RSRC_DWORD
; Stack store
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
; Write element
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}
; Stack reload
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
; Store result
; GCN: buffer_store_dwordx4
@ -447,19 +419,17 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)
}
; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
; GCN-DAG: SCRATCH_RSRC_DWORD
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}}
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}}
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}}
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}}
; GCN: buffer_store_dwordx4
; GCN: buffer_store_dwordx4

View File

@ -1,9 +1,9 @@
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
; GCN: s_load_dword [[VEC:s[0-9]+]]
; GCN: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}}
@ -18,17 +18,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out,
}
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reg:
; GCN: s_load_dword [[ELT0:s[0-9]+]]
; GCN: s_load_dword [[VEC:s[0-9]+]]
; GCN-DAG: s_load_dword [[ELT_LOAD:s[0-9]+]], s[4:5],
; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
; CIVI-DAG: s_and_b32 [[ELT0:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}}
; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
; GFX9-NOT: [[ELT0]]
; GFX9-NOT: [[VEC]]
; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]]
define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT_LOAD]], [[VEC]]
define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
%vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
@ -36,29 +36,29 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %
}
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_multi_use_hi_reg:
; GCN: s_load_dword [[ELT0:s[0-9]+]]
; GCN: s_load_dword [[VEC:s[0-9]+]]
; GCN-DAG: s_load_dword [[ELT_LOAD:s[0-9]+]], s[4:5],
; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; CI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
; CI-DAG: s_and_b32 [[ELT0_MASKED:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}}
; CI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
; CI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0_MASKED]], [[ELT1]]
; CI-DAG: ; use [[SHR]]
; FIXME: Should be able to void mask of upper bits
; VI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
; VI-DAG: s_and_b32 [[ELT_MASKED:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}}
; VI-DAG: s_and_b32 [[VEC_HIMASK:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
; VI: s_or_b32 [[OR:s[0-9]+]], [[ELT0]], [[VEC_HIMASK]]
; VI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
; VI-DAG: s_or_b32 [[OR:s[0-9]+]], [[ELT_MASKED]], [[VEC_HIMASK]]
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
; VI-DAG: ; use [[SHR]]
; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_LOAD]], [[ELT1]]
; GFX9-DAG: ; use [[ELT1]]
define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
%elt1 = extractelement <2 x i16> %vec, i32 1
%vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
@ -69,16 +69,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
}
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi:
; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
; GCN: s_load_dword [[VEC:s[0-9]+]]
; GCN-DAG: s_load_dword [[ELT_ARG:s[0-9]+]], s[4:5],
; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; CIVI: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_ARG]], [[ELT1]]
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[ELT1]]
; GFX9-NOT: [[ELT0]]
; GFX9-NOT: [[VEC]]
; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]]
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
%elt.hi = lshr i32 %elt.arg, 16
%elt = trunc i32 %elt.hi to i16
@ -88,7 +89,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
}
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1:
; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
; GCN: s_load_dword [[ELT_ARG:s[0-9]+]],
; GCN: s_load_dword [[VEC:s[0-9]+]],
; CIVI-DAG: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
@ -110,7 +111,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
}
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1:
; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
; GCN: s_load_dword [[ELT_ARG:s[0-9]+]],
; GCN: s_load_dword [[VEC:s[0-9]+]],
; CI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
@ -161,15 +162,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
}
; GCN-LABEL: {{^}}s_insertelement_v2i16_1_reg:
; GCN: s_load_dword [[ELT1:s[0-9]+]]
; GCN: s_load_dword [[VEC:s[0-9]+]]
; GCN-DAG: s_load_dword [[ELT1_LOAD:s[0-9]+]], s[4:5],
; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[ELT1_LOAD]], 16
; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
; GCN-NOT: shlr
; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]]
define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1_LOAD]]
define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
%vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
@ -444,12 +446,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
@ -473,12 +474,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspac
; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
@ -501,17 +501,18 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
}
; GCN-LABEL: {{^}}v_insertelement_v4f16_0:
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[4:5],
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
; GFX9: v_bfi_b32 v[[INS_LO:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[LO]]
; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[LO]]
; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL]], [[AND]]
; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL_MASKED]], [[AND]]
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_LO]]:[[HI]]{{\]}}
define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
@ -531,12 +532,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
; GFX9: v_lshl_or_b32 v[[INS_HALF:[0-9]+]], [[VAL]], 16, [[AND]]
; VI: s_lshl_b32 [[VAL]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]]
; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]]
; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], v[[LO]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL]], [[AND]]
; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL_HI]], [[AND]]
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_HALF]]:[[HI]]{{\]}}
define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
@ -553,17 +555,18 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out
}
; GCN-LABEL: {{^}}v_insertelement_v4f16_2:
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[4:5],
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]]
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
%tid.ext = sext i32 %tid to i64
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
@ -583,12 +586,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
; GFX9: v_lshl_or_b32 v[[INS_HI:[0-9]+]], [[VAL]], 16, [[AND]]
; VI: s_lshl_b32 [[VAL]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]]
; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]]
; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], v[[HI]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_HI]], [[AND]]
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
@ -611,8 +615,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out
; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]]
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {

View File

@ -210,8 +210,10 @@ entry:
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
; GCN: s_load_dword s
; GCN-NOT: {{buffer|flat|global}}_load_
; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
entry:
store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
@ -226,8 +228,7 @@ entry:
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
@ -236,6 +237,7 @@ entry:
store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
ret void
}
; FUNC-LABEL: {{^}}v3i32_arg:
; HSA-VI: kernarg_segment_byte_size = 32
; HSA-VI: kernarg_segment_alignment = 4
@ -274,8 +276,8 @@ entry:
; EG: VTX_READ_8
; EG: VTX_READ_8
; GCN: s_load_dword s
; GCN-NOT: {{buffer|flat|global}}_load_
; GCN-DAG: s_load_dwordx2 s
; GCN-DAG: s_load_dword s
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
entry:
store <4 x i8> %in, <4 x i8> addrspace(1)* %out
@ -290,12 +292,18 @@ entry:
; EG: VTX_READ_16
; EG: VTX_READ_16
; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x2c
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c
; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
entry:
store <4 x i16> %in, <4 x i16> addrspace(1)* %out
@ -348,23 +356,16 @@ entry:
; EG: VTX_READ_8
; EG: VTX_READ_8
; SI: s_load_dword s
; SI: s_load_dword s
; SI-NOT: {{buffer|flat|global}}_load
; SI: s_load_dwordx2 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load
; VI: s_load_dword s
; VI: s_load_dword s
; VI: v_lshlrev_b16
; VI: v_or_b32_e32
; VI: v_or_b32_sdwa
; VI: v_or_b32_sdwa
; VI: v_lshlrev_b16
; VI: s_lshr_b32
; VI: v_or_b32_sdwa
; VI: v_or_b32_sdwa
; VI: s_load_dwordx2 s
; VI-NEXT: s_load_dwordx2 s
; VI-NOT: lshl
; VI-NOT: _or
; VI-NOT: _sdwa
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
entry:
store <8 x i8> %in, <8 x i8> addrspace(1)* %out
@ -383,19 +384,14 @@ entry:
; EG: VTX_READ_16
; EG: VTX_READ_16
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dwordx2
; SI: s_load_dwordx4
; SI-NEXT: s_load_dwordx2
; SI-NOT: {{buffer|flat|global}}_load
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x3c
; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x18
; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
entry:
store <8 x i16> %in, <8 x i16> addrspace(1)* %out
@ -413,6 +409,7 @@ entry:
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
@ -462,33 +459,16 @@ entry:
; EG: VTX_READ_8
; EG: VTX_READ_8
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dwordx2
; SI: s_load_dwordx4 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_load_dword s
; VI: s_lshr_b32
; VI: v_lshlrev_b16
; VI: s_lshr_b32
; VI: s_lshr_b32
; VI: v_or_b32_sdwa
; VI: v_or_b32_sdwa
; VI: v_lshlrev_b16
; VI: v_lshlrev_b16
; VI: v_or_b32_sdwa
; VI: v_or_b32_sdwa
; VI: v_lshlrev_b16
; VI: v_lshlrev_b16
; VI: v_or_b32_sdwa
; VI: v_or_b32_sdwa
; VI: s_load_dwordx4 s
; VI-NOT: shr
; VI-NOT: shl
; VI-NOT: _sdwa
; VI-NOT: _or_
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
entry:
store <16 x i8> %in, <16 x i8> addrspace(1)* %out
@ -516,27 +496,14 @@ entry:
; EG: VTX_READ_16
; EG: VTX_READ_16
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dword s
; SI: s_load_dwordx8 s
; SI-NEXT: s_load_dwordx2 s
; SI-NOT: {{buffer|flat|global}}_load
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x4c
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x54
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x5c
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x28
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x38
; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
entry:
store <16 x i16> %in, <16 x i16> addrspace(1)* %out
@ -600,22 +567,21 @@ entry:
}
; FUNC-LABEL: {{^}}kernel_arg_i64:
; MESA-GCN: s_load_dwordx2
; MESA-GCN: s_load_dwordx2
; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
store i64 %a, i64 addrspace(1)* %out, align 8
ret void
}
; FUNC-LABEL: {{^}}f64_kernel_arg:
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
; MESA-GCN: buffer_store_dwordx2
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
entry:
store double %in, double addrspace(1)* %out

View File

@ -6,7 +6,7 @@
; GCN: s_load_dword s[[LO:[0-9]+]]
; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]]
; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, half %data, i32 %index) {
define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) {
main_body:
call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
ret void

View File

@ -4,8 +4,8 @@ declare half @llvm.fabs.f16(half %a)
declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
; GCN-LABEL: {{^}}class_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
; GCN-DAG: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN-DAG: buffer_load_dword v[[B_I32:[0-9]+]]
; VI: v_cmp_class_f16_e32 vcc, v[[A_F16]], v[[B_I32]]
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
; GCN: buffer_store_dword v[[R_I32]]
@ -33,7 +33,9 @@ entry:
; GCN: s_endpgm
define amdgpu_kernel void @class_f16_fabs(
i32 addrspace(1)* %r,
[8 x i32],
half %a.val,
[8 x i32],
i32 %b.val) {
entry:
%a.val.fabs = call half @llvm.fabs.f16(half %a.val)
@ -53,7 +55,9 @@ entry:
; GCN: s_endpgm
define amdgpu_kernel void @class_f16_fneg(
i32 addrspace(1)* %r,
[8 x i32],
half %a.val,
[8 x i32],
i32 %b.val) {
entry:
%a.val.fneg = fsub half -0.0, %a.val
@ -73,7 +77,9 @@ entry:
; GCN: s_endpgm
define amdgpu_kernel void @class_f16_fabs_fneg(
i32 addrspace(1)* %r,
[8 x i32],
half %a.val,
[8 x i32],
i32 %b.val) {
entry:
%a.val.fabs = call half @llvm.fabs.f16(half %a.val)

View File

@ -1,4 +1,4 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
declare i1 @llvm.amdgcn.class.f32(float, i32) #1
declare i1 @llvm.amdgcn.class.f64(double, i32) #1
@ -7,14 +7,14 @@ declare float @llvm.fabs.f32(float) #1
declare double @llvm.fabs.f64(double) #1
; SI-LABEL: {{^}}test_class_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@ -22,14 +22,14 @@ define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32
}
; SI-LABEL: {{^}}test_class_fabs_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
@ -38,14 +38,14 @@ define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a,
}
; SI-LABEL: {{^}}test_class_fneg_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
%a.fneg = fsub float -0.0, %a
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
@ -54,14 +54,14 @@ define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a,
}
; SI-LABEL: {{^}}test_class_fneg_fabs_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
%a.fabs = call float @llvm.fabs.f32(float %a) #1
%a.fneg.fabs = fsub float -0.0, %a.fabs
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
@ -183,14 +183,14 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(i32 addrspac
}
; SI-LABEL: {{^}}test_class_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4
@ -198,14 +198,14 @@ define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32
}
; SI-LABEL: {{^}}test_class_fabs_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
%a.fabs = call double @llvm.fabs.f64(double %a) #1
%result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
%sext = sext i1 %result to i32
@ -214,14 +214,14 @@ define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a
}
; SI-LABEL: {{^}}test_class_fneg_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
%a.fneg = fsub double -0.0, %a
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
%sext = sext i1 %result to i32
@ -230,14 +230,14 @@ define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a
}
; SI-LABEL: {{^}}test_class_fneg_fabs_f64:
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
%a.fabs = call double @llvm.fabs.f64(double %a) #1
%a.fneg.fabs = fsub double -0.0, %a.fabs
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
@ -268,14 +268,14 @@ define amdgpu_kernel void @test_class_64_f64(i32 addrspace(1)* %out, double %a)
; Set all 9 bits of mask
; SI-LABEL: {{^}}test_class_full_mask_f64:
; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
; SI-NOT: vcc
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
; SI-NEXT: buffer_store_dword [[RESULT]]
; SI: s_endpgm
define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, [8 x i32], double %a) #0 {
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
%sext = sext i1 %result to i32
store i32 %sext, i32 addrspace(1)* %out, align 4

View File

@ -4,11 +4,10 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}s_cvt_pk_i16_i32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]]
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[SX]], [[VY]]
define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y)
%r = bitcast <2 x i16> %result to i32

View File

@ -4,11 +4,10 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}s_cvt_pk_u16_u32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]]
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[SX]], [[VY]]
define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y)
%r = bitcast <2 x i16> %result to i32

View File

@ -4,11 +4,10 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]]
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[SX]], [[VY]]
define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y)
%r = bitcast <2 x i16> %result to i32

View File

@ -4,11 +4,10 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]]
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[SX]], [[VY]]
define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y)
%r = bitcast <2 x i16> %result to i32

View File

@ -3,11 +3,10 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
; GCN-LABEL: {{^}}s_cvt_pkrtz_v2f16_f32:
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[X]], [[VY]]
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, s[[SX]], [[VY]]
define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
store <2 x half> %result, <2 x half> addrspace(1)* %out

View File

@ -15,9 +15,9 @@ define amdgpu_kernel void @div_fixup_f16(
half addrspace(1)* %b,
half addrspace(1)* %c) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%c.val = load half, half addrspace(1)* %c
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%c.val = load volatile half, half addrspace(1)* %c
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half %c.val)
store half %r.val, half addrspace(1)* %r
ret void
@ -35,8 +35,8 @@ define amdgpu_kernel void @div_fixup_f16_imm_a(
half addrspace(1)* %b,
half addrspace(1)* %c) {
entry:
%b.val = load half, half addrspace(1)* %b
%c.val = load half, half addrspace(1)* %c
%b.val = load volatile half, half addrspace(1)* %b
%c.val = load volatile half, half addrspace(1)* %c
%r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half %b.val, half %c.val)
store half %r.val, half addrspace(1)* %r
ret void
@ -54,8 +54,8 @@ define amdgpu_kernel void @div_fixup_f16_imm_b(
half addrspace(1)* %a,
half addrspace(1)* %c) {
entry:
%a.val = load half, half addrspace(1)* %a
%c.val = load half, half addrspace(1)* %c
%a.val = load volatile half, half addrspace(1)* %a
%c.val = load volatile half, half addrspace(1)* %c
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half 3.0, half %c.val)
store half %r.val, half addrspace(1)* %r
ret void
@ -73,8 +73,8 @@ define amdgpu_kernel void @div_fixup_f16_imm_c(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half 3.0)
store half %r.val, half addrspace(1)* %r
ret void
@ -90,7 +90,7 @@ define amdgpu_kernel void @div_fixup_f16_imm_a_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %c) {
entry:
%c.val = load half, half addrspace(1)* %c
%c.val = load volatile half, half addrspace(1)* %c
%r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half 3.0, half %c.val)
store half %r.val, half addrspace(1)* %r
ret void

View File

@ -5,18 +5,20 @@ declare float @llvm.amdgcn.div.fixup.f32(float, float, float) nounwind readnone
declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readnone
; GCN-LABEL: {{^}}test_div_fixup_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) nounwind {
%result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void

View File

@ -1,5 +1,5 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s
; FIXME: Enable for VI.
@ -8,33 +8,36 @@ declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readno
declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone
; GCN-LABEL: {{^}}test_div_fmas_f32:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
; GCN-DAG: s_and_b32 [[AND_I1:s[0-9]+]], 1, s{{[0-9]+}}
; GCN: v_cmp_eq_u32_e64 vcc, [[AND_I1]], 1
; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]]
; GCN: buffer_store_dword [[RESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0:
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
@ -43,26 +46,32 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %o
; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
; GCN: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
}
; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2:
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
; SI: buffer_store_dword [[RESULT]],
; SI: s_endpgm
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
; GCN: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
ret void
@ -77,8 +86,8 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %
}
; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
%cmp = icmp eq i32 %i, 0
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
@ -87,8 +96,8 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %ou
}
; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
; SI: s_mov_b64 vcc, 0
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; GCN: s_mov_b64 vcc, 0
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
store float %result, float addrspace(1)* %out, align 4
@ -96,8 +105,8 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
}
; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
; SI: s_mov_b64 vcc, -1
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; GCN: s_mov_b64 vcc, -1
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
store float %result, float addrspace(1)* %out, align 4

View File

@ -230,13 +230,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)*
}
; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_1:
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
@ -244,13 +244,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %
}
; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_2:
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
; SI: buffer_store_dword [[RESULT0]]
; SI: s_endpgm
define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
%result0 = extractvalue { float, i1 } %result, 0
store float %result0, float addrspace(1)* %out, align 4
@ -258,14 +258,14 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %
}
; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_1:
; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]]
; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]]
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}}
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind {
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8
@ -273,14 +273,14 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)*
}
; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_2:
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x1d
; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]]
; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]]
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]]
; SI: buffer_store_dwordx2 [[RESULT0]]
; SI: s_endpgm
define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind {
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
%result0 = extractvalue { double, i1 } %result, 0
store double %result0, double addrspace(1)* %out, align 8

View File

@ -7,7 +7,7 @@
; GCN: s_load_dword s[[S_LO:[0-9]+]]
; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) {
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %vindex) {
main_body:
call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
ret void
@ -41,7 +41,6 @@ main_body:
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen

View File

@ -24,14 +24,14 @@ entry:
; GCN-LABEL: {{^}}ceil_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

View File

@ -24,26 +24,31 @@ entry:
}
; GCN-LABEL: {{^}}cos_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI-DAG: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI-DAG: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]]
; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]]
; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
; GCN-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
; GCN-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
; GCN-DAG: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
; GCN-DAG: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
; GCN: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
; GCN: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; VI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
; GCN-NOT: and

View File

@ -2,11 +2,9 @@
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,OPT %s
; GCN-LABEL: {{^}}test_debug_value:
; NOOPT: s_load_dwordx2 s[4:5]
; FIXME: Why is the SGPR4_SGPR5 reference being removed from DBG_VALUE?
; NOOPT: ; kill: def $sgpr8_sgpr9 killed $sgpr4_sgpr5
; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- undef
; NOOPT: .loc 1 1 42 prologue_end ; /tmp/test_debug_value.cl:1:42
; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- $sgpr4_sgpr5
; GCN: flat_store_dword
; GCN: s_endpgm

View File

@ -24,13 +24,13 @@ entry:
; GCN-LABEL: {{^}}floor_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

View File

@ -105,16 +105,17 @@ define amdgpu_kernel void @fma_f16_imm_c(
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
@ -145,8 +146,9 @@ define amdgpu_kernel void @fma_v2f16(
}
; GCN-LABEL: {{^}}fma_v2f16_imm_a:
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
@ -157,13 +159,14 @@ define amdgpu_kernel void @fma_v2f16(
; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]]
@ -186,8 +189,8 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
}
; GCN-LABEL: {{^}}fma_v2f16_imm_b:
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
@ -195,10 +198,10 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
@ -229,8 +232,8 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
}
; GCN-LABEL: {{^}}fma_v2f16_imm_c:
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
@ -238,26 +241,31 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; GCN-NOT: and
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
; GCN-NOT: and
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fma_v2f16_imm_c(

View File

@ -58,8 +58,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
half addrspace(1)* %r,
half addrspace(1)* %b,
half addrspace(1)* %c) {
%b.val = load half, half addrspace(1)* %b
%c.val = load half, half addrspace(1)* %c
%b.val = load volatile half, half addrspace(1)* %b
%c.val = load volatile half, half addrspace(1)* %c
%r.val = call half @llvm.fmuladd.f16(half 3.0, half %b.val, half %c.val)
store half %r.val, half addrspace(1)* %r
ret void
@ -87,56 +87,64 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
half addrspace(1)* %r,
half addrspace(1)* %a,
half addrspace(1)* %c) {
%a.val = load half, half addrspace(1)* %a
%c.val = load half, half addrspace(1)* %c
%a.val = load volatile half, half addrspace(1)* %a
%c.val = load volatile half, half addrspace(1)* %c
%r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val)
store half %r.val, half addrspace(1)* %r
ret void
}
; GCN-LABEL: {{^}}fmuladd_v2f16
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; VI-FLUSH: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI-FLUSH: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; VI-FLUSH: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; VI-DENORM: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
; SI: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
; VI-FLUSH: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
; VI-FLUSH-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
; VI-FLUSH-NOT: v_and_b32
; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]]
; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]], v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]], v[[C_F16_1]]
; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
; VI-DENORM-NOT: v_and_b32
; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @fmuladd_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,

View File

@ -22,8 +22,8 @@ define amdgpu_kernel void @maxnum_f16(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
store half %r.val, half addrspace(1)* %r
ret void
@ -66,17 +66,16 @@ entry:
}
; GCN-LABEL: {{^}}maxnum_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
@ -89,7 +88,7 @@ entry:
; VI-NOT: and
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
@ -107,13 +106,13 @@ entry:
; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
@ -127,7 +126,6 @@ entry:
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_v2f16_imm_a(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %b) {
@ -140,13 +138,13 @@ entry:
; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
@ -162,7 +160,6 @@ entry:
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @maxnum_v2f16_imm_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
@ -192,8 +189,8 @@ entry:
; GCN-LABEL: {{^}}maxnum_v4f16:
; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
define amdgpu_kernel void @maxnum_v4f16(
<4 x half> addrspace(1)* %r,

View File

@ -22,8 +22,8 @@ define amdgpu_kernel void @minnum_f16(
half addrspace(1)* %a,
half addrspace(1)* %b) {
entry:
%a.val = load half, half addrspace(1)* %a
%b.val = load half, half addrspace(1)* %b
%a.val = load volatile half, half addrspace(1)* %a
%b.val = load volatile half, half addrspace(1)* %b
%r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
store half %r.val, half addrspace(1)* %r
ret void
@ -66,20 +66,20 @@ entry:
}
; GCN-LABEL: {{^}}minnum_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: and
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
@ -88,10 +88,9 @@ entry:
; VI-NOT: and
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_v2f16(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a,
@ -106,15 +105,13 @@ entry:
; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
@ -123,6 +120,7 @@ entry:
; SIVI-NOT: and
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
@ -139,26 +137,28 @@ entry:
; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SIVI-NOT: and
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
; GCN: buffer_store_dword v[[R_V2_F16]]
; GCN: s_endpgm
define amdgpu_kernel void @minnum_v2f16_imm_b(
<2 x half> addrspace(1)* %r,
<2 x half> addrspace(1)* %a) {
@ -188,8 +188,8 @@ entry:
; GCN-LABEL: {{^}}minnum_v4f16:
; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
define amdgpu_kernel void @minnum_v4f16(
<4 x half> addrspace(1)* %r,

View File

@ -25,13 +25,13 @@ entry:
; GCN-LABEL: {{^}}rint_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: v_and_b32
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

View File

@ -1,10 +1,10 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
declare half @llvm.sin.f16(half %a)
declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
; GCN-LABEL: {{^}}sin_f16
; GCN-LABEL: {{^}}sin_f16:
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
; GCN: v_mul_f32_e32 v[[M_F32:[0-9]+]], {{0.15915494|0x3e22f983}}, v[[A_F32]]
@ -23,16 +23,20 @@ entry:
ret void
}
; GCN-LABEL: {{^}}sin_v2f16
; GCN-LABEL: {{^}}sin_v2f16:
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
; SI: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}}
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]]
; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]]
; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
; SI: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
; SI: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@ -40,12 +44,11 @@ entry:
; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
; VI: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
; VI: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
; GCN-DAG: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
; GCN-DAG: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

View File

@ -24,13 +24,13 @@ entry:
; GCN-LABEL: {{^}}trunc_v2f16
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
; SI-DAG: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
; SI-DAG: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
; SI-NOT: v_and_b32
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]

View File

@ -17,7 +17,7 @@
; GCN-NOT: load_dword
; GCN: flat_store_dwordx2
define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, i64* %ptr0, i64* %ptr1, i64 addrspace(1)* %ptr2) {
define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], i64* %ptr0, [8 x i32], i64* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
%tmp2 = icmp eq i32 %tmp, 0
%tmp3 = load i64, i64* %ptr0, align 8
%tmp4 = load i64, i64* %ptr1, align 8
@ -38,7 +38,7 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, i64* %ptr0, i64*
; GCN: v_cndmask_b32
; GCN: v_cndmask_b32
; GCN: flat_store_dwordx2
define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, i64 addrspace(1)* %ptr0, i64 addrspace(1)* %ptr1, i64 addrspace(1)* %ptr2) {
define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], i64 addrspace(1)* %ptr0, [8 x i32], i64 addrspace(1)* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
%tmp2 = icmp eq i32 %tmp, 0
%tmp3 = load i64, i64 addrspace(1)* %ptr0, align 8
%tmp4 = load i64, i64 addrspace(1)* %ptr1, align 8

File diff suppressed because it is too large Load Diff

View File

@ -8,28 +8,14 @@
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
; VI: s_load_dword [[LHS:s[0-9]+]]
; VI: s_load_dword [[RHS:s[0-9]+]]
; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; VI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
; VI-DAG: s_lshl_b32
; VI: v_or_b32_e32
; CI: s_load_dword s
; CI-NEXT: s_load_dword s
; CI-NOT: {{buffer|flat}}
; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; CI: s_and_b32
; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; CI: s_and_b32
; CI: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
; CI: s_lshl_b32
; CI: v_or_b32_e32
; CIVI: s_load_dword [[LHS:s[0-9]+]]
; CIVI: s_load_dword [[RHS:s[0-9]+]]
; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
; CIVI-DAG: s_lshl_b32
; CIVI: v_or_b32_e32
define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
%result = lshr <2 x i16> %lhs, %rhs
store <2 x i16> %result, <2 x i16> addrspace(1)* %out

View File

@ -206,14 +206,14 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia
; SIFoldOperands should not fold the SGPR copy into the instruction
; because the implicit immediate already uses the constant bus.
; GCN-LABEL: {{^}}madak_constant_bus_violation:
; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
; GFX6: buffer_store_dword [[MUL]]
; GFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
bb:
%tmp = icmp eq i32 %arg1, 0
br i1 %tmp, label %bb3, label %bb4

View File

@ -83,7 +83,7 @@ define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out
; GCN-NOT: v_madmk_f32
; GCN: v_mac_f32_e32
; GCN: s_endpgm
define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid

View File

@ -216,14 +216,14 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %ou
; Make sure redundant and removed
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16:
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]]
; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
; SI: buffer_store_dword [[VMAX]]
; EG: MAX_UINT
define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) nounwind {
%a.ext = zext i16 %a to i32
%b.ext = zext i16 %b to i32
%cmp = icmp ugt i32 %a.ext, %b.ext
@ -236,14 +236,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspac
; Make sure redundant sign_extend_inreg removed.
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_max_slt_i16:
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]]
; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
; SI: buffer_store_dword [[VMAX]]
; EG: MAX_INT
define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) nounwind {
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32
%cmp = icmp sgt i32 %a.ext, %b.ext
@ -262,7 +262,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace
; SI: s_max_i32
; EG: MAX_INT
define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, [8 x i32], i16 %a, [8 x i32], i16 %b) nounwind {
%cmp = icmp sge i16 %a, %b
%val = select i1 %cmp, i16 %a, i16 %b
store i16 %val, i16 addrspace(1)* %out

View File

@ -1,7 +1,7 @@
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
; GCN: v_min_i32_e32
@ -65,16 +65,14 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <
; GCN: s_sext_i32_i8
; GCN: s_sext_i32_i8
; GCN: s_min_i32
define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 {
define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
%cmp = icmp sle i8 %a, %b
%val = select i1 %cmp, i8 %a, i8 %b
store i8 %val, i8 addrspace(1)* %out
ret void
}
; XXX - should be able to use s_min if we stop unnecessarily doing
; extloads with mubuf instructions.
; FIXME: Why vector and sdwa for last element?
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8:
; GCN: s_load_dword s
; GCN: s_load_dword s
@ -88,7 +86,7 @@ define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %
; VI: s_min_i32
; VI: s_min_i32
; VI: s_min_i32
; VI: s_min_i32
; VI: v_min_i32_sdwa
; GFX9: v_min_i16
; GFX9: v_min_i16
@ -99,7 +97,7 @@ define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %
; EG: MIN_INT
; EG: MIN_INT
; EG: MIN_INT
define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 {
define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
%cmp = icmp sle <4 x i8> %a, %b
%val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
store <4 x i8> %val, <4 x i8> addrspace(1)* %out
@ -110,9 +108,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4
; GCN: s_load_dword s
; GCN: s_load_dword s
; SI: s_ashr_i32
; SI: s_ashr_i32
; SI: s_sext_i32_i16
; SI: s_ashr_i32
; SI: s_sext_i32_i16
; SI: s_min_i32
; SI: s_min_i32
@ -346,8 +344,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrs
}
; FUNC-LABEL: {{^}}v_test_umin_ult_i8:
; SI: buffer_load_ubyte
; SI: buffer_load_ubyte
; SI: {{buffer|flat|global}}_load_ubyte
; SI: {{buffer|flat|global}}_load_ubyte
; SI: v_min_u32_e32
; GFX89: {{flat|global}}_load_ubyte
@ -490,14 +488,14 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <
; Make sure redundant and removed
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16:
; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
; GCN: buffer_store_dword [[VMIN]]
; EG: MIN_UINT
define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
%a.ext = zext i16 %a to i32
%b.ext = zext i16 %b to i32
%cmp = icmp ult i32 %a.ext, %b.ext
@ -510,14 +508,17 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspac
; Make sure redundant sign_extend_inreg removed.
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16:
; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
; GCN: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; GCN-DAG: s_sext_i32_i16 [[EXT_A:s[0-9]+]], [[A]]
; GCN-DAG: s_sext_i32_i16 [[EXT_B:s[0-9]+]], [[B]]
; GCN: s_min_i32 [[MIN:s[0-9]+]], [[EXT_A]], [[EXT_B]]
; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
; GCN: buffer_store_dword [[VMIN]]
; EG: MIN_INT
define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 {
define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 {
%a.ext = sext i16 %a to i32
%b.ext = sext i16 %b to i32
%cmp = icmp slt i32 %a.ext, %b.ext

View File

@ -6,7 +6,7 @@
; resulting in losing the store to gptr
; FUNC-LABEL: {{^}}missing_store_reduced:
; SI: s_load_dwordx2
; SI: s_load_dwordx4
; SI: ds_read_b64
; SI-DAG: buffer_store_dword
; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}

View File

@ -19,7 +19,7 @@
; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}},
define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, [8 x i32], i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
bb:
%tmp = icmp sgt i32 %arg3, 0
br i1 %tmp, label %bb4, label %bb17

View File

@ -16,7 +16,8 @@ define i16 @v_mul_i16(i16 %a, i16 %b) {
; FIXME: Should emit scalar mul or maybe i16 v_mul here
; GCN-LABEL: {{^}}s_mul_i16:
; GCN: v_mul_u32_u24
; SI: v_mul_u32_u24
; VI: s_mul_i16
define amdgpu_kernel void @s_mul_i16(i16 %a, i16 %b) {
%r.val = mul i16 %a, %b
store volatile i16 %r.val, i16 addrspace(1)* null

View File

@ -114,7 +114,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 a
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
; GCN: buffer_store_dword [[VRESULT]],
; GCN: s_endpgm
define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
%mul = mul i32 %a, %b
store i32 %mul, i32 addrspace(1)* %out, align 4
ret void
@ -201,10 +201,8 @@ endif:
; FIXME: Load dwordx4
; FUNC-LABEL: {{^}}s_mul_i128:
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: s_load_dwordx2
; GCN: s_load_dwordx4
; GCN: s_load_dwordx4
; SI: v_mul_hi_u32
; SI: v_mul_hi_u32
@ -220,18 +218,23 @@ endif:
; SI-DAG: s_mul_i32
; SI-DAG: v_mul_hi_u32
; VI: s_mul_i32
; VI: v_mul_hi_u32
; VI: s_mul_i32
; VI: s_mul_i32
; VI: v_mul_hi_u32
; VI: v_mul_hi_u32
; VI: s_mul_i32
; VI: v_mad_u64_u32
; VI: s_mul_i32
; VI: v_mad_u64_u32
; VI: s_mul_i32
; VI: s_mul_i32
; VI: v_mad_u64_u32
; VI: s_mul_i32
; GCN: buffer_store_dwordx4
define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
%mul = mul i128 %a, %b
store i128 %mul, i128 addrspace(1)* %out
ret void

View File

@ -70,7 +70,7 @@ entry:
; GCN-DAG: v_mul_i32_i24_e32
; GCN: buffer_store_dwordx2
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
%shl.i = shl i32 %a, 8
%shr.i = ashr i32 %shl.i, 8
%conv.i = sext i32 %shr.i to i64

View File

@ -18,8 +18,11 @@ entry:
}
; FUNC-LABEL: {{^}}test_umul24_i16_sext:
; GCN: v_mul_u32_u24_e{{(32|64)}} [[VI_MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; GCN: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16
; SI: v_mul_u32_u24_e{{(32|64)}} [[VI_MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
; SI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16
; VI: s_mul_i32 [[MUL:s[0-9]+]]
; VI: s_sext_i32_i16 s{{[0-9]+}}, [[MUL]]
define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
entry:
%mul = mul i16 %a, %b
@ -46,9 +49,12 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16
}
; FUNC-LABEL: {{^}}test_umul24_i16:
; GCN: s_and_b32
; GCN: v_mul_u32_u24_e32
; GCN: v_and_b32_e32
; SI: s_and_b32
; SI: v_mul_u32_u24_e32
; SI: v_and_b32_e32
; VI: s_mul_i32
; VI: s_and_b32
define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
entry:
%mul = mul i16 %a, %b
@ -147,7 +153,7 @@ entry:
; GCN-NOT: s_and_b32
; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) {
define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
entry:
%tmp0 = shl i64 %a, 40
%a.24 = lshr i64 %tmp0, 40

View File

@ -70,14 +70,14 @@
; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
; GCN: ; %Flow1
; GCN: ; %Flow5
; GCN-NEXT: s_or_b64 exec, exec
; GCN: v_cmp_ne_u32_e32 vcc, 0
; GCN: ; %exit1
; GCN: ds_write_b32
; GCN: %Flow2
; GCN: %Flow6
; GCN-NEXT: s_or_b64 exec, exec
; GCN: v_cmp_ne_u32_e32 vcc, 0
; GCN-NEXT: s_and_saveexec_b64

View File

@ -78,7 +78,7 @@ define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out,
; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
; SI: s_load_dword s
; SI: buffer_store_dword v
define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
%trunc = trunc i64 %arg to i32
store i32 %trunc, i32 addrspace(1)* %out
ret void
@ -100,7 +100,7 @@ define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %ou
; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
; SI: s_load_dword s
; SI: buffer_store_dword v
define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
%srl = lshr i64 %arg, 32
%trunc = trunc i64 %srl to i32
store i32 %trunc, i32 addrspace(1)* %out
@ -147,7 +147,7 @@ define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out,
; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
%srl = lshr i64 %arg, 32
%trunc = trunc i64 %srl to i8
store i8 %trunc, i8 addrspace(1)* %out
@ -171,7 +171,7 @@ define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64
; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
; SI: s_load_dword s
; SI: buffer_store_byte v
define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
%trunc = trunc i64 %arg to i8
store i8 %trunc, i8 addrspace(1)* %out
ret void

View File

@ -6,7 +6,7 @@
; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
; GCN: flat_load_dword v{{[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, i32 addrspace(1)* nocapture %arg1) {
define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, [8 x i32], i32 addrspace(1)* nocapture %arg1) {
bb:
%tmp18 = load volatile i32, i32 addrspace(1)* %arg, align 4
%tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 5

View File

@ -4,14 +4,18 @@
; Make sure there isn't an extra space between the instruction name and first operands.
; GCN-LABEL: {{^}}add_f32:
; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
; SI: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
; SI: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
; SI: v_mov_b32_e32 [[VREGA:v[0-9]+]], [[SREGA]]
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGB]], [[VREGA]]
; VI: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
; VI: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
; VI: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
; VI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
; GCN: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @add_f32(float addrspace(1)* %out, float %a, float %b) {
define amdgpu_kernel void @add_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) {
%result = fadd float %a, %b
store float %result, float addrspace(1)* %out
ret void

View File

@ -63,26 +63,26 @@ define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a)
}
; FUNC-LABEL: {{^}}scalar_or_literal_i64:
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; SI-DAG: s_or_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b
; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) {
define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
%or = or i64 %a, 4261135838621753
store i64 %or, i64 addrspace(1)* %out
ret void
}
; FUNC-LABEL: {{^}}scalar_or_literal_multi_use_i64:
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b
; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039
; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
%or = or i64 %a, 4261135838621753
store i64 %or, i64 addrspace(1)* %out
@ -92,7 +92,7 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %ou
}
; FUNC-LABEL: {{^}}scalar_or_inline_imm_i64:
; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
; SI-NOT: or_b32
; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63
; SI-NOT: or_b32
@ -101,7 +101,7 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %ou
; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
; SI-NOT: or_b32
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
%or = or i64 %a, 63
store i64 %or, i64 addrspace(1)* %out
ret void
@ -125,7 +125,7 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)*
; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}}
; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]]
; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
%or = or i64 %a, -8
store i64 %or, i64 addrspace(1)* %out
ret void
@ -239,7 +239,7 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64
; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
; SI: buffer_store_dword [[VRESULT]],
define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
%add = or i64 %b, %a
%trunc = trunc i64 %add to i32
store i32 %trunc, i32 addrspace(1)* %out, align 8
@ -249,7 +249,7 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i
; FUNC-LABEL: {{^}}or_i1:
; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}}
; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], vcc
define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
%a = load float, float addrspace(1)* %in0
%b = load float, float addrspace(1)* %in1

View File

@ -10,26 +10,26 @@
; GCN-LABEL: {{^}}spill_sgprs_to_multiple_vgprs:
; GCN: def s[8:15]
; GCN: def s[16:23]
; GCN: def s[24:31]
; GCN: def s[32:39]
; GCN: def s[40:47]
; GCN: def s[48:55]
; GCN: def s[56:63]
; GCN: def s[64:71]
; GCN: def s[72:79]
; GCN: def s[80:87]
; GCN: def s[88:95]
; GCN: def s[4:11]
; GCN: def s[12:19]
; GCN: def s[20:27]
; GCN: def s[28:35]
; GCN: def s[36:43]
; GCN: def s[44:51]
; GCN: def s[52:59]
; GCN: def s[60:67]
; GCN: def s[68:75]
; GCN: def s[76:83]
; GCN: def s[84:91]
; GCN: v_writelane_b32 v0, s8, 0
; GCN-NEXT: v_writelane_b32 v0, s9, 1
; GCN-NEXT: v_writelane_b32 v0, s10, 2
; GCN-NEXT: v_writelane_b32 v0, s11, 3
; GCN-NEXT: v_writelane_b32 v0, s12, 4
; GCN-NEXT: v_writelane_b32 v0, s13, 5
; GCN-NEXT: v_writelane_b32 v0, s14, 6
; GCN-NEXT: v_writelane_b32 v0, s15, 7
; GCN: v_writelane_b32 v0, s4, 0
; GCN-NEXT: v_writelane_b32 v0, s5, 1
; GCN-NEXT: v_writelane_b32 v0, s6, 2
; GCN-NEXT: v_writelane_b32 v0, s7, 3
; GCN-NEXT: v_writelane_b32 v0, s8, 4
; GCN-NEXT: v_writelane_b32 v0, s9, 5
; GCN-NEXT: v_writelane_b32 v0, s10, 6
; GCN-NEXT: v_writelane_b32 v0, s11, 7
; GCN: def s{{\[}}[[TMP_LO:[0-9]+]]:[[TMP_HI:[0-9]+]]{{\]}}
; GCN: v_writelane_b32 v0, s[[TMP_LO]], 8
@ -37,8 +37,8 @@
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 10
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 11
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 12
; GCN-NEXT: v_writelane_b32 v0, s13, 13
; GCN-NEXT: v_writelane_b32 v0, s14, 14
; GCN-NEXT: v_writelane_b32 v0, s9, 13
; GCN-NEXT: v_writelane_b32 v0, s10, 14
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 15
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@ -47,8 +47,8 @@
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 18
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 19
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 20
; GCN-NEXT: v_writelane_b32 v0, s13, 21
; GCN-NEXT: v_writelane_b32 v0, s14, 22
; GCN-NEXT: v_writelane_b32 v0, s9, 21
; GCN-NEXT: v_writelane_b32 v0, s10, 22
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 23
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@ -57,8 +57,8 @@
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 26
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 27
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 28
; GCN-NEXT: v_writelane_b32 v0, s13, 29
; GCN-NEXT: v_writelane_b32 v0, s14, 30
; GCN-NEXT: v_writelane_b32 v0, s9, 29
; GCN-NEXT: v_writelane_b32 v0, s10, 30
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 31
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@ -67,8 +67,8 @@
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 34
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 35
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 36
; GCN-NEXT: v_writelane_b32 v0, s13, 37
; GCN-NEXT: v_writelane_b32 v0, s14, 38
; GCN-NEXT: v_writelane_b32 v0, s9, 37
; GCN-NEXT: v_writelane_b32 v0, s10, 38
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 39
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@ -77,8 +77,8 @@
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 42
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 43
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 44
; GCN-NEXT: v_writelane_b32 v0, s13, 45
; GCN-NEXT: v_writelane_b32 v0, s14, 46
; GCN-NEXT: v_writelane_b32 v0, s9, 45
; GCN-NEXT: v_writelane_b32 v0, s10, 46
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
@ -87,90 +87,90 @@
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52
; GCN-NEXT: v_writelane_b32 v0, s13, 53
; GCN-NEXT: v_writelane_b32 v0, s14, 54
; GCN-NEXT: v_writelane_b32 v0, s9, 53
; GCN-NEXT: v_writelane_b32 v0, s10, 54
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55
; GCN-NEXT: v_writelane_b32 v0, s88, 56
; GCN-NEXT: v_writelane_b32 v0, s89, 57
; GCN-NEXT: v_writelane_b32 v0, s90, 58
; GCN-NEXT: v_writelane_b32 v0, s91, 59
; GCN-NEXT: v_writelane_b32 v0, s92, 60
; GCN-NEXT: v_writelane_b32 v0, s93, 61
; GCN-NEXT: v_writelane_b32 v0, s94, 62
; GCN-NEXT: v_writelane_b32 v0, s95, 63
; GCN-NEXT: v_writelane_b32 v1, s16, 0
; GCN-NEXT: v_writelane_b32 v1, s17, 1
; GCN-NEXT: v_writelane_b32 v1, s18, 2
; GCN-NEXT: v_writelane_b32 v1, s19, 3
; GCN-NEXT: v_writelane_b32 v1, s20, 4
; GCN-NEXT: v_writelane_b32 v1, s21, 5
; GCN-NEXT: v_writelane_b32 v1, s22, 6
; GCN-NEXT: v_writelane_b32 v1, s23, 7
; GCN-NEXT: v_writelane_b32 v1, s24, 8
; GCN-NEXT: v_writelane_b32 v1, s25, 9
; GCN-NEXT: v_writelane_b32 v1, s26, 10
; GCN-NEXT: v_writelane_b32 v1, s27, 11
; GCN-NEXT: v_writelane_b32 v1, s28, 12
; GCN-NEXT: v_writelane_b32 v1, s29, 13
; GCN-NEXT: v_writelane_b32 v1, s30, 14
; GCN-NEXT: v_writelane_b32 v1, s31, 15
; GCN-NEXT: v_writelane_b32 v1, s32, 16
; GCN-NEXT: v_writelane_b32 v1, s33, 17
; GCN-NEXT: v_writelane_b32 v1, s34, 18
; GCN-NEXT: v_writelane_b32 v1, s35, 19
; GCN-NEXT: v_writelane_b32 v1, s36, 20
; GCN-NEXT: v_writelane_b32 v1, s37, 21
; GCN-NEXT: v_writelane_b32 v1, s38, 22
; GCN-NEXT: v_writelane_b32 v1, s39, 23
; GCN-NEXT: v_writelane_b32 v1, s40, 24
; GCN-NEXT: v_writelane_b32 v1, s41, 25
; GCN-NEXT: v_writelane_b32 v1, s42, 26
; GCN-NEXT: v_writelane_b32 v1, s43, 27
; GCN-NEXT: v_writelane_b32 v1, s44, 28
; GCN-NEXT: v_writelane_b32 v1, s45, 29
; GCN-NEXT: v_writelane_b32 v1, s46, 30
; GCN-NEXT: v_writelane_b32 v1, s47, 31
; GCN-NEXT: v_writelane_b32 v1, s48, 32
; GCN-NEXT: v_writelane_b32 v1, s49, 33
; GCN-NEXT: v_writelane_b32 v1, s50, 34
; GCN-NEXT: v_writelane_b32 v1, s51, 35
; GCN-NEXT: v_writelane_b32 v1, s52, 36
; GCN-NEXT: v_writelane_b32 v1, s53, 37
; GCN-NEXT: v_writelane_b32 v1, s54, 38
; GCN-NEXT: v_writelane_b32 v1, s55, 39
; GCN-NEXT: v_writelane_b32 v1, s56, 40
; GCN-NEXT: v_writelane_b32 v1, s57, 41
; GCN-NEXT: v_writelane_b32 v1, s58, 42
; GCN-NEXT: v_writelane_b32 v1, s59, 43
; GCN-NEXT: v_writelane_b32 v1, s60, 44
; GCN-NEXT: v_writelane_b32 v1, s61, 45
; GCN-NEXT: v_writelane_b32 v1, s62, 46
; GCN-NEXT: v_writelane_b32 v1, s63, 47
; GCN-NEXT: v_writelane_b32 v1, s64, 48
; GCN-NEXT: v_writelane_b32 v1, s65, 49
; GCN-NEXT: v_writelane_b32 v1, s66, 50
; GCN-NEXT: v_writelane_b32 v1, s67, 51
; GCN-NEXT: v_writelane_b32 v1, s68, 52
; GCN-NEXT: v_writelane_b32 v1, s69, 53
; GCN-NEXT: v_writelane_b32 v1, s70, 54
; GCN-NEXT: v_writelane_b32 v1, s71, 55
; GCN-NEXT: v_writelane_b32 v1, s72, 56
; GCN-NEXT: v_writelane_b32 v1, s73, 57
; GCN-NEXT: v_writelane_b32 v1, s74, 58
; GCN-NEXT: v_writelane_b32 v1, s75, 59
; GCN-NEXT: v_writelane_b32 v1, s76, 60
; GCN-NEXT: v_writelane_b32 v1, s77, 61
; GCN-NEXT: v_writelane_b32 v1, s78, 62
; GCN-NEXT: v_writelane_b32 v1, s79, 63
; GCN-NEXT: v_writelane_b32 v2, s80, 0
; GCN-NEXT: v_writelane_b32 v2, s81, 1
; GCN-NEXT: v_writelane_b32 v2, s82, 2
; GCN-NEXT: v_writelane_b32 v2, s83, 3
; GCN-NEXT: v_writelane_b32 v2, s84, 4
; GCN-NEXT: v_writelane_b32 v2, s85, 5
; GCN-NEXT: v_writelane_b32 v2, s86, 6
; GCN-NEXT: v_writelane_b32 v2, s87, 7
; GCN-NEXT: v_writelane_b32 v0, s84, 56
; GCN-NEXT: v_writelane_b32 v0, s85, 57
; GCN-NEXT: v_writelane_b32 v0, s86, 58
; GCN-NEXT: v_writelane_b32 v0, s87, 59
; GCN-NEXT: v_writelane_b32 v0, s88, 60
; GCN-NEXT: v_writelane_b32 v0, s89, 61
; GCN-NEXT: v_writelane_b32 v0, s90, 62
; GCN-NEXT: v_writelane_b32 v0, s91, 63
; GCN-NEXT: v_writelane_b32 v1, s12, 0
; GCN-NEXT: v_writelane_b32 v1, s13, 1
; GCN-NEXT: v_writelane_b32 v1, s14, 2
; GCN-NEXT: v_writelane_b32 v1, s15, 3
; GCN-NEXT: v_writelane_b32 v1, s16, 4
; GCN-NEXT: v_writelane_b32 v1, s17, 5
; GCN-NEXT: v_writelane_b32 v1, s18, 6
; GCN-NEXT: v_writelane_b32 v1, s19, 7
; GCN-NEXT: v_writelane_b32 v1, s20, 8
; GCN-NEXT: v_writelane_b32 v1, s21, 9
; GCN-NEXT: v_writelane_b32 v1, s22, 10
; GCN-NEXT: v_writelane_b32 v1, s23, 11
; GCN-NEXT: v_writelane_b32 v1, s24, 12
; GCN-NEXT: v_writelane_b32 v1, s25, 13
; GCN-NEXT: v_writelane_b32 v1, s26, 14
; GCN-NEXT: v_writelane_b32 v1, s27, 15
; GCN-NEXT: v_writelane_b32 v1, s28, 16
; GCN-NEXT: v_writelane_b32 v1, s29, 17
; GCN-NEXT: v_writelane_b32 v1, s30, 18
; GCN-NEXT: v_writelane_b32 v1, s31, 19
; GCN-NEXT: v_writelane_b32 v1, s32, 20
; GCN-NEXT: v_writelane_b32 v1, s33, 21
; GCN-NEXT: v_writelane_b32 v1, s34, 22
; GCN-NEXT: v_writelane_b32 v1, s35, 23
; GCN-NEXT: v_writelane_b32 v1, s36, 24
; GCN-NEXT: v_writelane_b32 v1, s37, 25
; GCN-NEXT: v_writelane_b32 v1, s38, 26
; GCN-NEXT: v_writelane_b32 v1, s39, 27
; GCN-NEXT: v_writelane_b32 v1, s40, 28
; GCN-NEXT: v_writelane_b32 v1, s41, 29
; GCN-NEXT: v_writelane_b32 v1, s42, 30
; GCN-NEXT: v_writelane_b32 v1, s43, 31
; GCN-NEXT: v_writelane_b32 v1, s44, 32
; GCN-NEXT: v_writelane_b32 v1, s45, 33
; GCN-NEXT: v_writelane_b32 v1, s46, 34
; GCN-NEXT: v_writelane_b32 v1, s47, 35
; GCN-NEXT: v_writelane_b32 v1, s48, 36
; GCN-NEXT: v_writelane_b32 v1, s49, 37
; GCN-NEXT: v_writelane_b32 v1, s50, 38
; GCN-NEXT: v_writelane_b32 v1, s51, 39
; GCN-NEXT: v_writelane_b32 v1, s52, 40
; GCN-NEXT: v_writelane_b32 v1, s53, 41
; GCN-NEXT: v_writelane_b32 v1, s54, 42
; GCN-NEXT: v_writelane_b32 v1, s55, 43
; GCN-NEXT: v_writelane_b32 v1, s56, 44
; GCN-NEXT: v_writelane_b32 v1, s57, 45
; GCN-NEXT: v_writelane_b32 v1, s58, 46
; GCN-NEXT: v_writelane_b32 v1, s59, 47
; GCN-NEXT: v_writelane_b32 v1, s60, 48
; GCN-NEXT: v_writelane_b32 v1, s61, 49
; GCN-NEXT: v_writelane_b32 v1, s62, 50
; GCN-NEXT: v_writelane_b32 v1, s63, 51
; GCN-NEXT: v_writelane_b32 v1, s64, 52
; GCN-NEXT: v_writelane_b32 v1, s65, 53
; GCN-NEXT: v_writelane_b32 v1, s66, 54
; GCN-NEXT: v_writelane_b32 v1, s67, 55
; GCN-NEXT: v_writelane_b32 v1, s68, 56
; GCN-NEXT: v_writelane_b32 v1, s69, 57
; GCN-NEXT: v_writelane_b32 v1, s70, 58
; GCN-NEXT: v_writelane_b32 v1, s71, 59
; GCN-NEXT: v_writelane_b32 v1, s72, 60
; GCN-NEXT: v_writelane_b32 v1, s73, 61
; GCN-NEXT: v_writelane_b32 v1, s74, 62
; GCN-NEXT: v_writelane_b32 v1, s75, 63
; GCN-NEXT: v_writelane_b32 v2, s76, 0
; GCN-NEXT: v_writelane_b32 v2, s77, 1
; GCN-NEXT: v_writelane_b32 v2, s78, 2
; GCN-NEXT: v_writelane_b32 v2, s79, 3
; GCN-NEXT: v_writelane_b32 v2, s80, 4
; GCN-NEXT: v_writelane_b32 v2, s81, 5
; GCN-NEXT: v_writelane_b32 v2, s82, 6
; GCN-NEXT: v_writelane_b32 v2, s83, 7
; GCN: s_cbranch_scc1
@ -393,24 +393,25 @@ ret:
; into the next available VGPR.
; GCN-LABEL: {{^}}split_sgpr_spill_2_vgprs:
; GCN: def s[24:39]
; GCN: def s[4:19]
; GCN: def s[20:35]
; GCN: v_writelane_b32 v0, s24, 50
; GCN-NEXT: v_writelane_b32 v0, s25, 51
; GCN-NEXT: v_writelane_b32 v0, s26, 52
; GCN-NEXT: v_writelane_b32 v0, s27, 53
; GCN-NEXT: v_writelane_b32 v0, s28, 54
; GCN-NEXT: v_writelane_b32 v0, s29, 55
; GCN-NEXT: v_writelane_b32 v0, s30, 56
; GCN-NEXT: v_writelane_b32 v0, s31, 57
; GCN-NEXT: v_writelane_b32 v0, s32, 58
; GCN-NEXT: v_writelane_b32 v0, s33, 59
; GCN-NEXT: v_writelane_b32 v0, s34, 60
; GCN-NEXT: v_writelane_b32 v0, s35, 61
; GCN-NEXT: v_writelane_b32 v0, s36, 62
; GCN-NEXT: v_writelane_b32 v0, s37, 63
; GCN-NEXT: v_writelane_b32 v1, s38, 0
; GCN-NEXT: v_writelane_b32 v1, s39, 1
; GCN: v_writelane_b32 v0, s4, 50
; GCN-NEXT: v_writelane_b32 v0, s5, 51
; GCN-NEXT: v_writelane_b32 v0, s6, 52
; GCN-NEXT: v_writelane_b32 v0, s7, 53
; GCN-NEXT: v_writelane_b32 v0, s8, 54
; GCN-NEXT: v_writelane_b32 v0, s9, 55
; GCN-NEXT: v_writelane_b32 v0, s10, 56
; GCN-NEXT: v_writelane_b32 v0, s11, 57
; GCN-NEXT: v_writelane_b32 v0, s12, 58
; GCN-NEXT: v_writelane_b32 v0, s13, 59
; GCN-NEXT: v_writelane_b32 v0, s14, 60
; GCN-NEXT: v_writelane_b32 v0, s15, 61
; GCN-NEXT: v_writelane_b32 v0, s16, 62
; GCN-NEXT: v_writelane_b32 v0, s17, 63
; GCN-NEXT: v_writelane_b32 v1, s18, 0
; GCN-NEXT: v_writelane_b32 v1, s19, 1
; GCN: v_readlane_b32 s4, v0, 50
; GCN-NEXT: v_readlane_b32 s5, v0, 51

View File

@ -40,8 +40,7 @@ define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)*
; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
; GCN: s_load_dword s
; GCN-NEXT: s_load_dword s
; GCN-NEXT: s_load_dword s
; GCN-NEXT: s_load_dwordx2 s
; GCN-NOT: {{buffer|flat|global}}
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}

View File

@ -1,4 +1,4 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
; GCN-LABEL: {{^}}v_sad_u32_pat1:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
@ -203,8 +203,11 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i1
}
; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) {
; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out) {
%a = load volatile i16, i16 addrspace(1)* undef
%b = load volatile i16, i16 addrspace(1)* undef
%c = load volatile i16, i16 addrspace(1)* undef
%icmp0 = icmp ugt i16 %a, %b
%sub0 = sub i16 %a, %b
%sub1 = sub i16 %b, %a
@ -233,8 +236,31 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b
}
; GCN-LABEL: {{^}}v_sad_u32_i8_pat2:
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out) {
%a = load volatile i8, i8 addrspace(1)* undef
%b = load volatile i8, i8 addrspace(1)* undef
%c = load volatile i8, i8 addrspace(1)* undef
%icmp0 = icmp ugt i8 %a, %b
%sub0 = sub i8 %a, %b
%sub1 = sub i8 %b, %a
%ret0 = select i1 %icmp0, i8 %sub0, i8 %sub1
%ret = add i8 %ret0, %c
store i8 %ret, i8 addrspace(1)* %out
ret void
}
; GCN-LABEL: {{^}}s_sad_u32_i8_pat2:
; GCN: s_load_dword
; GCN: s_bfe_u32
; GCN: s_sub_i32
; GCN: s_and_b32
; GCN: s_sub_i32
; GCN: s_lshr_b32
; GCN: v_add_i32_e32
define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
%icmp0 = icmp ugt i8 %a, %b
%sub0 = sub i8 %a, %b
%sub1 = sub i8 %b, %a

View File

@ -2,14 +2,11 @@
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
; FUNC-LABEL: {{^}}cluster_arg_loads:
; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
; SI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
; VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
store i32 %x, i32 addrspace(1)* %out0, align 4
store i32 %y, i32 addrspace(1)* %out1, align 4
@ -42,7 +39,7 @@ define amdgpu_kernel void @same_base_ptr_crash(i64 addrspace(1)* %out,
i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119,
i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) {
entry:
%value = add i64 %arg125, %arg126
%value = add i64 %arg124, %arg126
store i64 %value, i64 addrspace(1)* %out, align 8
ret void
}

View File

@ -1,10 +1,13 @@
; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MINREG %s
; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MAXOCC %s
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MINREG %s
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MAXOCC %s
; SI: NumSgprs: {{[1-9]$}}
; SI: NumVgprs: {{[1-9]$}}
; SI-MINREG: NumSgprs: {{[1-9]$}}
; SI-MINREG: NumVgprs: {{[1-9]$}}
; SI-MAXOCC: NumSgprs: {{[0-4][0-9]$}}
; SI-MAXOCC: NumVgprs: {{[0-4][0-9]$}}
; stores may alias loads
; VI: NumSgprs: {{[0-9]$}}

View File

@ -1,5 +1,5 @@
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; FIXME: This should go in existing select.ll test, except the current testcase there is broken on GCN
@ -18,12 +18,12 @@ define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1
; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8
; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16
; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]]
; GCN-DAG: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
; GCN-DAG: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
; GCN: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1
; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]]
; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]]
define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, [8 x i32], i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
%cmp = icmp slt i1 %cond, false
%sel = select i1 %cmp, i1 %a, i1 %b
store i1 %sel, i1 addrspace(1)* %out, align 4

Some files were not shown because too many files have changed in this diff Show More