forked from OSchip/llvm-project
AMDGPU: Add pass to lower kernel arguments to loads
This replaces most argument uses with loads, but for now not all. The code in SelectionDAG for calling convention lowering is actively harmful for amdgpu_kernel. It attempts to split the argument types into register legal types, which results in low quality code for arbitrary types. Since all kernel arguments are passed in memory, we just want the raw types. I've tried a couple of methods of mitigating this in SelectionDAG, but it's easier to just bypass this problem altogether. It's possible to hack around the problem in the initial lowering, but the real problem is the DAG then expects to be able to use CopyToReg/CopyFromReg for uses of the arguments outside the block. Exposing the argument loads in the IR also has the advantage that the LoadStoreVectorizer can merge them. I'm not sure what the best approach to dealing with the IR argument list is. The patch as-is just leaves the IR arguments in place, so all the existing code will still compute the same kernarg size and pointlessly lower the arguments. Arguably the frontend should emit kernels with an empty argument list in the first place. Alternatively a dummy array could be inserted as a single argument just to reserve space. This does have some disadvantages. Local pointer kernel arguments can no longer have AssertZext placed on them as the equivalent !range metadata is not valid on pointer typed loads. This is mostly bad for SI which needs to know about the known bits in order to use the DS instruction offset, so in this case this is not done. More importantly, this skips noalias arguments since this pass does not yet convert this to the equivalent !alias.scope and !noalias metadata. Producing this metadata correctly seems to be tricky, although this logically is the same as inlining into a function which doesn't exist. Additionally, exposing these loads to the vectorizer may result in degraded aliasing information if a pointer load is merged with another argument load. 
I'm also not entirely sure this is preserving the current clover ABI, although I would greatly prefer if it would stop widening arguments and match the HSA ABI. As-is I think it is extending < 4-byte arguments to 4-bytes but doesn't align them to 4-bytes. llvm-svn: 335650
This commit is contained in:
parent
7e991d30c0
commit
8c4a35237a
|
@ -73,6 +73,10 @@ ModulePass *createAMDGPULowerIntrinsicsPass();
|
||||||
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
|
void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
|
||||||
extern char &AMDGPULowerIntrinsicsID;
|
extern char &AMDGPULowerIntrinsicsID;
|
||||||
|
|
||||||
|
FunctionPass *createAMDGPULowerKernelArgumentsPass();
|
||||||
|
void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &);
|
||||||
|
extern char &AMDGPULowerKernelArgumentsID;
|
||||||
|
|
||||||
ModulePass *createAMDGPULowerKernelAttributesPass();
|
ModulePass *createAMDGPULowerKernelAttributesPass();
|
||||||
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
|
void initializeAMDGPULowerKernelAttributesPass(PassRegistry &);
|
||||||
extern char &AMDGPULowerKernelAttributesID;
|
extern char &AMDGPULowerKernelAttributesID;
|
||||||
|
|
|
@ -0,0 +1,267 @@
|
||||||
|
//===-- AMDGPULowerKernelArguments.cpp ------------------------------------------===//
|
||||||
|
//
|
||||||
|
// The LLVM Compiler Infrastructure
|
||||||
|
//
|
||||||
|
// This file is distributed under the University of Illinois Open Source
|
||||||
|
// License. See LICENSE.TXT for details.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
//
|
||||||
|
/// \file This pass replaces accesses to kernel arguments with loads from
|
||||||
|
/// offsets from the kernarg base pointer.
|
||||||
|
//
|
||||||
|
//===----------------------------------------------------------------------===//
|
||||||
|
|
||||||
|
#include "AMDGPU.h"
|
||||||
|
#include "AMDGPUSubtarget.h"
|
||||||
|
#include "AMDGPUTargetMachine.h"
|
||||||
|
#include "llvm/ADT/StringRef.h"
|
||||||
|
#include "llvm/Analysis/DivergenceAnalysis.h"
|
||||||
|
#include "llvm/Analysis/Loads.h"
|
||||||
|
#include "llvm/CodeGen/Passes.h"
|
||||||
|
#include "llvm/CodeGen/TargetPassConfig.h"
|
||||||
|
#include "llvm/IR/Attributes.h"
|
||||||
|
#include "llvm/IR/BasicBlock.h"
|
||||||
|
#include "llvm/IR/Constants.h"
|
||||||
|
#include "llvm/IR/DerivedTypes.h"
|
||||||
|
#include "llvm/IR/Function.h"
|
||||||
|
#include "llvm/IR/IRBuilder.h"
|
||||||
|
#include "llvm/IR/InstrTypes.h"
|
||||||
|
#include "llvm/IR/Instruction.h"
|
||||||
|
#include "llvm/IR/Instructions.h"
|
||||||
|
#include "llvm/IR/LLVMContext.h"
|
||||||
|
#include "llvm/IR/MDBuilder.h"
|
||||||
|
#include "llvm/IR/Metadata.h"
|
||||||
|
#include "llvm/IR/Operator.h"
|
||||||
|
#include "llvm/IR/Type.h"
|
||||||
|
#include "llvm/IR/Value.h"
|
||||||
|
#include "llvm/Pass.h"
|
||||||
|
#include "llvm/Support/Casting.h"
|
||||||
|
|
||||||
|
#define DEBUG_TYPE "amdgpu-lower-kernel-arguments"
|
||||||
|
|
||||||
|
using namespace llvm;
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
class AMDGPULowerKernelArguments : public FunctionPass{
|
||||||
|
public:
|
||||||
|
static char ID;
|
||||||
|
|
||||||
|
AMDGPULowerKernelArguments() : FunctionPass(ID) {}
|
||||||
|
|
||||||
|
bool runOnFunction(Function &F) override;
|
||||||
|
|
||||||
|
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||||
|
AU.addRequired<TargetPassConfig>();
|
||||||
|
AU.setPreservesAll();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
} // end anonymous namespace
|
||||||
|
|
||||||
|
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
|
||||||
|
CallingConv::ID CC = F.getCallingConv();
|
||||||
|
if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
|
||||||
|
return false;
|
||||||
|
|
||||||
|
auto &TPC = getAnalysis<TargetPassConfig>();
|
||||||
|
|
||||||
|
const TargetMachine &TM = TPC.getTM<TargetMachine>();
|
||||||
|
const SISubtarget &ST = TM.getSubtarget<SISubtarget>(F);
|
||||||
|
LLVMContext &Ctx = F.getParent()->getContext();
|
||||||
|
const DataLayout &DL = F.getParent()->getDataLayout();
|
||||||
|
BasicBlock &EntryBlock = *F.begin();
|
||||||
|
IRBuilder<> Builder(&*EntryBlock.begin());
|
||||||
|
|
||||||
|
SmallVector<Type *, 16> ArgTypes;
|
||||||
|
for (Argument &Arg : F.args()) {
|
||||||
|
Type *ArgTy = Arg.getType();
|
||||||
|
unsigned Size = DL.getTypeStoreSizeInBits(ArgTy);
|
||||||
|
bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
|
||||||
|
!ST.isAmdHsaOS();
|
||||||
|
|
||||||
|
// Clover seems to always pad i8/i16 to i32, but doesn't properly align
|
||||||
|
// them?
|
||||||
|
// Make sure the struct elements have correct size and alignment for ext
|
||||||
|
// args. These seem to be padded up to 4-bytes but not correctly aligned.
|
||||||
|
ArgTypes.push_back(
|
||||||
|
IsExtArg ? ArrayType::get(ArgTy, 32 / Size) : Arg.getType());
|
||||||
|
}
|
||||||
|
|
||||||
|
StructType *ArgStructTy = StructType::create(Ctx, ArgTypes, F.getName());
|
||||||
|
const StructLayout *Layout = DL.getStructLayout(ArgStructTy);
|
||||||
|
|
||||||
|
// Minimum alignment for kern segment is 16.
|
||||||
|
unsigned KernArgBaseAlign = std::max(16u, DL.getABITypeAlignment(ArgStructTy));
|
||||||
|
const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
|
||||||
|
|
||||||
|
// FIXME: Alignment is broken with explicit arg offset.
|
||||||
|
const uint64_t TotalKernArgSize = BaseOffset +
|
||||||
|
ST.getKernArgSegmentSize(F, DL.getTypeAllocSize(ArgStructTy));
|
||||||
|
|
||||||
|
CallInst *KernArgSegment =
|
||||||
|
Builder.CreateIntrinsic(Intrinsic::amdgcn_kernarg_segment_ptr, nullptr,
|
||||||
|
F.getName() + ".kernarg.segment");
|
||||||
|
|
||||||
|
KernArgSegment->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
|
||||||
|
KernArgSegment->addAttribute(AttributeList::ReturnIndex,
|
||||||
|
Attribute::getWithDereferenceableBytes(Ctx, TotalKernArgSize));
|
||||||
|
KernArgSegment->addAttribute(AttributeList::ReturnIndex,
|
||||||
|
Attribute::getWithAlignment(Ctx, KernArgBaseAlign));
|
||||||
|
|
||||||
|
Value *KernArgBase = KernArgSegment;
|
||||||
|
if (BaseOffset != 0) {
|
||||||
|
KernArgBase = Builder.CreateConstInBoundsGEP1_64(KernArgBase, BaseOffset);
|
||||||
|
KernArgBaseAlign = MinAlign(KernArgBaseAlign, BaseOffset);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned AS = KernArgSegment->getType()->getPointerAddressSpace();
|
||||||
|
Value *CastStruct = Builder.CreateBitCast(KernArgBase,
|
||||||
|
ArgStructTy->getPointerTo(AS));
|
||||||
|
for (Argument &Arg : F.args()) {
|
||||||
|
if (Arg.use_empty())
|
||||||
|
continue;
|
||||||
|
|
||||||
|
Type *ArgTy = Arg.getType();
|
||||||
|
if (PointerType *PT = dyn_cast<PointerType>(ArgTy)) {
|
||||||
|
// FIXME: Hack. We rely on AssertZext to be able to fold DS addressing
|
||||||
|
// modes on SI to know the high bits are 0 so pointer adds don't wrap. We
|
||||||
|
// can't represent this with range metadata because it's only allowed for
|
||||||
|
// integer types.
|
||||||
|
if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
|
||||||
|
ST.getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// FIXME: We can replace this with equivalent alias.scope/noalias
|
||||||
|
// metadata, but this appears to be a lot of work.
|
||||||
|
if (Arg.hasNoAliasAttr())
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
VectorType *VT = dyn_cast<VectorType>(ArgTy);
|
||||||
|
bool IsV3 = VT && VT->getNumElements() == 3;
|
||||||
|
VectorType *V4Ty = nullptr;
|
||||||
|
|
||||||
|
unsigned Size = DL.getTypeSizeInBits(ArgTy);
|
||||||
|
bool IsExtArg = Size < 32 && (Arg.hasZExtAttr() || Arg.hasSExtAttr()) &&
|
||||||
|
!ST.isAmdHsaOS();
|
||||||
|
int64_t EltOffset = Layout->getElementOffset(Arg.getArgNo());
|
||||||
|
int64_t AlignDownOffset = alignDown(EltOffset, 4);
|
||||||
|
int64_t OffsetDiff = EltOffset - AlignDownOffset;
|
||||||
|
unsigned AdjustedAlign = MinAlign(KernArgBaseAlign, AlignDownOffset);
|
||||||
|
|
||||||
|
Value *ArgPtr;
|
||||||
|
if (Size < 32) {
|
||||||
|
// Since we don't have sub-dword scalar loads, avoid doing an extload by
|
||||||
|
// loading earlier than the argument address, and extracting the relevant
|
||||||
|
// bits.
|
||||||
|
//
|
||||||
|
// Additionally widen any sub-dword load to i32 even if suitably aligned,
|
||||||
|
// so that CSE between different argument loads works easily.
|
||||||
|
|
||||||
|
ArgPtr = Builder.CreateConstGEP1_64(KernArgBase, AlignDownOffset);
|
||||||
|
ArgPtr = Builder.CreateBitCast(
|
||||||
|
ArgPtr,
|
||||||
|
Builder.getInt32Ty()->getPointerTo(AS),
|
||||||
|
Arg.getName() + ".kernarg.offset.align.down");
|
||||||
|
} else {
|
||||||
|
ArgPtr = Builder.CreateStructGEP(CastStruct, Arg.getArgNo(),
|
||||||
|
Arg.getName() + ".kernarg.offset");
|
||||||
|
}
|
||||||
|
|
||||||
|
assert((!IsExtArg || !IsV3) && "incompatible situation");
|
||||||
|
|
||||||
|
|
||||||
|
if (IsV3 && Size >= 32) {
|
||||||
|
V4Ty = VectorType::get(VT->getVectorElementType(), 4);
|
||||||
|
// Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
|
||||||
|
ArgPtr = Builder.CreateBitCast(ArgPtr, V4Ty->getPointerTo(AS));
|
||||||
|
}
|
||||||
|
|
||||||
|
LoadInst *Load = Builder.CreateAlignedLoad(ArgPtr, AdjustedAlign);
|
||||||
|
Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
|
||||||
|
|
||||||
|
MDBuilder MDB(Ctx);
|
||||||
|
|
||||||
|
if (isa<PointerType>(ArgTy)) {
|
||||||
|
if (Arg.hasNonNullAttr())
|
||||||
|
Load->setMetadata(LLVMContext::MD_nonnull, MDNode::get(Ctx, {}));
|
||||||
|
|
||||||
|
uint64_t DerefBytes = Arg.getDereferenceableBytes();
|
||||||
|
if (DerefBytes != 0) {
|
||||||
|
Load->setMetadata(
|
||||||
|
LLVMContext::MD_dereferenceable,
|
||||||
|
MDNode::get(Ctx,
|
||||||
|
MDB.createConstant(
|
||||||
|
ConstantInt::get(Builder.getInt64Ty(), DerefBytes))));
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t DerefOrNullBytes = Arg.getDereferenceableOrNullBytes();
|
||||||
|
if (DerefOrNullBytes != 0) {
|
||||||
|
Load->setMetadata(
|
||||||
|
LLVMContext::MD_dereferenceable_or_null,
|
||||||
|
MDNode::get(Ctx,
|
||||||
|
MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
|
||||||
|
DerefOrNullBytes))));
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned ParamAlign = Arg.getParamAlignment();
|
||||||
|
if (ParamAlign != 0) {
|
||||||
|
Load->setMetadata(
|
||||||
|
LLVMContext::MD_align,
|
||||||
|
MDNode::get(Ctx,
|
||||||
|
MDB.createConstant(ConstantInt::get(Builder.getInt64Ty(),
|
||||||
|
ParamAlign))));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: Convert noalias arg to !noalias
|
||||||
|
|
||||||
|
if (Size < 32) {
|
||||||
|
if (IsExtArg && OffsetDiff == 0) {
|
||||||
|
Type *I32Ty = Builder.getInt32Ty();
|
||||||
|
bool IsSext = Arg.hasSExtAttr();
|
||||||
|
Metadata *LowAndHigh[] = {
|
||||||
|
ConstantAsMetadata::get(
|
||||||
|
ConstantInt::get(I32Ty, IsSext ? minIntN(Size) : 0)),
|
||||||
|
ConstantAsMetadata::get(
|
||||||
|
ConstantInt::get(I32Ty,
|
||||||
|
IsSext ? maxIntN(Size) + 1 : maxUIntN(Size) + 1))
|
||||||
|
};
|
||||||
|
|
||||||
|
Load->setMetadata(LLVMContext::MD_range, MDNode::get(Ctx, LowAndHigh));
|
||||||
|
}
|
||||||
|
|
||||||
|
Value *ExtractBits = OffsetDiff == 0 ?
|
||||||
|
Load : Builder.CreateLShr(Load, OffsetDiff * 8);
|
||||||
|
|
||||||
|
IntegerType *ArgIntTy = Builder.getIntNTy(Size);
|
||||||
|
Value *Trunc = Builder.CreateTrunc(ExtractBits, ArgIntTy);
|
||||||
|
Value *NewVal = Builder.CreateBitCast(Trunc, ArgTy,
|
||||||
|
Arg.getName() + ".load");
|
||||||
|
Arg.replaceAllUsesWith(NewVal);
|
||||||
|
} else if (IsV3) {
|
||||||
|
Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
|
||||||
|
{0, 1, 2},
|
||||||
|
Arg.getName() + ".load");
|
||||||
|
Arg.replaceAllUsesWith(Shuf);
|
||||||
|
} else {
|
||||||
|
Load->setName(Arg.getName() + ".load");
|
||||||
|
Arg.replaceAllUsesWith(Load);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
INITIALIZE_PASS_BEGIN(AMDGPULowerKernelArguments, DEBUG_TYPE,
|
||||||
|
"AMDGPU Lower Kernel Arguments", false, false)
|
||||||
|
INITIALIZE_PASS_END(AMDGPULowerKernelArguments, DEBUG_TYPE, "AMDGPU Lower Kernel Arguments",
|
||||||
|
false, false)
|
||||||
|
|
||||||
|
char AMDGPULowerKernelArguments::ID = 0;
|
||||||
|
|
||||||
|
FunctionPass *llvm::createAMDGPULowerKernelArgumentsPass() {
|
||||||
|
return new AMDGPULowerKernelArguments();
|
||||||
|
}
|
|
@ -130,6 +130,12 @@ static cl::opt<bool> EnableLibCallSimplify(
|
||||||
cl::init(true),
|
cl::init(true),
|
||||||
cl::Hidden);
|
cl::Hidden);
|
||||||
|
|
||||||
|
static cl::opt<bool> EnableLowerKernelArguments(
|
||||||
|
"amdgpu-ir-lower-kernel-arguments",
|
||||||
|
cl::desc("Lower kernel argument loads in IR pass"),
|
||||||
|
cl::init(true),
|
||||||
|
cl::Hidden);
|
||||||
|
|
||||||
extern "C" void LLVMInitializeAMDGPUTarget() {
|
extern "C" void LLVMInitializeAMDGPUTarget() {
|
||||||
// Register the target
|
// Register the target
|
||||||
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
|
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
|
||||||
|
@ -155,6 +161,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
|
||||||
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
|
initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
|
||||||
initializeAMDGPUAnnotateUniformValuesPass(*PR);
|
initializeAMDGPUAnnotateUniformValuesPass(*PR);
|
||||||
initializeAMDGPUArgumentUsageInfoPass(*PR);
|
initializeAMDGPUArgumentUsageInfoPass(*PR);
|
||||||
|
initializeAMDGPULowerKernelArgumentsPass(*PR);
|
||||||
initializeAMDGPULowerKernelAttributesPass(*PR);
|
initializeAMDGPULowerKernelAttributesPass(*PR);
|
||||||
initializeAMDGPULowerIntrinsicsPass(*PR);
|
initializeAMDGPULowerIntrinsicsPass(*PR);
|
||||||
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
|
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
|
||||||
|
@ -669,6 +676,10 @@ void AMDGPUPassConfig::addIRPasses() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void AMDGPUPassConfig::addCodeGenPrepare() {
|
void AMDGPUPassConfig::addCodeGenPrepare() {
|
||||||
|
if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
|
||||||
|
EnableLowerKernelArguments)
|
||||||
|
addPass(createAMDGPULowerKernelArgumentsPass());
|
||||||
|
|
||||||
TargetPassConfig::addCodeGenPrepare();
|
TargetPassConfig::addCodeGenPrepare();
|
||||||
|
|
||||||
if (EnableLoadStoreVectorizer)
|
if (EnableLoadStoreVectorizer)
|
||||||
|
|
|
@ -40,6 +40,7 @@ add_llvm_target(AMDGPUCodeGen
|
||||||
AMDGPULibCalls.cpp
|
AMDGPULibCalls.cpp
|
||||||
AMDGPULibFunc.cpp
|
AMDGPULibFunc.cpp
|
||||||
AMDGPULowerIntrinsics.cpp
|
AMDGPULowerIntrinsics.cpp
|
||||||
|
AMDGPULowerKernelArguments.cpp
|
||||||
AMDGPULowerKernelAttributes.cpp
|
AMDGPULowerKernelAttributes.cpp
|
||||||
AMDGPUMachineCFGStructurizer.cpp
|
AMDGPUMachineCFGStructurizer.cpp
|
||||||
AMDGPUMachineFunction.cpp
|
AMDGPUMachineFunction.cpp
|
||||||
|
|
|
@ -9,11 +9,11 @@
|
||||||
; GCN-LABEL: {{^}}smrd0:
|
; GCN-LABEL: {{^}}smrd0:
|
||||||
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
|
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
|
||||||
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
|
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
|
||||||
define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
|
define amdgpu_kernel void @smrd0(i32 addrspace(4)* %ptr) {
|
||||||
entry:
|
entry:
|
||||||
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 1
|
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 1
|
||||||
%1 = load i32, i32 addrspace(4)* %0
|
%1 = load i32, i32 addrspace(4)* %0
|
||||||
store i32 %1, i32 addrspace(1)* %out
|
store i32 %1, i32 addrspace(1)* undef
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -21,11 +21,11 @@ entry:
|
||||||
; GCN-LABEL: {{^}}smrd1:
|
; GCN-LABEL: {{^}}smrd1:
|
||||||
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
|
; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
|
||||||
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
|
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
|
||||||
define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
|
define amdgpu_kernel void @smrd1(i32 addrspace(4)* %ptr) {
|
||||||
entry:
|
entry:
|
||||||
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 255
|
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 255
|
||||||
%1 = load i32, i32 addrspace(4)* %0
|
%1 = load i32, i32 addrspace(4)* %0
|
||||||
store i32 %1, i32 addrspace(1)* %out
|
store i32 %1, i32 addrspace(1)* undef
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -36,11 +36,11 @@ entry:
|
||||||
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
|
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
|
||||||
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
|
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
|
define amdgpu_kernel void @smrd2(i32 addrspace(4)* %ptr) {
|
||||||
entry:
|
entry:
|
||||||
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 256
|
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 256
|
||||||
%1 = load i32, i32 addrspace(4)* %0
|
%1 = load i32, i32 addrspace(4)* %0
|
||||||
store i32 %1, i32 addrspace(1)* %out
|
store i32 %1, i32 addrspace(1)* undef
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -51,11 +51,11 @@ entry:
|
||||||
; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
|
; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
|
||||||
; TODO: Add VI checks
|
; TODO: Add VI checks
|
||||||
; XGCN: s_endpgm
|
; XGCN: s_endpgm
|
||||||
define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
|
define amdgpu_kernel void @smrd3(i32 addrspace(4)* %ptr) {
|
||||||
entry:
|
entry:
|
||||||
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 ; 2 ^ 32
|
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 4294967296 ; 2 ^ 32
|
||||||
%1 = load i32, i32 addrspace(4)* %0
|
%1 = load i32, i32 addrspace(4)* %0
|
||||||
store i32 %1, i32 addrspace(1)* %out
|
store i32 %1, i32 addrspace(1)* undef
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -65,11 +65,11 @@ entry:
|
||||||
; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
|
; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
|
||||||
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
|
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
|
||||||
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
|
; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
|
||||||
define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
|
define amdgpu_kernel void @smrd4(i32 addrspace(4)* %ptr) {
|
||||||
entry:
|
entry:
|
||||||
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143
|
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262143
|
||||||
%1 = load i32, i32 addrspace(4)* %0
|
%1 = load i32, i32 addrspace(4)* %0
|
||||||
store i32 %1, i32 addrspace(1)* %out
|
store i32 %1, i32 addrspace(1)* undef
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -79,11 +79,11 @@ entry:
|
||||||
; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
|
; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
|
||||||
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
|
; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(4)* %ptr) {
|
define amdgpu_kernel void @smrd5(i32 addrspace(4)* %ptr) {
|
||||||
entry:
|
entry:
|
||||||
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144
|
%0 = getelementptr i32, i32 addrspace(4)* %ptr, i64 262144
|
||||||
%1 = load i32, i32 addrspace(4)* %0
|
%1 = load i32, i32 addrspace(4)* %0
|
||||||
store i32 %1, i32 addrspace(1)* %out
|
store i32 %1, i32 addrspace(1)* undef
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -76,7 +76,7 @@ define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out,
|
||||||
; SI-NOT: addc
|
; SI-NOT: addc
|
||||||
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
|
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
|
||||||
; SI: buffer_store_dword [[VRESULT]],
|
; SI: buffer_store_dword [[VRESULT]],
|
||||||
define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
|
define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i32, i64 %a, i32, i64 %b) {
|
||||||
%add = add i64 %b, %a
|
%add = add i64 %b, %a
|
||||||
%trunc = trunc i64 %add to i32
|
%trunc = trunc i64 %add to i32
|
||||||
store i32 %trunc, i32 addrspace(1)* %out, align 8
|
store i32 %trunc, i32 addrspace(1)* %out, align 8
|
||||||
|
|
|
@ -3,7 +3,7 @@
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
|
; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
|
||||||
; TRAP-HANDLER-ENABLE: NumSgprs: 60
|
; TRAP-HANDLER-ENABLE: NumSgprs: 60
|
||||||
; TRAP-HANDLER-DISABLE: NumSgprs: 76
|
; TRAP-HANDLER-DISABLE: NumSgprs: 78
|
||||||
define amdgpu_kernel void @amdhsa_trap_num_sgprs(
|
define amdgpu_kernel void @amdhsa_trap_num_sgprs(
|
||||||
i32 addrspace(1)* %out0, i32 %in0,
|
i32 addrspace(1)* %out0, i32 %in0,
|
||||||
i32 addrspace(1)* %out1, i32 %in1,
|
i32 addrspace(1)* %out1, i32 %in1,
|
||||||
|
|
|
@ -217,7 +217,7 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out,
|
||||||
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
|
; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
|
||||||
; SI-NOT: and
|
; SI-NOT: and
|
||||||
; SI: buffer_store_dwordx2
|
; SI: buffer_store_dwordx2
|
||||||
define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
|
define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i32, i64 %a) {
|
||||||
%and = and i64 %a, 1234567
|
%and = and i64 %a, 1234567
|
||||||
store i64 %and, i64 addrspace(1)* %out, align 8
|
store i64 %and, i64 addrspace(1)* %out, align 8
|
||||||
ret void
|
ret void
|
||||||
|
@ -235,7 +235,7 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64
|
||||||
; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
|
; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
|
||||||
; SI-NOT: and
|
; SI-NOT: and
|
||||||
; SI: buffer_store_dwordx2
|
; SI: buffer_store_dwordx2
|
||||||
define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
|
define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) {
|
||||||
%shl.a = shl i64 %a, 1
|
%shl.a = shl i64 %a, 1
|
||||||
%shl.b = shl i64 %b, 1
|
%shl.b = shl i64 %b, 1
|
||||||
%and0 = and i64 %shl.a, 62
|
%and0 = and i64 %shl.a, 62
|
||||||
|
@ -381,7 +381,7 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 a
|
||||||
; SI-NOT: and
|
; SI-NOT: and
|
||||||
; SI: s_add_u32
|
; SI: s_add_u32
|
||||||
; SI-NEXT: s_addc_u32
|
; SI-NEXT: s_addc_u32
|
||||||
define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
|
define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i32, i64 %b) {
|
||||||
%shl = shl i64 %a, 1
|
%shl = shl i64 %a, 1
|
||||||
%and = and i64 %shl, 64
|
%and = and i64 %shl, 64
|
||||||
%add = add i64 %and, %b
|
%add = add i64 %and, %b
|
||||||
|
|
|
@ -12,25 +12,17 @@
|
||||||
; CIVI: s_load_dword [[LHS:s[0-9]+]]
|
; CIVI: s_load_dword [[LHS:s[0-9]+]]
|
||||||
; CIVI: s_load_dword [[RHS:s[0-9]+]]
|
; CIVI: s_load_dword [[RHS:s[0-9]+]]
|
||||||
|
|
||||||
; VI: s_ashr_i32
|
; CIVI-DAG: s_ashr_i32
|
||||||
; VI: s_ashr_i32
|
; CIVI-DAG: s_ashr_i32
|
||||||
; VI: s_sext_i32_i16
|
; CIVI-DAG: s_sext_i32_i16
|
||||||
; VI: s_sext_i32_i16
|
; CIVI-DAG: s_sext_i32_i16
|
||||||
; VI: s_ashr_i32
|
; CIVI-DAG: s_ashr_i32
|
||||||
; VI: s_ashr_i32
|
; CIVI-DAG: s_ashr_i32
|
||||||
; VI: s_lshl_b32
|
; CIVI-DAG: s_lshl_b32
|
||||||
; VI: s_and_b32
|
; CIVI: s_and_b32
|
||||||
; VI: s_or_b32
|
; CIVI: s_or_b32
|
||||||
|
|
||||||
; CI: s_ashr_i32
|
define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, i32, <2 x i16> %lhs, i32, <2 x i16> %rhs) #0 {
|
||||||
; CI: s_and_b32
|
|
||||||
; CI: s_lshr_b32
|
|
||||||
; CI: s_sext_i32_i16
|
|
||||||
; CI: s_ashr_i32
|
|
||||||
; CI: s_ashr_i32
|
|
||||||
; CI: s_lshl_b32
|
|
||||||
; CI: s_and_b32
|
|
||||||
define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
|
|
||||||
%result = ashr <2 x i16> %lhs, %rhs
|
%result = ashr <2 x i16> %lhs, %rhs
|
||||||
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
|
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -7,16 +7,16 @@
|
||||||
; GFX9-NOT: m0
|
; GFX9-NOT: m0
|
||||||
; SICIVI-DAG: s_mov_b32 m0
|
; SICIVI-DAG: s_mov_b32 m0
|
||||||
|
|
||||||
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
|
; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
|
; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
|
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
|
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
|
||||||
; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
|
; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
|
define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, [8 x i32], i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind {
|
||||||
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
|
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
|
||||||
%pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
|
%pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
|
||||||
%result = extractvalue { i32, i1 } %pair, 0
|
%result = extractvalue { i32, i1 } %pair, 0
|
||||||
|
@ -70,15 +70,15 @@ define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspac
|
||||||
|
|
||||||
|
|
||||||
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
|
; SICI-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x9
|
||||||
; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xa
|
; SICI-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x12
|
||||||
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
|
; GFX89-DAG: s_load_dword [[PTR:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x24
|
||||||
; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
|
; GFX89-DAG: s_load_dword [[SWAP:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x48
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
|
; GCN-DAG: v_mov_b32_e32 [[VCMP:v[0-9]+]], 7
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
|
; GCN-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
|
; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
|
||||||
; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
|
; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
|
define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, [8 x i32], i32 %swap) nounwind {
|
||||||
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
|
%gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
|
||||||
%pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
|
%pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
|
||||||
%result = extractvalue { i32, i1 } %pair, 0
|
%result = extractvalue { i32, i1 } %pair, 0
|
||||||
|
|
|
@ -0,0 +1,33 @@
|
||||||
|
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s
|
||||||
|
|
||||||
|
; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0
|
||||||
|
|
||||||
|
; ALL-LABEL: {{^}}max_9_sgprs:
|
||||||
|
; ALL: SGPRBlocks: 1
|
||||||
|
; ALL: NumSGPRsForWavesPerEU: 9
|
||||||
|
define amdgpu_kernel void @max_9_sgprs() #0 {
|
||||||
|
%one = load volatile i32, i32 addrspace(4)* undef
|
||||||
|
%two = load volatile i32, i32 addrspace(4)* undef
|
||||||
|
%three = load volatile i32, i32 addrspace(4)* undef
|
||||||
|
%four = load volatile i32, i32 addrspace(4)* undef
|
||||||
|
%five = load volatile i32, i32 addrspace(4)* undef
|
||||||
|
%six = load volatile i32, i32 addrspace(4)* undef
|
||||||
|
%seven = load volatile i32, i32 addrspace(4)* undef
|
||||||
|
%eight = load volatile i32, i32 addrspace(4)* undef
|
||||||
|
%nine = load volatile i32, i32 addrspace(4)* undef
|
||||||
|
%ten = load volatile i32, i32 addrspace(4)* undef
|
||||||
|
call void asm sideeffect "", "s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight)
|
||||||
|
store volatile i32 %one, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %two, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %three, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %four, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %five, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %six, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %seven, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %eight, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %nine, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %ten, i32 addrspace(1)* undef
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
|
|
@ -1,25 +1,37 @@
|
||||||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s
|
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSGPR -check-prefix=ALL %s
|
||||||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -amdgpu-spill-sgpr-to-smem=1 -verify-machineinstrs < %s | FileCheck -check-prefix=TOSMEM -check-prefix=ALL %s
|
|
||||||
|
|
||||||
; If spilling to smem, additional registers are used for the resource
|
; If spilling to smem, additional registers are used for the resource
|
||||||
; descriptor.
|
; descriptor.
|
||||||
|
|
||||||
|
; FIXME: Vectorization can increase required SGPR count beyond limit.
|
||||||
|
; FIXME: SGPR-to-SMEM requires an additional SGPR always to scavenge m0
|
||||||
|
|
||||||
; ALL-LABEL: {{^}}max_9_sgprs:
|
; ALL-LABEL: {{^}}max_9_sgprs:
|
||||||
|
|
||||||
; ALL: SGPRBlocks: 1
|
; ALL: SGPRBlocks: 1
|
||||||
; ALL: NumSGPRsForWavesPerEU: 9
|
; ALL: NumSGPRsForWavesPerEU: 9
|
||||||
define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1,
|
define amdgpu_kernel void @max_9_sgprs() #0 {
|
||||||
|
%one = load volatile i32, i32 addrspace(4)* undef
|
||||||
i32 addrspace(1)* %out2,
|
%two = load volatile i32, i32 addrspace(4)* undef
|
||||||
i32 addrspace(1)* %out3,
|
%three = load volatile i32, i32 addrspace(4)* undef
|
||||||
i32 addrspace(1)* %out4,
|
%four = load volatile i32, i32 addrspace(4)* undef
|
||||||
i32 addrspace(1)* %out5,
|
%five = load volatile i32, i32 addrspace(4)* undef
|
||||||
i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 {
|
%six = load volatile i32, i32 addrspace(4)* undef
|
||||||
store i32 %one, i32 addrspace(1)* %out1
|
%seven = load volatile i32, i32 addrspace(4)* undef
|
||||||
store i32 %two, i32 addrspace(1)* %out2
|
%eight = load volatile i32, i32 addrspace(4)* undef
|
||||||
store i32 %three, i32 addrspace(1)* %out3
|
%nine = load volatile i32, i32 addrspace(4)* undef
|
||||||
store i32 %four, i32 addrspace(1)* %out4
|
%ten = load volatile i32, i32 addrspace(4)* undef
|
||||||
store i32 %five, i32 addrspace(1)* %out5
|
call void asm sideeffect "", "s,s,s,s,s,s,s,s,s"(i32 %one, i32 %two, i32 %three, i32 %four, i32 %five, i32 %six, i32 %seven, i32 %eight, i32 %nine)
|
||||||
|
store volatile i32 %one, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %two, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %three, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %four, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %five, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %six, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %seven, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %eight, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %nine, i32 addrspace(1)* undef
|
||||||
|
store volatile i32 %ten, i32 addrspace(1)* undef
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -29,7 +29,8 @@ end:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}test_brcc_i1:
|
; GCN-LABEL: {{^}}test_brcc_i1:
|
||||||
; GCN: s_load_dword [[VAL:s[0-9]+]]
|
; GCN: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; GCNNOOPT: s_and_b32 s{{[0-9]+}}, 1, [[VAL]]
|
; GCNNOOPT: s_mov_b32 [[ONE:s[0-9]+]], 1{{$}}
|
||||||
|
; GCNNOOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], [[ONE]]
|
||||||
; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1
|
; GCNOPT: s_and_b32 s{{[0-9]+}}, [[VAL]], 1
|
||||||
; GCN: s_cmp_eq_u32
|
; GCN: s_cmp_eq_u32
|
||||||
; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
|
; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
|
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_ubfe_sub_i32:
|
; GCN-LABEL: {{^}}v_ubfe_sub_i32:
|
||||||
; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
|
; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
|
||||||
|
@ -48,10 +48,9 @@ define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_ubfe_sub_i32:
|
; GCN-LABEL: {{^}}s_ubfe_sub_i32:
|
||||||
; GCN: s_load_dword [[SRC:s[0-9]+]]
|
; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
|
||||||
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
|
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]]
|
||||||
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
|
; GCN: v_bfe_u32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]]
|
||||||
; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
|
|
||||||
define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
|
define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
|
||||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
|
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
|
||||||
|
@ -63,11 +62,10 @@ define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32:
|
; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32:
|
||||||
; GCN: s_load_dword [[SRC:s[0-9]+]]
|
; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
|
||||||
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
|
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
|
||||||
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
|
; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]]
|
||||||
; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
|
; GCN: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
|
||||||
; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
|
|
||||||
define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
|
define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
|
||||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
|
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
|
||||||
|
@ -126,10 +124,9 @@ define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out,
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_sbfe_sub_i32:
|
; GCN-LABEL: {{^}}s_sbfe_sub_i32:
|
||||||
; GCN: s_load_dword [[SRC:s[0-9]+]]
|
; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
|
||||||
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
|
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], s[[WIDTH]]
|
||||||
; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]], [[WIDTH]]
|
; GCN: v_bfe_i32 v{{[0-9]+}}, s[[SRC]], 0, [[VWIDTH]]
|
||||||
; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
|
|
||||||
define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
|
define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
|
||||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
|
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
|
||||||
|
@ -141,11 +138,10 @@ define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32:
|
; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32:
|
||||||
; GCN: s_load_dword [[SRC:s[0-9]+]]
|
; GCN: s_load_dwordx2 s{{\[}}[[SRC:[0-9]+]]:[[WIDTH:[0-9]+]]{{\]}}, s[0:1], {{0xb|0x2c}}
|
||||||
; GCN: s_load_dword [[WIDTH:s[0-9]+]]
|
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, s[[WIDTH]]
|
||||||
; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
|
; GCN: s_lshl_b32 [[SHL:s[0-9]+]], s[[SRC]], [[SUB]]
|
||||||
; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
|
; GCN: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
|
||||||
; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
|
|
||||||
define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
|
define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 {
|
||||||
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
%id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
|
||||||
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
|
%out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s
|
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FUNC %s
|
||||||
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefixes=GCN,FUNC %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FUNC %s
|
||||||
; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck -check-prefixes=R600,FUNC %s
|
; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -enable-var-scope -check-prefixes=R600,FUNC %s
|
||||||
|
|
||||||
; BFI_INT Definition pattern from ISA docs
|
; BFI_INT Definition pattern from ISA docs
|
||||||
; (y & x) | (z & ~x)
|
; (y & x) | (z & ~x)
|
||||||
|
@ -119,10 +119,10 @@ entry:
|
||||||
; FUNC-LABEL: {{^}}s_bitselect_i64_pat_0:
|
; FUNC-LABEL: {{^}}s_bitselect_i64_pat_0:
|
||||||
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
||||||
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
||||||
; GCN: v_bfi_b32
|
|
||||||
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
||||||
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
|
||||||
; GCN: v_bfi_b32
|
; GCN-DAG: v_bfi_b32
|
||||||
|
; GCN-DAG: v_bfi_b32
|
||||||
define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
|
define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
|
||||||
%and0 = and i64 %a, %b
|
%and0 = and i64 %a, %b
|
||||||
%not.a = xor i64 %a, -1
|
%not.a = xor i64 %a, -1
|
||||||
|
@ -136,10 +136,10 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) {
|
||||||
; FUNC-LABEL: {{^}}s_bitselect_i64_pat_1:
|
; FUNC-LABEL: {{^}}s_bitselect_i64_pat_1:
|
||||||
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
||||||
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
||||||
; GCN-DAG: v_bfi_b32
|
|
||||||
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
|
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
|
||||||
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
|
||||||
; GCN: v_bfi_b32
|
; GCN-DAG: v_bfi_b32
|
||||||
|
; GCN-DAG: v_bfi_b32
|
||||||
define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
|
define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
|
||||||
%xor.0 = xor i64 %a, %mask
|
%xor.0 = xor i64 %a, %mask
|
||||||
%and = and i64 %xor.0, %b
|
%and = and i64 %xor.0, %b
|
||||||
|
@ -155,8 +155,8 @@ define amdgpu_kernel void @s_bitselect_i64_pat_1(i64 %a, i64 %b, i64 %mask) {
|
||||||
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
||||||
; GCN-DAG: v_bfi_b32
|
; GCN-DAG: v_bfi_b32
|
||||||
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
|
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
|
||||||
; GCN: v_mov_b32_e32 v{{[0-9]+}}, s
|
; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, s
|
||||||
; GCN: v_bfi_b32
|
; GCN-DAG: v_bfi_b32
|
||||||
define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
|
define amdgpu_kernel void @s_bitselect_i64_pat_2(i64 %a, i64 %b, i64 %mask) {
|
||||||
%xor.0 = xor i64 %a, %mask
|
%xor.0 = xor i64 %a, %mask
|
||||||
%and = and i64 %xor.0, %b
|
%and = and i64 %xor.0, %b
|
||||||
|
|
|
@ -11,22 +11,32 @@
|
||||||
; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
|
; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
|
||||||
; GCN: s_cbranch_vccnz
|
; GCN: s_cbranch_vccnz
|
||||||
|
|
||||||
; GCN: one{{$}}
|
; SI: one{{$}}
|
||||||
; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
|
; SI: v_cvt_f16_f32_e32 v[[CVT:[0-9]+]], v[[A_F32]]
|
||||||
; GCN: buffer_store_short
|
|
||||||
; GCN: s_endpgm
|
|
||||||
|
|
||||||
; GCN: two{{$}}
|
; SI: two{{$}}
|
||||||
; SI: v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
|
; SI: v_cvt_f16_f32_e32 v[[CVT]], v[[B_F32]]
|
||||||
; GCN: buffer_store_short v[[B_F16]]
|
|
||||||
; GCN: s_endpgm
|
; SI: one{{$}}
|
||||||
|
; SI: buffer_store_short v[[CVT]]
|
||||||
|
; SI: s_endpgm
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
; VI: one{{$}}
|
||||||
|
; VI: buffer_store_short v[[A_F16]]
|
||||||
|
; VI: s_endpgm
|
||||||
|
|
||||||
|
; VI: two{{$}}
|
||||||
|
; VI: buffer_store_short v[[B_F16]]
|
||||||
|
; VI: s_endpgm
|
||||||
define amdgpu_kernel void @br_cc_f16(
|
define amdgpu_kernel void @br_cc_f16(
|
||||||
half addrspace(1)* %r,
|
half addrspace(1)* %r,
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%fcmp = fcmp olt half %a.val, %b.val
|
%fcmp = fcmp olt half %a.val, %b.val
|
||||||
br i1 %fcmp, label %one, label %two
|
br i1 %fcmp, label %one, label %two
|
||||||
|
|
||||||
|
|
|
@ -490,7 +490,7 @@ ret:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}long_branch_hang:
|
; GCN-LABEL: {{^}}long_branch_hang:
|
||||||
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
|
; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
|
||||||
; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
|
; GCN: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
|
||||||
; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
|
; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
|
||||||
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
|
; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
|
||||||
|
|
||||||
|
|
|
@ -9,7 +9,7 @@
|
||||||
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
|
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
|
||||||
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
|
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
|
||||||
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
|
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
|
||||||
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6
|
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
|
||||||
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
|
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
|
||||||
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
|
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
|
||||||
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
|
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
|
||||||
|
@ -23,7 +23,7 @@
|
||||||
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
|
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
|
||||||
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
|
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
|
||||||
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
|
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
|
||||||
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 6
|
; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 8
|
||||||
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
|
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
|
||||||
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
|
; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
|
||||||
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
|
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
|
||||||
|
|
|
@ -123,7 +123,7 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}s_ctlz_i64:
|
; FUNC-LABEL: {{^}}s_ctlz_i64:
|
||||||
; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
|
||||||
; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
|
; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
|
||||||
; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
|
; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
|
||||||
; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
|
; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
|
||||||
|
@ -133,7 +133,7 @@ define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace
|
||||||
; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
|
; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
|
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
|
||||||
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
|
; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
|
||||||
define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
|
define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
|
||||||
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
|
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
|
||||||
store i64 %ctlz, i64 addrspace(1)* %out
|
store i64 %ctlz, i64 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -98,7 +98,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
|
; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64:
|
||||||
; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
; GCN: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
|
||||||
; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
|
; GCN-DAG: v_cmp_eq_u32_e64 vcc, s[[HI]], 0{{$}}
|
||||||
; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
|
; GCN-DAG: s_flbit_i32_b32 [[FFBH_LO:s[0-9]+]], s[[LO]]
|
||||||
; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
|
; GCN-DAG: s_add_i32 [[ADD:s[0-9]+]], [[FFBH_LO]], 32
|
||||||
|
@ -108,7 +108,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i
|
||||||
; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
|
; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
|
; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
|
||||||
; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
|
; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
|
||||||
define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
|
define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
|
||||||
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
|
%ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
|
||||||
store i64 %ctlz, i64 addrspace(1)* %out
|
store i64 %ctlz, i64 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -305,14 +305,14 @@ define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %o
|
||||||
; but there are some cases when the should be allowed.
|
; but there are some cases when the should be allowed.
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}ctpop_i32_in_br:
|
; FUNC-LABEL: {{^}}ctpop_i32_in_br:
|
||||||
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
|
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x16
|
||||||
; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
|
; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x58
|
||||||
; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
|
; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
|
||||||
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
|
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
|
||||||
; GCN: buffer_store_dword [[RESULT]],
|
; GCN: buffer_store_dword [[RESULT]],
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
; EG: BCNT_INT
|
; EG: BCNT_INT
|
||||||
define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
|
define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, [8 x i32], i32 %cond) {
|
||||||
entry:
|
entry:
|
||||||
%tmp0 = icmp eq i32 %cond, 0
|
%tmp0 = icmp eq i32 %cond, 0
|
||||||
br i1 %tmp0, label %if, label %else
|
br i1 %tmp0, label %if, label %else
|
||||||
|
|
|
@ -308,7 +308,9 @@ define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(i16 addrspace(1)* noalias %o
|
||||||
; FUNC-LABEL: {{^}}ctpop_i16_in_br:
|
; FUNC-LABEL: {{^}}ctpop_i16_in_br:
|
||||||
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
|
; SI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xd
|
||||||
; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
|
; VI: s_load_dword [[VAL:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x34
|
||||||
; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[VAL]]
|
|
||||||
|
; GCN: s_and_b32 [[CTPOP_ARG:s[0-9]+]], [[VAL]], 0xffff
|
||||||
|
; GCN: s_bcnt1_i32_b32 [[SRESULT:s[0-9]+]], [[CTPOP_ARG]]
|
||||||
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
|
; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], [[SRESULT]]
|
||||||
; GCN: buffer_store_short [[RESULT]],
|
; GCN: buffer_store_short [[RESULT]],
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
|
|
|
@ -13,13 +13,13 @@ declare i65 @llvm.ctpop.i65(i65) nounwind readnone
|
||||||
declare i128 @llvm.ctpop.i128(i128) nounwind readnone
|
declare i128 @llvm.ctpop.i128(i128) nounwind readnone
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}s_ctpop_i64:
|
; FUNC-LABEL: {{^}}s_ctpop_i64:
|
||||||
; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[SVAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
|
; GCN: s_bcnt1_i32_b64 [[SRESULT:s[0-9]+]], [[SVAL]]
|
||||||
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
|
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
|
||||||
; GCN: buffer_store_dword [[VRESULT]],
|
; GCN: buffer_store_dword [[VRESULT]],
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
|
define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, [8 x i32], i64 %val) nounwind {
|
||||||
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
|
%ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
|
||||||
%truncctpop = trunc i64 %ctpop to i32
|
%truncctpop = trunc i64 %ctpop to i32
|
||||||
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
|
store i32 %truncctpop, i32 addrspace(1)* %out, align 4
|
||||||
|
|
|
@ -58,11 +58,8 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extract_vector_elt_v3f16:
|
; GCN-LABEL: {{^}}extract_vector_elt_v3f16:
|
||||||
; SI: s_load_dword s
|
; GCN: s_load_dwordx2
|
||||||
; SI: s_load_dword s
|
; GCN: s_load_dwordx2
|
||||||
|
|
||||||
; GFX89: s_load_dwordx2
|
|
||||||
; GFX89: s_load_dwordx2
|
|
||||||
|
|
||||||
; GCN: buffer_store_short
|
; GCN: buffer_store_short
|
||||||
; GCN: buffer_store_short
|
; GCN: buffer_store_short
|
||||||
|
@ -78,8 +75,8 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3
|
||||||
; FIXME: Why sometimes vector shift?
|
; FIXME: Why sometimes vector shift?
|
||||||
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:
|
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dword s
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dwordx2 s
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dwordx2 s
|
||||||
|
|
||||||
; GFX89: s_load_dwordx2 s
|
; GFX89: s_load_dwordx2 s
|
||||||
; GFX89: s_load_dwordx2 s
|
; GFX89: s_load_dwordx2 s
|
||||||
|
@ -87,9 +84,7 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3
|
||||||
|
|
||||||
|
|
||||||
; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
|
; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
|
||||||
; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v
|
; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
|
||||||
|
|
||||||
; SI: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
|
|
||||||
|
|
||||||
; GCN: {{buffer|global}}_store_short
|
; GCN: {{buffer|global}}_store_short
|
||||||
define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
|
define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
|
||||||
|
|
|
@ -27,7 +27,7 @@ define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x
|
||||||
; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
|
; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
|
||||||
; GCN: buffer_store_short [[VELT1]]
|
; GCN: buffer_store_short [[VELT1]]
|
||||||
; GCN: ScratchSize: 0
|
; GCN: ScratchSize: 0
|
||||||
define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %idx) #0 {
|
define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
|
||||||
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
|
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
|
||||||
%elt = extractelement <2 x i16> %vec, i32 %idx
|
%elt = extractelement <2 x i16> %vec, i32 %idx
|
||||||
store i16 %elt, i16 addrspace(1)* %out, align 2
|
store i16 %elt, i16 addrspace(1)* %out, align 2
|
||||||
|
@ -58,12 +58,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extract_vector_elt_v3i16:
|
; GCN-LABEL: {{^}}extract_vector_elt_v3i16:
|
||||||
; SI: s_load_dword s
|
; GCN: s_load_dwordx2
|
||||||
; SI: s_load_dwordx2 s
|
; GCN: s_load_dwordx2
|
||||||
; SI: s_load_dword s
|
|
||||||
|
|
||||||
; GFX89: s_load_dwordx2
|
|
||||||
; GFX89: s_load_dwordx2
|
|
||||||
|
|
||||||
; GCN-NOT: {{buffer|flat|global}}_load
|
; GCN-NOT: {{buffer|flat|global}}_load
|
||||||
|
|
||||||
|
@ -79,8 +75,7 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
|
; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dwordx2
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: buffer_store_short
|
; SI: buffer_store_short
|
||||||
; SI: buffer_store_short
|
; SI: buffer_store_short
|
||||||
|
|
||||||
|
@ -100,12 +95,12 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
|
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dword s
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dwordx2 s
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dwordx2 s
|
||||||
|
|
||||||
; GFX89-DAG: s_load_dwordx2
|
; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x24
|
||||||
; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x2c
|
; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x4c
|
||||||
; GFX89-DAG: s_load_dword s
|
; GFX89-DAG: s_load_dword s{{[0-9]+}}, s[0:1], 0x54
|
||||||
|
|
||||||
; GCN-NOT: {{buffer|flat|global}}
|
; GCN-NOT: {{buffer|flat|global}}
|
||||||
|
|
||||||
|
@ -113,17 +108,13 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x
|
||||||
; SICI: buffer_store_short
|
; SICI: buffer_store_short
|
||||||
; SICI: buffer_store_short
|
; SICI: buffer_store_short
|
||||||
|
|
||||||
; SICI: buffer_load_ushort
|
|
||||||
; SICI: buffer_store_short
|
|
||||||
|
|
||||||
; GFX9-NOT: s_pack_ll_b32_b16
|
; GFX9-NOT: s_pack_ll_b32_b16
|
||||||
; GFX9-NOT: s_pack_lh_b32_b16
|
; GFX9-NOT: s_pack_lh_b32_b16
|
||||||
|
|
||||||
; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
|
; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
|
||||||
; GFX89: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LOAD0]]:[[LOAD1]]{{\]}}, s{{[0-9]+}}
|
; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s
|
||||||
|
|
||||||
; GCN: {{buffer|global}}_store_short
|
; GCN: {{buffer|global}}_store_short
|
||||||
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
|
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, [8 x i32], <3 x i16> %foo, i32 %idx) #0 {
|
||||||
%p0 = extractelement <3 x i16> %foo, i32 %idx
|
%p0 = extractelement <3 x i16> %foo, i32 %idx
|
||||||
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
|
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
|
||||||
store i16 %p0, i16 addrspace(1)* %out
|
store i16 %p0, i16 addrspace(1)* %out
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
|
||||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extract_vector_elt_v1i8:
|
; GCN-LABEL: {{^}}extract_vector_elt_v1i8:
|
||||||
; GCN: s_load_dword [[LOAD:s[0-9]+]]
|
; GCN: s_load_dword [[LOAD:s[0-9]+]]
|
||||||
|
@ -14,7 +14,8 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i
|
||||||
; GCN-LABEL: {{^}}extract_vector_elt_v2i8:
|
; GCN-LABEL: {{^}}extract_vector_elt_v2i8:
|
||||||
; GCN: s_load_dword s
|
; GCN: s_load_dword s
|
||||||
; GCN-NOT: {{flat|buffer|global}}
|
; GCN-NOT: {{flat|buffer|global}}
|
||||||
; GCN: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
|
; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8
|
||||||
|
; VI: v_lshrrev_b16_e64 v{{[0-9]+}}, 8, s{{[0-9]+}}
|
||||||
; GCN-NOT: {{flat|buffer|global}}
|
; GCN-NOT: {{flat|buffer|global}}
|
||||||
; GCN: buffer_store_byte
|
; GCN: buffer_store_byte
|
||||||
; GCN: buffer_store_byte
|
; GCN: buffer_store_byte
|
||||||
|
@ -22,8 +23,8 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i
|
||||||
%p0 = extractelement <2 x i8> %foo, i32 0
|
%p0 = extractelement <2 x i8> %foo, i32 0
|
||||||
%p1 = extractelement <2 x i8> %foo, i32 1
|
%p1 = extractelement <2 x i8> %foo, i32 1
|
||||||
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
||||||
store i8 %p1, i8 addrspace(1)* %out
|
store volatile i8 %p1, i8 addrspace(1)* %out
|
||||||
store i8 %p0, i8 addrspace(1)* %out1
|
store volatile i8 %p0, i8 addrspace(1)* %out1
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -38,8 +39,8 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i
|
||||||
%p0 = extractelement <3 x i8> %foo, i32 0
|
%p0 = extractelement <3 x i8> %foo, i32 0
|
||||||
%p1 = extractelement <3 x i8> %foo, i32 2
|
%p1 = extractelement <3 x i8> %foo, i32 2
|
||||||
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
||||||
store i8 %p1, i8 addrspace(1)* %out
|
store volatile i8 %p1, i8 addrspace(1)* %out
|
||||||
store i8 %p0, i8 addrspace(1)* %out1
|
store volatile i8 %p0, i8 addrspace(1)* %out1
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -54,24 +55,24 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i
|
||||||
%p0 = extractelement <4 x i8> %foo, i32 0
|
%p0 = extractelement <4 x i8> %foo, i32 0
|
||||||
%p1 = extractelement <4 x i8> %foo, i32 2
|
%p1 = extractelement <4 x i8> %foo, i32 2
|
||||||
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
||||||
store i8 %p1, i8 addrspace(1)* %out
|
store volatile i8 %p1, i8 addrspace(1)* %out
|
||||||
store i8 %p0, i8 addrspace(1)* %out1
|
store volatile i8 %p0, i8 addrspace(1)* %out1
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extract_vector_elt_v8i8:
|
; GCN-LABEL: {{^}}extract_vector_elt_v8i8:
|
||||||
|
; GCN-NOT: {{s|flat|buffer|global}}_load
|
||||||
; GCN: s_load_dword [[VAL:s[0-9]+]]
|
; GCN: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; GCN-NOT: {{flat|buffer|global}}
|
; GCN-NOT: {{s|flat|buffer|global}}_load
|
||||||
; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16
|
; GCN: s_lshr_b32 s{{[0-9]+}}, [[VAL]], 16
|
||||||
; GCN-NOT: {{flat|buffer|global}}
|
; GCN-NOT: {{s|flat|buffer|global}}_load
|
||||||
; GCN: buffer_store_byte
|
; GCN: buffer_store_byte
|
||||||
; GCN: buffer_store_byte
|
; GCN: buffer_store_byte
|
||||||
define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
|
define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
|
||||||
%p0 = extractelement <8 x i8> %foo, i32 0
|
%p0 = extractelement <8 x i8> %foo, i32 0
|
||||||
%p1 = extractelement <8 x i8> %foo, i32 2
|
%p1 = extractelement <8 x i8> %foo, i32 2
|
||||||
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
store volatile i8 %p1, i8 addrspace(1)* null
|
||||||
store i8 %p1, i8 addrspace(1)* %out
|
store volatile i8 %p0, i8 addrspace(1)* null
|
||||||
store i8 %p0, i8 addrspace(1)* %out1
|
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -87,25 +88,25 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x
|
||||||
%p0 = extractelement <16 x i8> %foo, i32 0
|
%p0 = extractelement <16 x i8> %foo, i32 0
|
||||||
%p1 = extractelement <16 x i8> %foo, i32 2
|
%p1 = extractelement <16 x i8> %foo, i32 2
|
||||||
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
||||||
store i8 %p1, i8 addrspace(1)* %out
|
store volatile i8 %p1, i8 addrspace(1)* %out
|
||||||
store i8 %p0, i8 addrspace(1)* %out1
|
store volatile i8 %p0, i8 addrspace(1)* %out1
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extract_vector_elt_v32i8:
|
; GCN-LABEL: {{^}}extract_vector_elt_v32i8:
|
||||||
; GCN: s_load_dword [[LOAD0:s[0-9]+]]
|
; GCN-NOT: {{s|flat|buffer|global}}_load
|
||||||
; GCN-NOT: {{flat|buffer|global}}
|
; GCN: s_load_dword [[VAL:s[0-9]+]]
|
||||||
; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[LOAD0]], 16
|
; GCN-NOT: {{s|flat|buffer|global}}_load
|
||||||
; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], [[LOAD0]]
|
; GCN: s_lshr_b32 [[ELT2:s[0-9]+]], [[VAL]], 16
|
||||||
|
; GCN-DAG: v_mov_b32_e32 [[V_LOAD0:v[0-9]+]], s{{[0-9]+}}
|
||||||
; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
|
; GCN-DAG: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
|
||||||
; GCN: buffer_store_byte [[V_ELT2]]
|
; GCN: buffer_store_byte [[V_ELT2]]
|
||||||
; GCN: buffer_store_byte [[V_LOAD0]]
|
; GCN: buffer_store_byte [[V_LOAD0]]
|
||||||
define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
|
define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
|
||||||
%p0 = extractelement <32 x i8> %foo, i32 0
|
%p0 = extractelement <32 x i8> %foo, i32 0
|
||||||
%p1 = extractelement <32 x i8> %foo, i32 2
|
%p1 = extractelement <32 x i8> %foo, i32 2
|
||||||
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
store volatile i8 %p1, i8 addrspace(1)* null
|
||||||
store i8 %p1, i8 addrspace(1)* %out
|
store volatile i8 %p0, i8 addrspace(1)* null
|
||||||
store i8 %p0, i8 addrspace(1)* %out1
|
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -121,8 +122,8 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x
|
||||||
%p0 = extractelement <64 x i8> %foo, i32 0
|
%p0 = extractelement <64 x i8> %foo, i32 0
|
||||||
%p1 = extractelement <64 x i8> %foo, i32 2
|
%p1 = extractelement <64 x i8> %foo, i32 2
|
||||||
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
||||||
store i8 %p1, i8 addrspace(1)* %out
|
store volatile i8 %p1, i8 addrspace(1)* %out
|
||||||
store i8 %p0, i8 addrspace(1)* %out1
|
store volatile i8 %p0, i8 addrspace(1)* %out1
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -132,42 +133,36 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x
|
||||||
; isTypeDesirableForOp in SimplifyDemandedBits
|
; isTypeDesirableForOp in SimplifyDemandedBits
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
|
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v2i8:
|
||||||
; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
|
; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28
|
||||||
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
|
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c
|
||||||
; VI-NOT: {{flat|buffer|global}}
|
; VI-NOT: {{flat|buffer|global}}
|
||||||
; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
|
; VI-DAG: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
|
||||||
; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
|
|
||||||
; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
|
|
||||||
; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[ELT0]], [[ELT2]]
|
|
||||||
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
||||||
; VI: v_lshrrev_b16_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[BUILD_VEC]]
|
; VI: v_lshrrev_b16_e32 [[ELT:v[0-9]+]], [[SCALED_IDX]], [[V_LOAD]]
|
||||||
; VI: buffer_store_byte [[EXTRACT]]
|
; VI: buffer_store_byte [[ELT]]
|
||||||
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo, i32 %idx) #0 {
|
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(i8 addrspace(1)* %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
|
||||||
%elt = extractelement <2 x i8> %foo, i32 %idx
|
%elt = extractelement <2 x i8> %foo, i32 %idx
|
||||||
store i8 %elt, i8 addrspace(1)* %out
|
store volatile i8 %elt, i8 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
|
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i8:
|
||||||
; VI: s_load_dword [[LOAD:s[0-9]+]], s[0:1], 0x2c
|
; VI: s_load_dword [[LOAD:s[0-9]+]], s[4:5], 0x28
|
||||||
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x30
|
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x4c
|
||||||
; VI-NOT: {{flat|buffer|global}}
|
; VI-NOT: {{flat|buffer|global}}
|
||||||
; VI: s_lshr_b32 [[ELT12:s[0-9]+]], [[LOAD]], 8
|
|
||||||
; VI: v_lshlrev_b16_e64 [[ELT1:v[0-9]+]], 8, [[ELT12]]
|
|
||||||
; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
|
|
||||||
; VI: v_or_b32_e32 [[VEC3:v[0-9]+]], [[ELT0]], [[ELT1]]
|
|
||||||
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
||||||
; VI: v_lshrrev_b32_e32 [[EXTRACT:v[0-9]+]], [[SCALED_IDX]], [[VEC3]]
|
; VI: s_lshr_b32 [[ELT:s[0-9]+]], [[LOAD]], [[SCALED_IDX]]
|
||||||
; VI: buffer_store_byte [[EXTRACT]]
|
; VI: v_mov_b32_e32 [[V_ELT:v[0-9]+]], [[ELT]]
|
||||||
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
|
; VI: buffer_store_byte [[V_ELT]]
|
||||||
|
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
|
||||||
%p0 = extractelement <3 x i8> %foo, i32 %idx
|
%p0 = extractelement <3 x i8> %foo, i32 %idx
|
||||||
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
||||||
store i8 %p0, i8 addrspace(1)* %out
|
store volatile i8 %p0, i8 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
|
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i8:
|
||||||
; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
|
; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x30
|
||||||
; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
|
; VI: s_load_dword [[VEC4:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
|
||||||
|
|
||||||
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
||||||
|
@ -175,16 +170,16 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out
|
||||||
|
|
||||||
; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
|
; VI: v_mov_b32_e32 [[V_EXTRACT:v[0-9]+]], [[EXTRACT]]
|
||||||
; VI: buffer_store_byte [[V_EXTRACT]]
|
; VI: buffer_store_byte [[V_EXTRACT]]
|
||||||
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, i32 %idx) #0 {
|
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(4)* %vec.ptr, [8 x i32], i32 %idx) #0 {
|
||||||
%vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr
|
%vec = load <4 x i8>, <4 x i8> addrspace(4)* %vec.ptr
|
||||||
%p0 = extractelement <4 x i8> %vec, i32 %idx
|
%p0 = extractelement <4 x i8> %vec, i32 %idx
|
||||||
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
||||||
store i8 %p0, i8 addrspace(1)* %out
|
store volatile i8 %p0, i8 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8:
|
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v8i8:
|
||||||
; VI: s_load_dword [[IDX:s[0-9]+]], s[0:1], 0x34
|
; VI: s_load_dword [[IDX:s[0-9]+]], s[4:5], 0x10
|
||||||
; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
|
; VI: s_load_dwordx2 [[VEC8:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
|
||||||
|
|
||||||
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
||||||
|
@ -195,7 +190,7 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(i8 addrspace(1)* %out
|
||||||
%vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr
|
%vec = load <8 x i8>, <8 x i8> addrspace(4)* %vec.ptr
|
||||||
%p0 = extractelement <8 x i8> %vec, i32 %idx
|
%p0 = extractelement <8 x i8> %vec, i32 %idx
|
||||||
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
%out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
|
||||||
store i8 %p0, i8 addrspace(1)* %out
|
store volatile i8 %p0, i8 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -39,9 +39,9 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_fabs_v4f16:
|
; GCN-LABEL: {{^}}s_fabs_v4f16:
|
||||||
; CI: s_load_dword s[[LO:[0-9]+]]
|
; CI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2
|
||||||
; CI: s_load_dword s[[HI:[0-9]+]]
|
|
||||||
; GFX89: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
|
; GFX89: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
|
||||||
|
|
||||||
; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
|
; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
|
||||||
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]]
|
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]]
|
||||||
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]]
|
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]]
|
||||||
|
@ -54,7 +54,7 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fabs_fold_f16:
|
; GCN-LABEL: {{^}}fabs_fold_f16:
|
||||||
; GCN: s_load_dword [[IN0:s[0-9]+]]
|
; GCN: s_load_dword [[IN0:s[0-9]+]]
|
||||||
; GCN: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16
|
; GCN-DAG: s_lshr_b32 [[IN1:s[0-9]+]], [[IN0]], 16
|
||||||
|
|
||||||
; CI-DAG: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], |[[IN0]]|
|
; CI-DAG: v_cvt_f32_f16_e64 [[CVT0:v[0-9]+]], |[[IN0]]|
|
||||||
; CI-DAG: v_cvt_f32_f16_e32 [[ABS_CVT1:v[0-9]+]], [[IN1]]
|
; CI-DAG: v_cvt_f32_f16_e32 [[ABS_CVT1:v[0-9]+]], [[IN1]]
|
||||||
|
@ -62,6 +62,7 @@ define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half
|
||||||
; CI-DAG: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
|
; CI-DAG: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
|
||||||
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
|
; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
|
||||||
|
|
||||||
|
; GFX89-NOT: and
|
||||||
; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]]
|
; GFX89: v_mov_b32_e32 [[V_IN1:v[0-9]+]], [[IN1]]
|
||||||
; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]]
|
; GFX89: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN0]]|, [[V_IN1]]
|
||||||
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
|
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
|
||||||
|
|
|
@ -53,11 +53,11 @@ define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x doub
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}fabs_fold_f64:
|
; SI-LABEL: {{^}}fabs_fold_f64:
|
||||||
; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-NOT: and
|
; SI-NOT: and
|
||||||
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
|
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
|
define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, [8 x i32], double %in0, [8 x i32], double %in1) {
|
||||||
%fabs = call double @llvm.fabs.f64(double %in0)
|
%fabs = call double @llvm.fabs.f64(double %in0)
|
||||||
%fmul = fmul double %fabs, %in1
|
%fmul = fmul double %fabs, %in1
|
||||||
store double %fmul, double addrspace(1)* %out
|
store double %fmul, double addrspace(1)* %out
|
||||||
|
@ -65,11 +65,11 @@ define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0,
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}fabs_fn_fold_f64:
|
; SI-LABEL: {{^}}fabs_fn_fold_f64:
|
||||||
; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-NOT: and
|
; SI-NOT: and
|
||||||
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
|
; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}}
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
|
define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, [8 x i32], double %in0, [8 x i32], double %in1) {
|
||||||
%fabs = call double @fabs(double %in0)
|
%fabs = call double @fabs(double %in0)
|
||||||
%fmul = fmul double %fabs, %in1
|
%fmul = fmul double %fabs, %in1
|
||||||
store double %fmul, double addrspace(1)* %out
|
store double %fmul, double addrspace(1)* %out
|
||||||
|
|
|
@ -70,10 +70,11 @@ define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fabs_fn_fold:
|
; GCN-LABEL: {{^}}fabs_fn_fold:
|
||||||
; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
|
; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
|
||||||
; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
|
; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
|
||||||
; GCN-NOT: and
|
; GCN-NOT: and
|
||||||
; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
|
; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]]
|
||||||
|
; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]]
|
||||||
define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
|
define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
|
||||||
%fabs = call float @fabs(float %in0)
|
%fabs = call float @fabs(float %in0)
|
||||||
%fmul = fmul float %fabs, %in1
|
%fmul = fmul float %fabs, %in1
|
||||||
|
@ -82,10 +83,11 @@ define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, fl
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}fabs_fold:
|
; FUNC-LABEL: {{^}}fabs_fold:
|
||||||
; SI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0xb
|
; SI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
|
||||||
; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
|
; VI: s_load_dwordx2 s{{\[}}[[ABS_VALUE:[0-9]+]]:[[MUL_VAL:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
|
||||||
; GCN-NOT: and
|
; GCN-NOT: and
|
||||||
; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ABS_VALUE]]|, v{{[0-9]+}}
|
; GCN: v_mov_b32_e32 [[V_MUL_VI:v[0-9]+]], s[[MUL_VAL]]
|
||||||
|
; GCN: v_mul_f32_e64 v{{[0-9]+}}, |s[[ABS_VALUE]]|, [[V_MUL_VI]]
|
||||||
define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
|
define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
|
||||||
%fabs = call float @llvm.fabs.f32(float %in0)
|
%fabs = call float @llvm.fabs.f32(float %in0)
|
||||||
%fmul = fmul float %fabs, %in1
|
%fmul = fmul float %fabs, %in1
|
||||||
|
|
|
@ -16,8 +16,8 @@ define amdgpu_kernel void @fadd_f16(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fadd half %a.val, %b.val
|
%r.val = fadd half %a.val, %b.val
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
@ -65,10 +65,10 @@ entry:
|
||||||
; VI: flat_load_dword v[[B_V2_F16:[0-9]+]]
|
; VI: flat_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
; VI: flat_load_dword v[[A_V2_F16:[0-9]+]]
|
; VI: flat_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
|
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
|
@ -102,13 +102,13 @@ entry:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
|
; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
|
||||||
; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
|
; GCN-DAG: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
|
; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
|
; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
|
@ -133,13 +133,13 @@ entry:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
|
; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
|
||||||
; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
|
; GCN-DAG: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]]
|
; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], 2.0, v[[A_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
|
; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
|
|
|
@ -16,8 +16,8 @@ define amdgpu_kernel void @fcmp_f16_lt(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp olt half %a.val, %b.val
|
%r.val = fcmp olt half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -42,8 +42,8 @@ define amdgpu_kernel void @fcmp_f16_lt_abs(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%a.abs = call half @llvm.fabs.f16(half %a.val)
|
%a.abs = call half @llvm.fabs.f16(half %a.val)
|
||||||
%b.abs = call half @llvm.fabs.f16(half %b.val)
|
%b.abs = call half @llvm.fabs.f16(half %b.val)
|
||||||
%r.val = fcmp olt half %a.abs, %b.abs
|
%r.val = fcmp olt half %a.abs, %b.abs
|
||||||
|
@ -67,8 +67,8 @@ define amdgpu_kernel void @fcmp_f16_eq(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp oeq half %a.val, %b.val
|
%r.val = fcmp oeq half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -90,8 +90,8 @@ define amdgpu_kernel void @fcmp_f16_le(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp ole half %a.val, %b.val
|
%r.val = fcmp ole half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -113,8 +113,8 @@ define amdgpu_kernel void @fcmp_f16_gt(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp ogt half %a.val, %b.val
|
%r.val = fcmp ogt half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -136,8 +136,8 @@ define amdgpu_kernel void @fcmp_f16_lg(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp one half %a.val, %b.val
|
%r.val = fcmp one half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -159,8 +159,8 @@ define amdgpu_kernel void @fcmp_f16_ge(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp oge half %a.val, %b.val
|
%r.val = fcmp oge half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -182,8 +182,8 @@ define amdgpu_kernel void @fcmp_f16_o(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp ord half %a.val, %b.val
|
%r.val = fcmp ord half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -205,8 +205,8 @@ define amdgpu_kernel void @fcmp_f16_u(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp uno half %a.val, %b.val
|
%r.val = fcmp uno half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -228,8 +228,8 @@ define amdgpu_kernel void @fcmp_f16_nge(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp ult half %a.val, %b.val
|
%r.val = fcmp ult half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -251,8 +251,8 @@ define amdgpu_kernel void @fcmp_f16_nlg(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp ueq half %a.val, %b.val
|
%r.val = fcmp ueq half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -274,8 +274,8 @@ define amdgpu_kernel void @fcmp_f16_ngt(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp ule half %a.val, %b.val
|
%r.val = fcmp ule half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -297,8 +297,8 @@ define amdgpu_kernel void @fcmp_f16_nle(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp ugt half %a.val, %b.val
|
%r.val = fcmp ugt half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -320,8 +320,8 @@ define amdgpu_kernel void @fcmp_f16_neq(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp une half %a.val, %b.val
|
%r.val = fcmp une half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
@ -343,8 +343,8 @@ define amdgpu_kernel void @fcmp_f16_nlt(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fcmp uge half %a.val, %b.val
|
%r.val = fcmp uge half %a.val, %b.val
|
||||||
%r.val.sext = sext i1 %r.val to i32
|
%r.val.sext = sext i1 %r.val to i32
|
||||||
store i32 %r.val.sext, i32 addrspace(1)* %r
|
store i32 %r.val.sext, i32 addrspace(1)* %r
|
||||||
|
|
|
@ -30,8 +30,8 @@ define amdgpu_kernel void @test_copysign_f16(
|
||||||
half addrspace(1)* %arg_mag,
|
half addrspace(1)* %arg_mag,
|
||||||
half addrspace(1)* %arg_sign) {
|
half addrspace(1)* %arg_sign) {
|
||||||
entry:
|
entry:
|
||||||
%mag = load half, half addrspace(1)* %arg_mag
|
%mag = load volatile half, half addrspace(1)* %arg_mag
|
||||||
%sign = load half, half addrspace(1)* %arg_sign
|
%sign = load volatile half, half addrspace(1)* %arg_sign
|
||||||
%out = call half @llvm.copysign.f16(half %mag, half %sign)
|
%out = call half @llvm.copysign.f16(half %mag, half %sign)
|
||||||
store half %out, half addrspace(1)* %arg_out
|
store half %out, half addrspace(1)* %arg_out
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -8,12 +8,11 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read
|
||||||
|
|
||||||
; Try to identify arg based on higher address.
|
; Try to identify arg based on higher address.
|
||||||
; FUNC-LABEL: {{^}}test_copysign_f32:
|
; FUNC-LABEL: {{^}}test_copysign_f32:
|
||||||
; SI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0xb
|
; SI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0xb
|
||||||
; SI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0xc
|
; VI: s_load_dwordx2 s{{\[}}[[SMAG:[0-9]+]]:[[SSIGN:[0-9]+]]{{\]}}, {{.*}} 0x2c
|
||||||
; VI: s_load_dword [[SMAG:s[0-9]+]], {{.*}} 0x2c
|
|
||||||
; VI: s_load_dword [[SSIGN:s[0-9]+]], {{.*}} 0x30
|
; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], s[[SSIGN]]
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VSIGN:v[0-9]+]], [[SSIGN]]
|
; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], s[[SMAG]]
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VMAG:v[0-9]+]], [[SMAG]]
|
|
||||||
; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2
|
; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2
|
||||||
; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
|
; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[SCONST]], [[VMAG]], [[VSIGN]]
|
||||||
; GCN: buffer_store_dword [[RESULT]],
|
; GCN: buffer_store_dword [[RESULT]],
|
||||||
|
|
|
@ -6,10 +6,10 @@ declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) nounwind r
|
||||||
declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone
|
declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind readnone
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}test_copysign_f64:
|
; FUNC-LABEL: {{^}}test_copysign_f64:
|
||||||
; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
; SI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x1d
|
||||||
; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
; VI-DAG: s_load_dwordx2 s{{\[}}[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x74
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
|
; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
|
; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
|
||||||
; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2
|
; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2
|
||||||
|
@ -17,15 +17,15 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
|
; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
|
||||||
; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
|
; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
|
define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, [8 x i32], double %mag, [8 x i32], double %sign) nounwind {
|
||||||
%result = call double @llvm.copysign.f64(double %mag, double %sign)
|
%result = call double @llvm.copysign.f64(double %mag, double %sign)
|
||||||
store double %result, double addrspace(1)* %out, align 8
|
store double %result, double addrspace(1)* %out, align 8
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}test_copysign_f64_f32:
|
; FUNC-LABEL: {{^}}test_copysign_f64_f32:
|
||||||
; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI-DAG: s_load_dwordx2 s{{\[}}[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN-DAG: s_load_dword s[[SSIGN:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
|
; GCN-DAG: s_load_dword s[[SSIGN:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}
|
||||||
; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2{{$}}
|
; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2{{$}}
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
|
; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
|
||||||
|
@ -33,7 +33,7 @@ define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %
|
||||||
; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]]
|
; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]]
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
|
; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
|
||||||
; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
|
; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
|
||||||
define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind {
|
define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, [8 x i32], double %mag, float %sign) nounwind {
|
||||||
%c = fpext float %sign to double
|
%c = fpext float %sign to double
|
||||||
%result = call double @llvm.copysign.f64(double %mag, double %c)
|
%result = call double @llvm.copysign.f64(double %mag, double %c)
|
||||||
store double %result, double addrspace(1)* %out, align 8
|
store double %result, double addrspace(1)* %out, align 8
|
||||||
|
|
|
@ -64,9 +64,9 @@ define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float>
|
||||||
; SI: v_fma_f32
|
; SI: v_fma_f32
|
||||||
; SI: v_fma_f32
|
; SI: v_fma_f32
|
||||||
; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
|
; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
|
||||||
|
; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
|
||||||
|
; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
|
||||||
; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+$}}
|
; GFX906: v_fmac_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+$}}
|
||||||
; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
|
|
||||||
; GFX906: v_fma_f32 {{v[0-9]+, v[0-9]+, v[0-9]+, v[0-9]+}}
|
|
||||||
|
|
||||||
; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}},
|
; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].{{[XYZW][XYZW][XYZW][XYZW]}}, {{T[0-9]\.[XYZW]}},
|
||||||
; EG-DAG: FMA {{\*? *}}[[RES]].X
|
; EG-DAG: FMA {{\*? *}}[[RES]].X
|
||||||
|
|
|
@ -25,14 +25,12 @@ define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addr
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
|
; FUNC-LABEL: {{^}}s_test_fmin_legacy_ule_f32:
|
||||||
; SI-DAG: s_load_dword [[A:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dwordx2 s{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||||
; SI-DAG: s_load_dword [[B:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
|
||||||
|
|
||||||
; SI-SAFE-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
|
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], s[[B]]
|
||||||
; SI-NONAN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
|
|
||||||
|
|
||||||
; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]]
|
; SI-SAFE: v_min_legacy_f32_e64 {{v[0-9]+}}, [[VB]], s[[A]]
|
||||||
; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]]
|
; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, s[[A]], [[VB]]
|
||||||
|
|
||||||
define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
|
define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
|
||||||
%cmp = fcmp ule float %a, %b
|
%cmp = fcmp ule float %a, %b
|
||||||
|
|
|
@ -1,6 +1,6 @@
|
||||||
; XUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
|
; XUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=+fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-DENORM %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-fp64-fp16-denormals,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VI-FLUSH %s
|
||||||
|
|
||||||
|
|
||||||
; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't
|
; Make sure (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c)) doesn't
|
||||||
|
@ -44,7 +44,7 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out,
|
||||||
; GCN-DAG: buffer_store_dword [[MUL2]]
|
; GCN-DAG: buffer_store_dword [[MUL2]]
|
||||||
; GCN-DAG: buffer_store_dword [[MAD]]
|
; GCN-DAG: buffer_store_dword [[MAD]]
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
|
define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, [8 x i32], float %y) #0 {
|
||||||
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
|
%out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
|
||||||
%mul2 = fmul fast float %x, 2.0
|
%mul2 = fmul fast float %x, 2.0
|
||||||
%mad = fadd fast float %mul2, %y
|
%mad = fadd fast float %mul2, %y
|
||||||
|
|
|
@ -17,8 +17,8 @@ define amdgpu_kernel void @fmul_f16(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fmul half %a.val, %b.val
|
%r.val = fmul half %a.val, %b.val
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
@ -36,7 +36,7 @@ define amdgpu_kernel void @fmul_f16_imm_a(
|
||||||
half addrspace(1)* %r,
|
half addrspace(1)* %r,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fmul half 3.0, %b.val
|
%r.val = fmul half 3.0, %b.val
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
@ -55,24 +55,24 @@ define amdgpu_kernel void @fmul_f16_imm_b(
|
||||||
half addrspace(1)* %r,
|
half addrspace(1)* %r,
|
||||||
half addrspace(1)* %a) {
|
half addrspace(1)* %a) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%r.val = fmul half %a.val, 4.0
|
%r.val = fmul half %a.val, 4.0
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fmul_v2f16:
|
; GCN-LABEL: {{^}}fmul_v2f16:
|
||||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; SIVI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; SIVI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
|
; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
|
||||||
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
|
; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
|
@ -82,6 +82,8 @@ entry:
|
||||||
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
|
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
|
||||||
|
|
||||||
|
; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
|
; GFX9: v_pk_mul_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
|
||||||
|
|
||||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||||
|
@ -100,13 +102,13 @@ entry:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fmul_v2f16_imm_a:
|
; GCN-LABEL: {{^}}fmul_v2f16_imm_a:
|
||||||
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
|
; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
|
; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
|
|
||||||
|
|
||||||
; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400
|
; VI-DAG: v_mov_b32_e32 v[[CONST4:[0-9]+]], 0x4400
|
||||||
|
@ -133,13 +135,13 @@ entry:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fmul_v2f16_imm_b:
|
; GCN-LABEL: {{^}}fmul_v2f16_imm_b:
|
||||||
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
|
; SI-DAG: v_mul_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
|
; SI-DAG: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
|
|
||||||
; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200
|
; VI-DAG: v_mov_b32_e32 v[[CONST3:[0-9]+]], 0x4200
|
||||||
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
|
@ -164,13 +166,15 @@ entry:
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fmul_v4f16:
|
; GCN-LABEL: {{^}}fmul_v4f16:
|
||||||
; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
|
; GFX9: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
|
||||||
; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
|
; GFX9: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
|
||||||
|
|
||||||
; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
|
; GFX9-DAG: v_pk_mul_f16 v[[MUL_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
|
||||||
; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
|
; GFX9-DAG: v_pk_mul_f16 v[[MUL_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
|
||||||
; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}}
|
; GFX9: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[MUL_HI]]{{\]}}
|
||||||
|
|
||||||
|
; VI: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
|
||||||
|
; VI: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
|
||||||
; VI: v_mul_f16_sdwa
|
; VI: v_mul_f16_sdwa
|
||||||
; VI: v_mul_f16_e32
|
; VI: v_mul_f16_e32
|
||||||
; VI: v_mul_f16_sdwa
|
; VI: v_mul_f16_sdwa
|
||||||
|
|
|
@ -109,8 +109,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x h
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16:
|
; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16:
|
||||||
; CI: s_load_dword s
|
; CI: s_load_dword [[IN:s[0-9]+]]
|
||||||
; CI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80008000
|
; CI: s_or_b32 [[FNEG_FABS:s[0-9]+]], [[IN]], 0x80008000
|
||||||
|
; CI: s_lshr_b32
|
||||||
; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
|
; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
|
||||||
; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
|
; CI: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
|
||||||
; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
|
||||||
|
|
|
@ -55,14 +55,13 @@ define amdgpu_kernel void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fneg_fabs_f64:
|
; GCN-LABEL: {{^}}fneg_fabs_f64:
|
||||||
; GCN-DAG: s_load_dwordx2
|
|
||||||
; GCN-DAG: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
|
; GCN-DAG: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
|
||||||
; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0xb
|
; SI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x13
|
||||||
; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x2c
|
; VI-DAG: s_load_dwordx2 s{{\[}}[[LO_X:[0-9]+]]:[[HI_X:[0-9]+]]{{\]}}, s[{{[0-9]+:[0-9]+}}], 0x4c
|
||||||
; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
|
; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
|
; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
|
||||||
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
|
; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
|
||||||
define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
|
define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, [8 x i32], double %in) {
|
||||||
%fabs = call double @llvm.fabs.f64(double %in)
|
%fabs = call double @llvm.fabs.f64(double %in)
|
||||||
%fsub = fsub double -0.000000e+00, %fabs
|
%fsub = fsub double -0.000000e+00, %fabs
|
||||||
store double %fsub, double addrspace(1)* %out, align 8
|
store double %fsub, double addrspace(1)* %out, align 8
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
|
; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
|
||||||
; SI-NOT: and
|
; SI-NOT: and
|
||||||
; SI: v_sub_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, |{{v[0-9]+}}|
|
; SI: v_sub_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{s[0-9]+}}|
|
||||||
define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
|
define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
|
||||||
%fabs = call float @llvm.fabs.f32(float %x)
|
%fabs = call float @llvm.fabs.f32(float %x)
|
||||||
%fsub = fsub float -0.000000e+00, %fabs
|
%fsub = fsub float -0.000000e+00, %fabs
|
||||||
|
@ -15,7 +15,7 @@ define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32:
|
; FUNC-LABEL: {{^}}fneg_fabs_fmul_f32:
|
||||||
; SI-NOT: and
|
; SI-NOT: and
|
||||||
; SI: v_mul_f32_e64 {{v[0-9]+}}, {{s[0-9]+}}, -|{{v[0-9]+}}|
|
; SI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{s[0-9]+}}|
|
||||||
; SI-NOT: and
|
; SI-NOT: and
|
||||||
define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
|
define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
|
||||||
%fabs = call float @llvm.fabs.f32(float %x)
|
%fabs = call float @llvm.fabs.f32(float %x)
|
||||||
|
|
|
@ -48,11 +48,11 @@ define amdgpu_kernel void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fneg_fold_f64:
|
; GCN-LABEL: {{^}}fneg_fold_f64:
|
||||||
; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN-NOT: xor
|
; GCN-NOT: xor
|
||||||
; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
|
; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
|
||||||
define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
|
define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, [8 x i32], double %in) {
|
||||||
%fsub = fsub double -0.0, %in
|
%fsub = fsub double -0.0, %in
|
||||||
%fmul = fmul double %fsub, %in
|
%fmul = fmul double %fsub, %in
|
||||||
store double %fmul, double addrspace(1)* %out
|
store double %fmul, double addrspace(1)* %out
|
||||||
|
|
|
@ -13,18 +13,17 @@ target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-i64:
|
||||||
define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 {
|
define amdgpu_kernel void @f(i32 addrspace(1)* nocapture %a, i32 %i, i32 %j) local_unnamed_addr #0 {
|
||||||
entry:
|
entry:
|
||||||
; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
|
; CHECK: s_load_dwordx2 s[4:5], s[0:1], 0x9
|
||||||
; CHECK: s_load_dword s2, s[0:1], 0xb
|
; CHECK: s_load_dwordx2 s[0:1], s[0:1], 0xb
|
||||||
; CHECK: s_load_dword s0, s[0:1], 0xc
|
|
||||||
; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
|
; CHECK: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
|
||||||
; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
|
; CHECK: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
|
||||||
; CHECK: s_mov_b32 s10, -1
|
; CHECK: s_mov_b32 s10, -1
|
||||||
; CHECK: s_waitcnt lgkmcnt(0)
|
|
||||||
; CHECK: s_lshl_b32 s1, s2, 2
|
|
||||||
; CHECK: v_mov_b32_e32 v0, 4
|
; CHECK: v_mov_b32_e32 v0, 4
|
||||||
; CHECK: s_mov_b32 s11, 0xe8f000
|
; CHECK: s_waitcnt lgkmcnt(0)
|
||||||
; CHECK: v_add_i32_e32 v1, vcc, s1, v0
|
|
||||||
; CHECK: v_mov_b32_e32 v2, 7
|
|
||||||
; CHECK: s_lshl_b32 s0, s0, 2
|
; CHECK: s_lshl_b32 s0, s0, 2
|
||||||
|
; CHECK: v_add_i32_e32 v1, vcc, s0, v0
|
||||||
|
; CHECK: s_lshl_b32 s0, s1, 2
|
||||||
|
; CHECK: s_mov_b32 s11, 0xe8f000
|
||||||
|
; CHECK: v_mov_b32_e32 v2, 7
|
||||||
; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen
|
; CHECK: buffer_store_dword v2, v1, s[8:11], s3 offen
|
||||||
; CHECK: v_add_i32_e32 v0, vcc, s0, v0
|
; CHECK: v_add_i32_e32 v0, vcc, s0, v0
|
||||||
; CHECK: s_mov_b32 s7, 0xf000
|
; CHECK: s_mov_b32 s7, 0xf000
|
||||||
|
@ -35,14 +34,14 @@ entry:
|
||||||
; CHECK: s_endpgm
|
; CHECK: s_endpgm
|
||||||
|
|
||||||
%x = alloca [100 x i32], align 4, addrspace(5)
|
%x = alloca [100 x i32], align 4, addrspace(5)
|
||||||
%0 = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)*
|
%alloca.bc = bitcast [100 x i32] addrspace(5)* %x to i8 addrspace(5)*
|
||||||
call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0
|
call void @llvm.lifetime.start.p5i8(i64 400, i8 addrspace(5)* nonnull %alloca.bc) #0
|
||||||
%arrayidx = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %i
|
%arrayidx = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %i
|
||||||
store i32 7, i32 addrspace(5)* %arrayidx, align 4
|
store i32 7, i32 addrspace(5)* %arrayidx, align 4
|
||||||
%arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %j
|
%arrayidx2 = getelementptr inbounds [100 x i32], [100 x i32] addrspace(5)* %x, i32 0, i32 %j
|
||||||
%1 = load i32, i32 addrspace(5)* %arrayidx2, align 4
|
%ld = load i32, i32 addrspace(5)* %arrayidx2, align 4
|
||||||
store i32 %1, i32 addrspace(1)* %a, align 4
|
store i32 %ld, i32 addrspace(1)* %a, align 4
|
||||||
call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %0) #0
|
call void @llvm.lifetime.end.p5i8(i64 400, i8 addrspace(5)* nonnull %alloca.bc) #0
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -17,8 +17,8 @@ define amdgpu_kernel void @fsub_f16(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fsub half %a.val, %b.val
|
%r.val = fsub half %a.val, %b.val
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
@ -36,7 +36,7 @@ define amdgpu_kernel void @fsub_f16_imm_a(
|
||||||
half addrspace(1)* %r,
|
half addrspace(1)* %r,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = fsub half 1.0, %b.val
|
%r.val = fsub half 1.0, %b.val
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
@ -54,33 +54,41 @@ define amdgpu_kernel void @fsub_f16_imm_b(
|
||||||
half addrspace(1)* %r,
|
half addrspace(1)* %r,
|
||||||
half addrspace(1)* %a) {
|
half addrspace(1)* %a) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%r.val = fsub half %a.val, 2.0
|
%r.val = fsub half %a.val, 2.0
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fsub_v2f16:
|
; GCN-LABEL: {{^}}fsub_v2f16:
|
||||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
|
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
|
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
|
; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
|
||||||
; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
|
; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
|
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
|
; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
|
||||||
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
; VI-DAG: v_sub_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
|
|
||||||
|
; GFX9: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
; GFX9: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
|
; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
|
||||||
|
|
||||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||||
|
@ -101,13 +109,13 @@ entry:
|
||||||
; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
|
; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
|
||||||
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
|
; SI-DAG: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
|
; SI-DAG: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
|
@ -135,13 +143,13 @@ entry:
|
||||||
; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
|
; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
|
||||||
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
|
; SI-DAG: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
|
; SI-DAG: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
; CHECK: s_load_dwordx4
|
; CHECK: s_load_dwordx4
|
||||||
; CHECK-NOT: flat_load_dword
|
; CHECK-NOT: flat_load_dword
|
||||||
|
|
||||||
define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) {
|
define amdgpu_kernel void @uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) {
|
||||||
bb:
|
bb:
|
||||||
%tmp2 = load float, float addrspace(1)* %arg, align 4, !tbaa !8
|
%tmp2 = load float, float addrspace(1)* %arg, align 4, !tbaa !8
|
||||||
%tmp3 = fadd float %tmp2, 0.000000e+00
|
%tmp3 = fadd float %tmp2, 0.000000e+00
|
||||||
|
@ -28,7 +28,7 @@ bb:
|
||||||
; CHECK: flat_load_dword
|
; CHECK: flat_load_dword
|
||||||
; CHECK-NOT: s_load_dwordx4
|
; CHECK-NOT: s_load_dwordx4
|
||||||
|
|
||||||
define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, float addrspace(1)* %arg1) #0 {
|
define amdgpu_kernel void @non-uniform_load(float addrspace(1)* %arg, [8 x i32], float addrspace(1)* %arg1) #0 {
|
||||||
bb:
|
bb:
|
||||||
%tmp = call i32 @llvm.amdgcn.workitem.id.x() #1
|
%tmp = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||||
%tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp
|
%tmp2 = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tmp
|
||||||
|
@ -59,7 +59,7 @@ bb:
|
||||||
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
|
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
|
||||||
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
|
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
|
||||||
|
|
||||||
define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
|
define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) {
|
||||||
store i32 0, i32 addrspace(1)* %out0
|
store i32 0, i32 addrspace(1)* %out0
|
||||||
%val = load i32, i32 addrspace(1)* %in
|
%val = load i32, i32 addrspace(1)* %in
|
||||||
store i32 %val, i32 addrspace(1)* %out1
|
store i32 %val, i32 addrspace(1)* %out1
|
||||||
|
@ -71,7 +71,7 @@ define amdgpu_kernel void @no_memdep_alias_arg(i32 addrspace(1)* noalias %in, i3
|
||||||
; CHECK: flat_store_dword
|
; CHECK: flat_store_dword
|
||||||
; CHECK: flat_load_dword [[VVAL:v[0-9]+]]
|
; CHECK: flat_load_dword [[VVAL:v[0-9]+]]
|
||||||
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
|
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
|
||||||
define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
|
define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, [8 x i32], i32 addrspace(1)* %out0, [8 x i32], i32 addrspace(1)* %out1) {
|
||||||
store i32 0, i32 addrspace(1)* %out0
|
store i32 0, i32 addrspace(1)* %out0
|
||||||
%val = load i32, i32 addrspace(1)* %in
|
%val = load i32, i32 addrspace(1)* %in
|
||||||
store i32 %val, i32 addrspace(1)* %out1
|
store i32 %val, i32 addrspace(1)* %out1
|
||||||
|
@ -80,19 +80,20 @@ define amdgpu_kernel void @memdep(i32 addrspace(1)* %in, i32 addrspace(1)* %out0
|
||||||
|
|
||||||
; uniform load from global array
|
; uniform load from global array
|
||||||
; CHECK-LABEL: @global_array
|
; CHECK-LABEL: @global_array
|
||||||
; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]]
|
; CHECK: s_getpc_b64 [[GET_PC:s\[[0-9]+:[0-9]+\]]]
|
||||||
|
; CHECK: s_load_dwordx2 [[OUT:s\[[0-9]+:[0-9]+\]]], s[4:5], 0x0
|
||||||
|
; CHECK: s_load_dwordx2 [[A_ADDR:s\[[0-9]+:[0-9]+\]]], [[GET_PC]], 0x0
|
||||||
; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0
|
; CHECK: s_load_dwordx2 [[A_ADDR1:s\[[0-9]+:[0-9]+\]]], [[A_ADDR]], 0x0
|
||||||
; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0
|
; CHECK: s_load_dword [[SVAL:s[0-9]+]], [[A_ADDR1]], 0x0
|
||||||
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
|
; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
|
||||||
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
|
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
|
||||||
|
|
||||||
@A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4
|
@A = common local_unnamed_addr addrspace(1) global i32 addrspace(1)* null, align 4
|
||||||
|
|
||||||
define amdgpu_kernel void @global_array(i32 addrspace(1)* nocapture %out) {
|
define amdgpu_kernel void @global_array(i32 addrspace(1)* nocapture %out) {
|
||||||
entry:
|
entry:
|
||||||
%0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
|
%load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
|
||||||
%1 = load i32, i32 addrspace(1)* %0, align 4
|
%load1 = load i32, i32 addrspace(1)* %load0, align 4
|
||||||
store i32 %1, i32 addrspace(1)* %out, align 4
|
store i32 %load1, i32 addrspace(1)* %out, align 4
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -105,13 +106,13 @@ entry:
|
||||||
; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
|
; CHECK: flat_load_dwordx2 [[A_ADDR:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[ADDR_LO]]:[[ADDR_HI]]{{\]}}
|
||||||
; CHECK: flat_load_dword [[VVAL:v[0-9]+]], [[A_ADDR]]
|
; CHECK: flat_load_dword [[VVAL:v[0-9]+]], [[A_ADDR]]
|
||||||
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
|
; CHECK: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[VVAL]]
|
||||||
define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, i32 %n) {
|
define amdgpu_kernel void @global_array_alias_store(i32 addrspace(1)* nocapture %out, [8 x i32], i32 %n) {
|
||||||
entry:
|
entry:
|
||||||
%gep = getelementptr i32, i32 addrspace(1) * %out, i32 %n
|
%gep = getelementptr i32, i32 addrspace(1) * %out, i32 %n
|
||||||
store i32 12, i32 addrspace(1) * %gep
|
store i32 12, i32 addrspace(1) * %gep
|
||||||
%0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
|
%load0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @A, align 4
|
||||||
%1 = load i32, i32 addrspace(1)* %0, align 4
|
%load1 = load i32, i32 addrspace(1)* %load0, align 4
|
||||||
store i32 %1, i32 addrspace(1)* %out, align 4
|
store i32 %load1, i32 addrspace(1)* %out, align 4
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -22,13 +22,8 @@ define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x ha
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}load_v3f16_arg:
|
; GCN-LABEL: {{^}}load_v3f16_arg:
|
||||||
; SI: s_load_dwordx2
|
; GCN: s_load_dwordx2
|
||||||
; SI: s_load_dword s
|
; GCN: s_load_dwordx2
|
||||||
; SI: s_load_dword s
|
|
||||||
|
|
||||||
; VI: s_load_dwordx2
|
|
||||||
; VI: s_load_dwordx2
|
|
||||||
|
|
||||||
; GCN-NOT: {buffer|flat|global}}_load_
|
; GCN-NOT: {buffer|flat|global}}_load_
|
||||||
|
|
||||||
|
|
||||||
|
@ -45,11 +40,7 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha
|
||||||
|
|
||||||
; FIXME: Why not one load?
|
; FIXME: Why not one load?
|
||||||
; GCN-LABEL: {{^}}load_v4f16_arg:
|
; GCN-LABEL: {{^}}load_v4f16_arg:
|
||||||
; SI-DAG: s_load_dword s[[ARG0_LO:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2
|
; GCN-DAG: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x2|0x8}}
|
||||||
; SI-DAG: s_load_dword s[[ARG0_HI:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x3
|
|
||||||
|
|
||||||
; VI: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
|
|
||||||
|
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], s[[ARG0_LO]]
|
; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], s[[ARG0_LO]]
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], s[[ARG0_HI]]
|
; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], s[[ARG0_HI]]
|
||||||
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
|
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
|
||||||
|
@ -86,14 +77,8 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)*
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
|
; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
|
||||||
; SI: s_load_dwordx2 s
|
; GCN: s_load_dwordx2 s
|
||||||
; SI: s_load_dword s
|
; GCN: s_load_dwordx2 s
|
||||||
; SI: s_load_dword s
|
|
||||||
|
|
||||||
; VI: s_load_dwordx2
|
|
||||||
; VI: s_load_dwordx2
|
|
||||||
; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
|
||||||
|
|
||||||
; GCN-NOT: _load
|
; GCN-NOT: _load
|
||||||
; GCN: v_cvt_f32_f16_e32
|
; GCN: v_cvt_f32_f16_e32
|
||||||
; GCN: v_cvt_f32_f16_e32
|
; GCN: v_cvt_f32_f16_e32
|
||||||
|
@ -116,14 +101,7 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)*
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
|
; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
|
||||||
; SI: s_load_dword s
|
; GCN: s_load_dwordx4
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dword s
|
|
||||||
|
|
||||||
; VI: s_load_dwordx2 s
|
|
||||||
; VI: s_load_dwordx2 s
|
|
||||||
; VI: s_load_dwordx2 s
|
|
||||||
|
|
||||||
; GCN: v_cvt_f32_f16_e32
|
; GCN: v_cvt_f32_f16_e32
|
||||||
; GCN: v_cvt_f32_f16_e32
|
; GCN: v_cvt_f32_f16_e32
|
||||||
|
@ -154,7 +132,7 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, hal
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
|
; GCN-LABEL: {{^}}extload_v2f16_to_v2f64_arg:
|
||||||
; GCN: s_load_dword
|
; GCN-DAG: s_load_dword s
|
||||||
; GCN: s_lshr_b32
|
; GCN: s_lshr_b32
|
||||||
|
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
|
@ -169,14 +147,8 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)*
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
|
; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
|
||||||
; SI: s_load_dword
|
; GCN: s_load_dwordx2 s
|
||||||
; SI: s_load_dword
|
; GCN: s_load_dwordx2 s
|
||||||
|
|
||||||
; VI: s_load_dwordx2
|
|
||||||
; VI: s_load_dwordx2
|
|
||||||
|
|
||||||
; GCN: s_lshr_b32
|
|
||||||
|
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
|
@ -191,19 +163,17 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)*
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
|
; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
|
||||||
; SI: s_load_dword s
|
; GCN: s_load_dwordx2 s
|
||||||
; SI: s_load_dword s
|
; GCN: s_load_dwordx2 s
|
||||||
|
|
||||||
; VI: s_load_dwordx2 s
|
; GCN: v_cvt_f32_f16_e32
|
||||||
|
; GCN: v_cvt_f32_f16_e32
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN: v_cvt_f32_f16_e32
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN: v_cvt_f32_f16_e32
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN: v_cvt_f64_f32_e32
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN: v_cvt_f64_f32_e32
|
||||||
; GCN-DAG: v_cvt_f64_f32_e32
|
; GCN: v_cvt_f64_f32_e32
|
||||||
; GCN-DAG: v_cvt_f64_f32_e32
|
; GCN: v_cvt_f64_f32_e32
|
||||||
; GCN-DAG: v_cvt_f64_f32_e32
|
|
||||||
; GCN-DAG: v_cvt_f64_f32_e32
|
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
|
define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
|
||||||
%ext = fpext <4 x half> %arg to <4 x double>
|
%ext = fpext <4 x half> %arg to <4 x double>
|
||||||
|
@ -212,14 +182,8 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)*
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
|
; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
|
||||||
; SI: s_load_dword s
|
; GCN: s_load_dwordx2 s
|
||||||
; SI-NEXT: s_load_dword s
|
; GCN: s_load_dwordx4 s
|
||||||
; SI-NEXT: s_load_dword s
|
|
||||||
; SI-NEXT: s_load_dword s
|
|
||||||
; SI-NOT: _load_
|
|
||||||
|
|
||||||
; VI: s_load_dwordx2 s
|
|
||||||
; VI: s_load_dwordx2 s
|
|
||||||
|
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32
|
; GCN-DAG: v_cvt_f32_f16_e32
|
||||||
|
@ -299,12 +263,13 @@ define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, h
|
||||||
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
|
; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
|
||||||
; GCN: flat_load_dword [[LOAD:v[0-9]+]],
|
; GCN: flat_load_dword [[LOAD:v[0-9]+]],
|
||||||
|
|
||||||
; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
|
; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
|
; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
|
||||||
|
|
||||||
; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
|
||||||
; VI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
|
; VI: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
|
||||||
|
; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||||
|
|
||||||
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
|
; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
|
@ -348,6 +313,7 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
|
||||||
; SI: v_cvt_f32_f16_e32
|
; SI: v_cvt_f32_f16_e32
|
||||||
; SI: v_cvt_f32_f16_e32
|
; SI: v_cvt_f32_f16_e32
|
||||||
; SI: v_cvt_f32_f16_e32
|
; SI: v_cvt_f32_f16_e32
|
||||||
|
; SI: v_cvt_f32_f16_e32
|
||||||
|
|
||||||
; GCN: flat_store_dwordx4
|
; GCN: flat_store_dwordx4
|
||||||
|
|
||||||
|
@ -361,7 +327,6 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1
|
||||||
; SI: v_cvt_f32_f16_e32
|
; SI: v_cvt_f32_f16_e32
|
||||||
; SI: v_cvt_f32_f16_e32
|
; SI: v_cvt_f32_f16_e32
|
||||||
; SI: v_cvt_f32_f16_e32
|
; SI: v_cvt_f32_f16_e32
|
||||||
; SI: v_cvt_f32_f16_e32
|
|
||||||
|
|
||||||
; VI: v_cvt_f32_f16_e32
|
; VI: v_cvt_f32_f16_e32
|
||||||
; VI: v_cvt_f32_f16_sdwa
|
; VI: v_cvt_f32_f16_sdwa
|
||||||
|
@ -430,19 +395,19 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(
|
||||||
; XVI-NOT: v_cvt_f32_f16
|
; XVI-NOT: v_cvt_f32_f16
|
||||||
|
|
||||||
; GCN: flat_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
|
; GCN: flat_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
|
; GCN: v_cvt_f32_f16_e32
|
||||||
; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
|
; GCN: v_cvt_f32_f16_e32
|
||||||
; SI-DAG: v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
|
; SI: v_cvt_f32_f16_e32
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
|
; VI: v_cvt_f32_f16_sdwa
|
||||||
; VI-DAG: v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
; GCN-NOT: v_cvt_f32_f16
|
||||||
|
|
||||||
; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
|
; GCN: v_cvt_f64_f32_e32
|
||||||
; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
|
; GCN: v_cvt_f64_f32_e32
|
||||||
; GCN-DAG: v_cvt_f64_f32_e32 v[{{[0-9]+}}:[[YHI:[0-9]+]]{{\]}}, [[Y32]]
|
; GCN: v_cvt_f64_f32_e32
|
||||||
; GCN-NOT: v_cvt_f64_f32_e32
|
; GCN-NOT: v_cvt_f64_f32_e32
|
||||||
|
|
||||||
; GCN-DAG: flat_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[XLO]]:[[YHI]]{{\]}}
|
; GCN-DAG: flat_store_dwordx4
|
||||||
; GCN-DAG: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[Z]]
|
; GCN-DAG: flat_store_dwordx2
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
|
define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
|
||||||
%val = load <3 x half>, <3 x half> addrspace(1)* %in
|
%val = load <3 x half>, <3 x half> addrspace(1)* %in
|
||||||
|
|
|
@ -8,7 +8,7 @@
|
||||||
; CHECK: Version: [ 1, 0 ]
|
; CHECK: Version: [ 1, 0 ]
|
||||||
; CHECK: Kernels:
|
; CHECK: Kernels:
|
||||||
|
|
||||||
; CHECK: - Name: test
|
; CHECK-LABEL: - Name: test
|
||||||
; CHECK: SymbolName: 'test@kd'
|
; CHECK: SymbolName: 'test@kd'
|
||||||
; CHECK: CodeProps:
|
; CHECK: CodeProps:
|
||||||
; CHECK: KernargSegmentSize: 24
|
; CHECK: KernargSegmentSize: 24
|
||||||
|
@ -16,8 +16,8 @@
|
||||||
; CHECK: PrivateSegmentFixedSize: 0
|
; CHECK: PrivateSegmentFixedSize: 0
|
||||||
; CHECK: KernargSegmentAlign: 8
|
; CHECK: KernargSegmentAlign: 8
|
||||||
; CHECK: WavefrontSize: 64
|
; CHECK: WavefrontSize: 64
|
||||||
; CHECK: NumSGPRs: 6
|
; CHECK: NumSGPRs: 8
|
||||||
; CHECK: NumVGPRs: 3
|
; CHECK: NumVGPRs: 6
|
||||||
; CHECK: MaxFlatWorkGroupSize: 256
|
; CHECK: MaxFlatWorkGroupSize: 256
|
||||||
define amdgpu_kernel void @test(
|
define amdgpu_kernel void @test(
|
||||||
half addrspace(1)* %r,
|
half addrspace(1)* %r,
|
||||||
|
@ -31,18 +31,24 @@ entry:
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: - Name: num_spilled_sgprs
|
; CHECK-LABEL: - Name: num_spilled_sgprs
|
||||||
; CHECK: SymbolName: 'num_spilled_sgprs@kd'
|
; CHECK: SymbolName: 'num_spilled_sgprs@kd'
|
||||||
; CHECK: CodeProps:
|
; CHECK: CodeProps:
|
||||||
; CHECK: NumSpilledSGPRs: 41
|
; GFX700: NumSpilledSGPRs: 40
|
||||||
|
; GFX803: NumSpilledSGPRs: 24
|
||||||
|
; GFX900: NumSpilledSGPRs: 24
|
||||||
define amdgpu_kernel void @num_spilled_sgprs(
|
define amdgpu_kernel void @num_spilled_sgprs(
|
||||||
i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %out2,
|
i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, [8 x i32],
|
||||||
i32 addrspace(1)* %out3, i32 addrspace(1)* %out4, i32 addrspace(1)* %out5,
|
i32 addrspace(1)* %out2, i32 addrspace(1)* %out3, [8 x i32],
|
||||||
i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, i32 addrspace(1)* %out8,
|
i32 addrspace(1)* %out4, i32 addrspace(1)* %out5, [8 x i32],
|
||||||
i32 addrspace(1)* %out9, i32 addrspace(1)* %outa, i32 addrspace(1)* %outb,
|
i32 addrspace(1)* %out6, i32 addrspace(1)* %out7, [8 x i32],
|
||||||
i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, i32 addrspace(1)* %oute,
|
i32 addrspace(1)* %out8, i32 addrspace(1)* %out9, [8 x i32],
|
||||||
i32 addrspace(1)* %outf, i32 %in0, i32 %in1, i32 %in2, i32 %in3, i32 %in4,
|
i32 addrspace(1)* %outa, i32 addrspace(1)* %outb, [8 x i32],
|
||||||
i32 %in5, i32 %in6, i32 %in7, i32 %in8, i32 %in9, i32 %ina, i32 %inb,
|
i32 addrspace(1)* %outc, i32 addrspace(1)* %outd, [8 x i32],
|
||||||
|
i32 addrspace(1)* %oute, i32 addrspace(1)* %outf, [8 x i32],
|
||||||
|
i32 %in0, i32 %in1, i32 %in2, i32 %in3, [8 x i32],
|
||||||
|
i32 %in4, i32 %in5, i32 %in6, i32 %in7, [8 x i32],
|
||||||
|
i32 %in8, i32 %in9, i32 %ina, i32 %inb, [8 x i32],
|
||||||
i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 {
|
i32 %inc, i32 %ind, i32 %ine, i32 %inf) #0 {
|
||||||
entry:
|
entry:
|
||||||
store i32 %in0, i32 addrspace(1)* %out0
|
store i32 %in0, i32 addrspace(1)* %out0
|
||||||
|
@ -64,7 +70,7 @@ entry:
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; CHECK: - Name: num_spilled_vgprs
|
; CHECK-LABEL: - Name: num_spilled_vgprs
|
||||||
; CHECK: SymbolName: 'num_spilled_vgprs@kd'
|
; CHECK: SymbolName: 'num_spilled_vgprs@kd'
|
||||||
; CHECK: CodeProps:
|
; CHECK: CodeProps:
|
||||||
; CHECK: NumSpilledVGPRs: 14
|
; CHECK: NumSpilledVGPRs: 14
|
||||||
|
|
|
@ -344,114 +344,114 @@ define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float
|
||||||
|
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_0.0_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_0.0_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 0.0
|
%y = fadd double %x, 0.0
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_0.5_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_0.5_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 0.5
|
%y = fadd double %x, 0.5
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, -0.5
|
%y = fadd double %x, -0.5
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_1.0_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_1.0_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 1.0
|
%y = fadd double %x, 1.0
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, -1.0
|
%y = fadd double %x, -1.0
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_2.0_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_2.0_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 2.0
|
%y = fadd double %x, 2.0
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, -2.0
|
%y = fadd double %x, -2.0
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_4.0_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_4.0_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 4.0
|
%y = fadd double %x, 4.0
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, -4.0
|
%y = fadd double %x, -4.0
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_inv_2pi_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_inv_2pi_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
|
; SI-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
|
||||||
; SI-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30
|
; SI-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30
|
||||||
; SI: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
|
; SI: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
|
||||||
|
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}}
|
; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}}
|
||||||
; VI: buffer_store_dwordx2 [[REG]]
|
; VI: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 0x3fc45f306dc9c882
|
%y = fadd double %x, 0x3fc45f306dc9c882
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
@ -461,40 +461,40 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out,
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
|
; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
|
||||||
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30
|
; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30
|
||||||
; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
|
; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
|
||||||
define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 0xbfc45f306dc9c882
|
%y = fadd double %x, 0xbfc45f306dc9c882
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_1_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_1_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}}
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}}
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 0x0000000000000001
|
%y = fadd double %x, 0x0000000000000001
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_2_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_2_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}}
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}}
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 0x0000000000000002
|
%y = fadd double %x, 0x0000000000000002
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_16_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_16_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 0x0000000000000010
|
%y = fadd double %x, 0x0000000000000010
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
@ -504,7 +504,7 @@ define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, doub
|
||||||
; GCN: v_mov_b32_e32 v0, -1
|
; GCN: v_mov_b32_e32 v0, -1
|
||||||
; GCN: v_mov_b32_e32 v1, v0
|
; GCN: v_mov_b32_e32 v1, v0
|
||||||
; GCN: buffer_store_dwordx2 v[0:1]
|
; GCN: buffer_store_dwordx2 v[0:1]
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 0xffffffffffffffff
|
%y = fadd double %x, 0xffffffffffffffff
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
@ -514,7 +514,7 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, d
|
||||||
; GCN: v_mov_b32_e32 v0, -2
|
; GCN: v_mov_b32_e32 v0, -2
|
||||||
; GCN: v_mov_b32_e32 v1, -1
|
; GCN: v_mov_b32_e32 v1, -1
|
||||||
; GCN: buffer_store_dwordx2 v[0:1]
|
; GCN: buffer_store_dwordx2 v[0:1]
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 0xfffffffffffffffe
|
%y = fadd double %x, 0xfffffffffffffffe
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
@ -524,29 +524,29 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, d
|
||||||
; GCN: v_mov_b32_e32 v0, -16
|
; GCN: v_mov_b32_e32 v0, -16
|
||||||
; GCN: v_mov_b32_e32 v1, -1
|
; GCN: v_mov_b32_e32 v1, -1
|
||||||
; GCN: buffer_store_dwordx2 v[0:1]
|
; GCN: buffer_store_dwordx2 v[0:1]
|
||||||
define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 0xfffffffffffffff0
|
%y = fadd double %x, 0xfffffffffffffff0
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_63_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_63_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 0x000000000000003F
|
%y = fadd double %x, 0x000000000000003F
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_inline_imm_64_f64:
|
; GCN-LABEL: {{^}}add_inline_imm_64_f64:
|
||||||
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64
|
; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64
|
||||||
; GCN: buffer_store_dwordx2 [[REG]]
|
; GCN: buffer_store_dwordx2 [[REG]]
|
||||||
define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
|
define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, [8 x i32], double %x) {
|
||||||
%y = fadd double %x, 0x0000000000000040
|
%y = fadd double %x, 0x0000000000000040
|
||||||
store double %y, double addrspace(1)* %out
|
store double %y, double addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -310,9 +310,9 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)*
|
||||||
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
|
; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
|
||||||
; GFX9: buffer_store_dword [[REG]]
|
; GFX9: buffer_store_dword [[REG]]
|
||||||
|
|
||||||
; VI: buffer_load_dword
|
; VI-DAG: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
|
||||||
|
; VI-DAG: buffer_load_dword
|
||||||
; VI-NOT: and
|
; VI-NOT: and
|
||||||
; VI: v_mov_b32_e32 [[CONST05:v[0-9]+]], 0x3800
|
|
||||||
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[CONST05]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
|
; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
|
||||||
; VI: v_or_b32
|
; VI: v_or_b32
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tahiti -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
|
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,GCN-NO-TONGA %s
|
||||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
|
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,GCN-TONGA %s
|
||||||
|
|
||||||
; FIXME: Broken on evergreen
|
; FIXME: Broken on evergreen
|
||||||
; FIXME: For some reason the 8 and 16 vectors are being stored as
|
; FIXME: For some reason the 8 and 16 vectors are being stored as
|
||||||
|
@ -75,8 +75,9 @@ define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out,
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}insertelement_to_sgpr:
|
; GCN-LABEL: {{^}}insertelement_to_sgpr:
|
||||||
; GCN-NOT: v_readfirstlane
|
; GCN-NOT: v_readfirstlane
|
||||||
define amdgpu_ps <4 x float> @insertelement_to_sgpr(<4 x i32> inreg %samp) nounwind {
|
define <4 x float> @insertelement_to_sgpr() nounwind {
|
||||||
%tmp1 = insertelement <4 x i32> %samp, i32 0, i32 0
|
%tmp = load <4 x i32>, <4 x i32> addrspace(4)* undef
|
||||||
|
%tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
|
||||||
%tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
|
%tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.2d.v4f32.f32(i32 1, float undef, float undef, <8 x i32> undef, <4 x i32> %tmp1, i1 0, i32 0, i32 0)
|
||||||
ret <4 x float> %tmp2
|
ret <4 x float> %tmp2
|
||||||
}
|
}
|
||||||
|
@ -154,11 +155,11 @@ define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
|
; GCN-LABEL: {{^}}dynamic_insertelement_v4i32:
|
||||||
; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
|
; GCN: s_load_dword [[SVAL:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}
|
||||||
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
|
; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
|
||||||
; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]]
|
; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]]
|
||||||
; GCN: buffer_store_dwordx4
|
; GCN: buffer_store_dwordx4
|
||||||
define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind {
|
define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, [8 x i32], i32 %val) nounwind {
|
||||||
%vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
|
%vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
|
||||||
store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
|
store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
|
||||||
ret void
|
ret void
|
||||||
|
@ -201,23 +202,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
|
; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
|
||||||
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
|
||||||
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
|
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; VI-NOT: _load
|
; VI-NOT: _load
|
||||||
; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[LOAD]], 8
|
|
||||||
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
||||||
; VI: v_lshlrev_b16_e64 [[ELT1_SHIFT:v[0-9]+]], 8, [[ELT1]]
|
|
||||||
; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
|
|
||||||
; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1
|
; VI: v_lshlrev_b16_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], -1
|
||||||
|
; VI: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
|
||||||
; VI: v_xor_b32_e32 [[NOT:v[0-9]+]], -1, [[MASK]]
|
; VI: v_xor_b32_e32 [[NOT_MASK:v[0-9]+]], -1, [[MASK]]
|
||||||
; VI: v_or_b32_e32 [[BUILD_VECTOR:v[0-9]+]], [[ELT0]], [[ELT1_SHIFT]]
|
; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[LOAD]], [[NOT_MASK]]
|
||||||
|
; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[AND_NOT_MASK]]
|
||||||
; VI: v_and_b32_e32 [[AND1:v[0-9]+]], [[NOT]], [[BUILD_VECTOR]]
|
|
||||||
; VI-DAG: v_and_b32_e32 [[INSERT:v[0-9]+]], 5, [[MASK]]
|
|
||||||
; VI: v_or_b32_e32 [[OR:v[0-9]+]], [[INSERT]], [[BUILD_VECTOR]]
|
|
||||||
; VI: buffer_store_short [[OR]]
|
; VI: buffer_store_short [[OR]]
|
||||||
define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
|
define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind {
|
||||||
%vecins = insertelement <2 x i8> %a, i8 5, i32 %b
|
%vecins = insertelement <2 x i8> %a, i8 5, i32 %b
|
||||||
store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
|
store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
|
||||||
ret void
|
ret void
|
||||||
|
@ -227,68 +222,51 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
|
||||||
; isTypeDesirableForOp in SimplifyDemandedBits
|
; isTypeDesirableForOp in SimplifyDemandedBits
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
|
; GCN-LABEL: {{^}}dynamic_insertelement_v3i8:
|
||||||
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
|
||||||
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
|
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; VI-NOT: _load
|
; VI-NOT: _load
|
||||||
|
|
||||||
; VI: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[LOAD]], 8
|
; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
|
||||||
; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[VEC_HI]]
|
|
||||||
; VI: s_and_b32 [[ELT0:s[0-9]+]], [[LOAD]], 0xff{{$}}
|
|
||||||
; VI: v_or_b32_e32 [[BUILD_VEC:v[0-9]+]], [[VEC_HI]], [[ELT2]]
|
|
||||||
; VI: s_and_b32 [[ELT2:s[0-9]+]], [[LOAD]], 0xff0000{{$}}
|
|
||||||
|
|
||||||
; VI: s_mov_b32 [[MASK16:s[0-9]+]], 0xffff{{$}}
|
|
||||||
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
; VI: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
||||||
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], [[MASK16]], [[SCALED_IDX]]
|
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
|
||||||
|
; VI: s_not_b32 [[NOT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
|
||||||
|
; VI: s_and_b32 [[AND_NOT_MASK:s[0-9]+]], [[NOT_MASK]], [[LOAD]]
|
||||||
|
; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
|
||||||
|
; VI: s_lshr_b32 [[HI2:s[0-9]+]], [[AND_NOT_MASK]], 16
|
||||||
|
|
||||||
; VI: v_mov_b32_e32 [[V_ELT2:v[0-9]+]], [[ELT2]]
|
; VI-DAG: buffer_store_short [[BFI]]
|
||||||
; VI: v_or_b32_sdwa [[SDWA:v[0-9]+]], [[BUILD_VEC]], [[V_ELT2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
; VI-DAG: v_mov_b32_e32 [[V_HI2:v[0-9]+]], [[HI2]]
|
||||||
; VI: s_not_b32 [[NOT_SHIFT_MASK:s[0-9]+]], [[SHIFTED_MASK]]
|
; VI: buffer_store_byte [[V_HI2]]
|
||||||
; VI: v_and_b32_e32 [[AND_NOT_MASK:v[0-9]+]], [[NOT_SHIFT_MASK]], [[SDWA]]
|
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind {
|
||||||
; VI: v_lshrrev_b32_e32 [[HI2:v[0-9]+]], 16, [[AND_NOT_MASK]]
|
|
||||||
; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SCALED_IDX]], 5, [[SDWA]]
|
|
||||||
; VI: buffer_store_short [[BFI]]
|
|
||||||
; VI: buffer_store_byte [[HI2]]
|
|
||||||
define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
|
|
||||||
%vecins = insertelement <3 x i8> %a, i8 5, i32 %b
|
%vecins = insertelement <3 x i8> %a, i8 5, i32 %b
|
||||||
store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
|
store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
|
; GCN-LABEL: {{^}}dynamic_insertelement_v4i8:
|
||||||
; VI: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI: s_load_dword [[LOAD:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x28
|
||||||
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
|
; VI-NEXT: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; VI-NOT: _load
|
; VI-NOT: _load
|
||||||
|
|
||||||
; VI: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 8
|
; VI: v_mov_b32_e32 [[V_LOAD:v[0-9]+]], [[LOAD]]
|
||||||
; VI: v_lshlrev_b16_e64 [[ELT2:v[0-9]+]], 8, [[ELT1]]
|
|
||||||
; VI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0xff{{$}}
|
|
||||||
|
|
||||||
|
|
||||||
; VI: s_lshr_b32 [[ELT3:s[0-9]+]], [[VEC]], 24
|
|
||||||
; VI: s_lshr_b32 [[ELT2:s[0-9]+]], [[VEC]], 16
|
|
||||||
; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, 8, [[ELT3]]
|
|
||||||
; VI: v_or_b32_e32
|
|
||||||
; VI: v_or_b32_sdwa
|
|
||||||
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
||||||
; VI: v_or_b32_sdwa
|
; VI: s_lshl_b32 [[SHIFTED_MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
|
||||||
; VI: s_lshl_b32
|
; VI: v_bfi_b32 [[BFI:v[0-9]+]], [[SHIFTED_MASK]], 5, [[V_LOAD]]
|
||||||
; VI: v_bfi_b32
|
; VI: buffer_store_dword [[BFI]]
|
||||||
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
|
define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind {
|
||||||
%vecins = insertelement <4 x i8> %a, i8 5, i32 %b
|
%vecins = insertelement <4 x i8> %a, i8 5, i32 %b
|
||||||
store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
|
store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8:
|
; GCN-LABEL: {{^}}s_dynamic_insertelement_v8i8:
|
||||||
; VI-NOT: {{buffer|flat|global}}
|
; VI-NOT: {{buffer|flat|global}}_load
|
||||||
; VI: s_load_dword [[IDX:s[0-9]]]
|
; VI-DAG: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
|
||||||
; VI-NOT: {{buffer|flat|global}}
|
; VI-DAG: s_load_dword [[IDX:s[0-9]]], s[4:5], 0x10
|
||||||
; VI: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
|
; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0{{$}}
|
||||||
; VI-NOT: {{buffer|flat|global}}
|
; VI-DAG: s_load_dwordx2 [[VEC:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
|
||||||
|
|
||||||
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
; VI-DAG: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 3
|
||||||
; VI-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
|
|
||||||
; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
|
; VI-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff
|
||||||
; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
|
; VI: s_lshl_b64 s{{\[}}[[MASK_SHIFT_LO:[0-9]+]]:[[MASK_SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
|
||||||
; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
|
; VI: s_not_b64 [[NOT_MASK:s\[[0-9]+:[0-9]+\]]], s{{\[}}[[MASK_SHIFT_LO]]:[[MASK_SHIFT_HI]]{{\]}}
|
||||||
|
@ -307,13 +285,8 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
|
; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
|
||||||
; GCN: s_load_dwordx2
|
; GCN: s_load_dwordx2
|
||||||
|
; GCN: s_load_dwordx4
|
||||||
; GCN: s_load_dword s
|
; GCN: s_load_dword s
|
||||||
; GCN: s_load_dword s
|
|
||||||
; GCN: s_load_dword s
|
|
||||||
; GCN: s_load_dword s
|
|
||||||
; GCN: s_load_dword s
|
|
||||||
; GCN-NOT: _load_
|
|
||||||
|
|
||||||
|
|
||||||
; GCN: buffer_store_byte
|
; GCN: buffer_store_byte
|
||||||
; GCN: buffer_store_byte
|
; GCN: buffer_store_byte
|
||||||
|
@ -368,7 +341,7 @@ endif:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_insertelement_v2f64:
|
; GCN-LABEL: {{^}}dynamic_insertelement_v2f64:
|
||||||
; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}}
|
; GCN-DAG: s_load_dwordx4 s{{\[}}[[A_ELT0:[0-9]+]]:[[A_ELT3:[0-9]+]]{{\]}}
|
||||||
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x11|0x44}}{{$}}
|
; GCN-DAG: s_load_dword [[IDX:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x18|0x60}}{{$}}
|
||||||
|
|
||||||
; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
|
; GCN-DAG: s_lshl_b32 [[SCALEDIDX:s[0-9]+]], [[IDX]], 1{{$}}
|
||||||
|
|
||||||
|
@ -390,7 +363,7 @@ endif:
|
||||||
|
|
||||||
; GCN: buffer_store_dwordx4
|
; GCN: buffer_store_dwordx4
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
|
define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, [8 x i32], <2 x double> %a, [8 x i32], i32 %b) nounwind {
|
||||||
%vecins = insertelement <2 x double> %a, double 8.0, i32 %b
|
%vecins = insertelement <2 x double> %a, double 8.0, i32 %b
|
||||||
store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
|
store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
|
||||||
ret void
|
ret void
|
||||||
|
@ -420,19 +393,18 @@ define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %
|
||||||
; space is also 2x what should be required.
|
; space is also 2x what should be required.
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
|
; GCN-LABEL: {{^}}dynamic_insertelement_v4f64:
|
||||||
; GCN: SCRATCH_RSRC_DWORD
|
|
||||||
|
|
||||||
; Stack store
|
; Stack store
|
||||||
|
|
||||||
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
|
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
|
||||||
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
|
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
|
||||||
|
|
||||||
; Write element
|
; Write element
|
||||||
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
|
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}
|
||||||
|
|
||||||
; Stack reload
|
; Stack reload
|
||||||
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
|
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:32{{$}}
|
||||||
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
|
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:48{{$}}
|
||||||
|
|
||||||
; Store result
|
; Store result
|
||||||
; GCN: buffer_store_dwordx4
|
; GCN: buffer_store_dwordx4
|
||||||
|
@ -447,19 +419,17 @@ define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
|
; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
|
||||||
; GCN-DAG: SCRATCH_RSRC_DWORD
|
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}}
|
||||||
|
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}}
|
||||||
|
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}}
|
||||||
|
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}}
|
||||||
|
|
||||||
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}}
|
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], {{s[0-9]+}} offen{{$}}
|
||||||
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}}
|
|
||||||
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}}
|
|
||||||
; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}}
|
|
||||||
|
|
||||||
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
|
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:64{{$}}
|
||||||
|
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:80{{$}}
|
||||||
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}}
|
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:96{{$}}
|
||||||
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}}
|
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s[0:3], {{s[0-9]+}} offset:112{{$}}
|
||||||
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}}
|
|
||||||
; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}}
|
|
||||||
|
|
||||||
; GCN: buffer_store_dwordx4
|
; GCN: buffer_store_dwordx4
|
||||||
; GCN: buffer_store_dwordx4
|
; GCN: buffer_store_dwordx4
|
||||||
|
|
|
@ -1,9 +1,9 @@
|
||||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
|
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
|
||||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s
|
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,VI,GFX89 %s
|
||||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
|
; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CIVI,CI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
|
; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
|
||||||
; GCN: s_load_dword [[VEC:s[0-9]+]]
|
; GCN: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
|
||||||
|
|
||||||
; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
|
; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
|
||||||
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}}
|
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}}
|
||||||
|
@ -18,17 +18,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out,
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reg:
|
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reg:
|
||||||
; GCN: s_load_dword [[ELT0:s[0-9]+]]
|
; GCN-DAG: s_load_dword [[ELT_LOAD:s[0-9]+]], s[4:5],
|
||||||
; GCN: s_load_dword [[VEC:s[0-9]+]]
|
; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
|
||||||
|
|
||||||
; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
|
; CIVI-DAG: s_and_b32 [[ELT0:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}}
|
||||||
; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
|
; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
|
||||||
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
|
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
|
||||||
|
|
||||||
; GFX9-NOT: [[ELT0]]
|
; GFX9-NOT: [[ELT0]]
|
||||||
; GFX9-NOT: [[VEC]]
|
; GFX9-NOT: [[VEC]]
|
||||||
; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]]
|
; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT_LOAD]], [[VEC]]
|
||||||
define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
|
define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
|
||||||
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
|
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
|
||||||
%vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
|
%vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
|
||||||
store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
|
store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
|
||||||
|
@ -36,29 +36,29 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_multi_use_hi_reg:
|
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_multi_use_hi_reg:
|
||||||
; GCN: s_load_dword [[ELT0:s[0-9]+]]
|
; GCN-DAG: s_load_dword [[ELT_LOAD:s[0-9]+]], s[4:5],
|
||||||
; GCN: s_load_dword [[VEC:s[0-9]+]]
|
; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
|
||||||
|
|
||||||
; CI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
|
; CI-DAG: s_and_b32 [[ELT0_MASKED:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}}
|
||||||
; CI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
|
; CI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
|
||||||
; CI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
|
; CI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
|
||||||
; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
|
; CI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0_MASKED]], [[ELT1]]
|
||||||
; CI-DAG: ; use [[SHR]]
|
; CI-DAG: ; use [[SHR]]
|
||||||
|
|
||||||
|
|
||||||
; FIXME: Should be able to avoid mask of upper bits
|
; FIXME: Should be able to avoid mask of upper bits
|
||||||
; VI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
|
; VI-DAG: s_and_b32 [[ELT_MASKED:s[0-9]+]], [[ELT_LOAD]], 0xffff{{$}}
|
||||||
; VI-DAG: s_and_b32 [[VEC_HIMASK:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
|
; VI-DAG: s_and_b32 [[VEC_HIMASK:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
|
||||||
; VI: s_or_b32 [[OR:s[0-9]+]], [[ELT0]], [[VEC_HIMASK]]
|
; VI-DAG: s_or_b32 [[OR:s[0-9]+]], [[ELT_MASKED]], [[VEC_HIMASK]]
|
||||||
; VI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
|
; VI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
|
||||||
|
|
||||||
; VI-DAG: ; use [[SHR]]
|
; VI-DAG: ; use [[SHR]]
|
||||||
|
|
||||||
|
|
||||||
; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
|
; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
|
||||||
; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
|
; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_LOAD]], [[ELT1]]
|
||||||
; GFX9-DAG: ; use [[ELT1]]
|
; GFX9-DAG: ; use [[ELT1]]
|
||||||
define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
|
define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
|
||||||
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
|
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
|
||||||
%elt1 = extractelement <2 x i16> %vec, i32 1
|
%elt1 = extractelement <2 x i16> %vec, i32 1
|
||||||
%vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
|
%vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
|
||||||
|
@ -69,16 +69,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi:
|
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi:
|
||||||
; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
|
; GCN-DAG: s_load_dword [[ELT_ARG:s[0-9]+]], s[4:5],
|
||||||
; GCN: s_load_dword [[VEC:s[0-9]+]]
|
; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
|
||||||
|
|
||||||
|
; CIVI: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
|
||||||
; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
|
; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
|
||||||
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_ARG]], [[ELT1]]
|
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[ELT1]]
|
||||||
|
|
||||||
; GFX9-NOT: [[ELT0]]
|
; GFX9-NOT: [[ELT0]]
|
||||||
; GFX9-NOT: [[VEC]]
|
; GFX9-NOT: [[VEC]]
|
||||||
; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]]
|
; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]]
|
||||||
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i32 %elt.arg) #0 {
|
define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i32 %elt.arg) #0 {
|
||||||
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
|
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
|
||||||
%elt.hi = lshr i32 %elt.arg, 16
|
%elt.hi = lshr i32 %elt.arg, 16
|
||||||
%elt = trunc i32 %elt.hi to i16
|
%elt = trunc i32 %elt.hi to i16
|
||||||
|
@ -88,7 +89,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)*
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1:
|
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1:
|
||||||
; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
|
; GCN: s_load_dword [[ELT_ARG:s[0-9]+]],
|
||||||
; GCN: s_load_dword [[VEC:s[0-9]+]],
|
; GCN: s_load_dword [[VEC:s[0-9]+]],
|
||||||
|
|
||||||
; CIVI-DAG: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
|
; CIVI-DAG: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
|
||||||
|
@ -110,7 +111,7 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> a
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1:
|
; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1:
|
||||||
; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
|
; GCN: s_load_dword [[ELT_ARG:s[0-9]+]],
|
||||||
; GCN: s_load_dword [[VEC:s[0-9]+]],
|
; GCN: s_load_dword [[VEC:s[0-9]+]],
|
||||||
|
|
||||||
; CI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
|
; CI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
|
||||||
|
@ -161,15 +162,16 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out,
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_insertelement_v2i16_1_reg:
|
; GCN-LABEL: {{^}}s_insertelement_v2i16_1_reg:
|
||||||
; GCN: s_load_dword [[ELT1:s[0-9]+]]
|
; GCN-DAG: s_load_dword [[ELT1_LOAD:s[0-9]+]], s[4:5],
|
||||||
; GCN: s_load_dword [[VEC:s[0-9]+]]
|
; GCN-DAG: s_load_dword [[VEC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x0
|
||||||
|
|
||||||
|
; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[ELT1_LOAD]], 16
|
||||||
; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
|
; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
|
||||||
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
|
; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
|
||||||
|
|
||||||
; GCN-NOT: shlr
|
; GCN-NOT: shlr
|
||||||
; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]]
|
; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1_LOAD]]
|
||||||
define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, i16 %elt) #0 {
|
define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(4)* %vec.ptr, [8 x i32], i16 %elt) #0 {
|
||||||
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
|
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
|
||||||
%vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
|
%vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
|
||||||
store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
|
store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
|
||||||
|
@ -444,12 +446,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspac
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
|
; GCN-LABEL: {{^}}v_insertelement_v2i16_dynamic_vgpr:
|
||||||
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
|
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
|
||||||
; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
|
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
|
||||||
|
|
||||||
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
|
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
|
||||||
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
|
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
|
||||||
|
|
||||||
; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
|
|
||||||
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
|
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
|
||||||
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
|
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
|
||||||
|
|
||||||
|
@ -473,12 +474,11 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspac
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
|
; GCN-LABEL: {{^}}v_insertelement_v2f16_dynamic_vgpr:
|
||||||
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
|
; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
|
||||||
; CI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
|
; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
|
||||||
|
|
||||||
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
|
; GCN: {{flat|global}}_load_dword [[IDX:v[0-9]+]]
|
||||||
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
|
; GCN: {{flat|global}}_load_dword [[VEC:v[0-9]+]]
|
||||||
|
|
||||||
; GFX89-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
|
|
||||||
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
|
; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
|
||||||
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
|
; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
|
||||||
|
|
||||||
|
@ -501,17 +501,18 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_insertelement_v4f16_0:
|
; GCN-LABEL: {{^}}v_insertelement_v4f16_0:
|
||||||
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
|
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[4:5],
|
||||||
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
||||||
|
|
||||||
; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
|
; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
|
||||||
; GFX9: v_bfi_b32 v[[INS_LO:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[LO]]
|
; GFX9: v_bfi_b32 v[[INS_LO:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[LO]]
|
||||||
|
|
||||||
|
; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
|
||||||
; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[LO]]
|
; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[LO]]
|
||||||
; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL]], [[AND]]
|
; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL_MASKED]], [[AND]]
|
||||||
|
|
||||||
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_LO]]:[[HI]]{{\]}}
|
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_LO]]:[[HI]]{{\]}}
|
||||||
define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
|
define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||||
%tid.ext = sext i32 %tid to i64
|
%tid.ext = sext i32 %tid to i64
|
||||||
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
|
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
|
||||||
|
@ -531,12 +532,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out
|
||||||
; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
|
; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
|
||||||
; GFX9: v_lshl_or_b32 v[[INS_HALF:[0-9]+]], [[VAL]], 16, [[AND]]
|
; GFX9: v_lshl_or_b32 v[[INS_HALF:[0-9]+]], [[VAL]], 16, [[AND]]
|
||||||
|
|
||||||
; VI: s_lshl_b32 [[VAL]], [[VAL]], 16
|
; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
|
||||||
; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]]
|
; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]]
|
||||||
; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], v[[LO]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], v[[LO]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||||
|
|
||||||
|
; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
|
||||||
; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
|
; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
|
||||||
; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL]], [[AND]]
|
; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL_HI]], [[AND]]
|
||||||
|
|
||||||
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_HALF]]:[[HI]]{{\]}}
|
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_HALF]]:[[HI]]{{\]}}
|
||||||
define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
|
define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
|
||||||
|
@ -553,17 +555,18 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_insertelement_v4f16_2:
|
; GCN-LABEL: {{^}}v_insertelement_v4f16_2:
|
||||||
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
|
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]], s[4:5],
|
||||||
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
||||||
|
|
||||||
; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
|
; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
|
||||||
; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
|
; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
|
||||||
|
|
||||||
|
; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
|
||||||
; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
|
; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
|
||||||
; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
|
; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]]
|
||||||
|
|
||||||
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
|
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
|
||||||
define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
|
define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, [8 x i32], i32 %val) #0 {
|
||||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||||
%tid.ext = sext i32 %tid to i64
|
%tid.ext = sext i32 %tid to i64
|
||||||
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
|
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
|
||||||
|
@ -583,12 +586,13 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out
|
||||||
; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
|
; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
|
||||||
; GFX9: v_lshl_or_b32 v[[INS_HI:[0-9]+]], [[VAL]], 16, [[AND]]
|
; GFX9: v_lshl_or_b32 v[[INS_HI:[0-9]+]], [[VAL]], 16, [[AND]]
|
||||||
|
|
||||||
; VI: s_lshl_b32 [[VAL]], [[VAL]], 16
|
; VI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
|
||||||
; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]]
|
; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL_HI]]
|
||||||
; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], v[[HI]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], v[[HI]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||||
|
|
||||||
|
; CI: s_lshl_b32 [[VAL_HI:s[0-9]+]], [[VAL]], 16
|
||||||
; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
|
; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
|
||||||
; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
|
; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_HI]], [[AND]]
|
||||||
|
|
||||||
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
|
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
|
||||||
define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
|
define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
|
||||||
|
@ -611,8 +615,9 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out
|
||||||
; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
|
; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
|
||||||
; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
|
; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
|
||||||
|
|
||||||
|
; CIVI: s_and_b32 [[VAL_MASKED:s[0-9]+]], [[VAL]], 0xffff{{$}}
|
||||||
; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
|
; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
|
||||||
; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
|
; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL_MASKED]], [[AND]]
|
||||||
|
|
||||||
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
|
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
|
||||||
define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
|
define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
|
||||||
|
|
|
@ -210,8 +210,10 @@ entry:
|
||||||
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
|
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 41
|
||||||
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
|
; EG-DAG: VTX_READ_8 T{{[0-9]}}.X, T{{[0-9]}}.X, 42
|
||||||
|
|
||||||
; GCN: s_load_dword s
|
; SI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||||
; GCN-NOT: {{buffer|flat|global}}_load_
|
|
||||||
|
; VI-MESA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||||
|
; VI-HSA: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
|
||||||
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
|
define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
|
||||||
entry:
|
entry:
|
||||||
store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
|
store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
|
||||||
|
@ -226,8 +228,7 @@ entry:
|
||||||
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
|
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
|
||||||
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
|
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
|
||||||
|
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||||
; SI: s_load_dword s
|
|
||||||
|
|
||||||
; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
|
; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
|
||||||
; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||||
|
@ -236,6 +237,7 @@ entry:
|
||||||
store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
|
store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}v3i32_arg:
|
; FUNC-LABEL: {{^}}v3i32_arg:
|
||||||
; HSA-VI: kernarg_segment_byte_size = 32
|
; HSA-VI: kernarg_segment_byte_size = 32
|
||||||
; HSA-VI: kernarg_segment_alignment = 4
|
; HSA-VI: kernarg_segment_alignment = 4
|
||||||
|
@ -274,8 +276,8 @@ entry:
|
||||||
; EG: VTX_READ_8
|
; EG: VTX_READ_8
|
||||||
; EG: VTX_READ_8
|
; EG: VTX_READ_8
|
||||||
|
|
||||||
; GCN: s_load_dword s
|
; GCN-DAG: s_load_dwordx2 s
|
||||||
; GCN-NOT: {{buffer|flat|global}}_load_
|
; GCN-DAG: s_load_dword s
|
||||||
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
|
define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
|
||||||
entry:
|
entry:
|
||||||
store <4 x i8> %in, <4 x i8> addrspace(1)* %out
|
store <4 x i8> %in, <4 x i8> addrspace(1)* %out
|
||||||
|
@ -290,12 +292,18 @@ entry:
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
|
|
||||||
; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0xb
|
||||||
; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
|
|
||||||
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
|
; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9
|
||||||
|
|
||||||
; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x2c
|
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
|
||||||
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
|
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c
|
||||||
|
|
||||||
|
|
||||||
|
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x24
|
||||||
|
; MESA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x2c
|
||||||
|
|
||||||
|
; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x0
|
||||||
|
; HSA-VI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x8
|
||||||
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
|
define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
|
||||||
entry:
|
entry:
|
||||||
store <4 x i16> %in, <4 x i16> addrspace(1)* %out
|
store <4 x i16> %in, <4 x i16> addrspace(1)* %out
|
||||||
|
@ -348,23 +356,16 @@ entry:
|
||||||
; EG: VTX_READ_8
|
; EG: VTX_READ_8
|
||||||
; EG: VTX_READ_8
|
; EG: VTX_READ_8
|
||||||
|
|
||||||
|
; SI-NOT: {{buffer|flat|global}}_load
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dwordx2 s
|
; SI: s_load_dwordx2 s
|
||||||
|
; SI-NEXT: s_load_dwordx2 s
|
||||||
; SI-NOT: {{buffer|flat|global}}_load
|
; SI-NOT: {{buffer|flat|global}}_load
|
||||||
|
|
||||||
; VI: s_load_dword s
|
; VI: s_load_dwordx2 s
|
||||||
; VI: s_load_dword s
|
; VI-NEXT: s_load_dwordx2 s
|
||||||
|
; VI-NOT: lshl
|
||||||
; VI: v_lshlrev_b16
|
; VI-NOT: _or
|
||||||
; VI: v_or_b32_e32
|
; VI-NOT: _sdwa
|
||||||
; VI: v_or_b32_sdwa
|
|
||||||
; VI: v_or_b32_sdwa
|
|
||||||
; VI: v_lshlrev_b16
|
|
||||||
; VI: s_lshr_b32
|
|
||||||
; VI: v_or_b32_sdwa
|
|
||||||
; VI: v_or_b32_sdwa
|
|
||||||
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
|
define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
|
||||||
entry:
|
entry:
|
||||||
store <8 x i8> %in, <8 x i8> addrspace(1)* %out
|
store <8 x i8> %in, <8 x i8> addrspace(1)* %out
|
||||||
|
@ -383,19 +384,14 @@ entry:
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
|
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dwordx4
|
||||||
; SI: s_load_dword s
|
; SI-NEXT: s_load_dwordx2
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dwordx2
|
|
||||||
; SI-NOT: {{buffer|flat|global}}_load
|
; SI-NOT: {{buffer|flat|global}}_load
|
||||||
|
|
||||||
|
|
||||||
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34
|
; MESA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34
|
||||||
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x3c
|
|
||||||
|
|
||||||
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
|
; HSA-VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
|
||||||
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x18
|
|
||||||
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
|
define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
|
||||||
entry:
|
entry:
|
||||||
store <8 x i16> %in, <8 x i16> addrspace(1)* %out
|
store <8 x i16> %in, <8 x i16> addrspace(1)* %out
|
||||||
|
@ -413,6 +409,7 @@ entry:
|
||||||
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
|
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].Z
|
||||||
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
|
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
|
||||||
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
|
; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
|
||||||
|
|
||||||
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
|
; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
|
||||||
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
|
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
|
||||||
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
|
; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
|
||||||
|
@ -462,33 +459,16 @@ entry:
|
||||||
; EG: VTX_READ_8
|
; EG: VTX_READ_8
|
||||||
; EG: VTX_READ_8
|
; EG: VTX_READ_8
|
||||||
|
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dwordx4 s
|
||||||
; SI: s_load_dword s
|
; SI-NEXT: s_load_dwordx2 s
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dwordx2
|
|
||||||
; SI-NOT: {{buffer|flat|global}}_load
|
; SI-NOT: {{buffer|flat|global}}_load
|
||||||
|
|
||||||
|
|
||||||
; VI: s_load_dword s
|
; VI: s_load_dwordx4 s
|
||||||
; VI: s_load_dword s
|
; VI-NOT: shr
|
||||||
; VI: s_load_dword s
|
; VI-NOT: shl
|
||||||
; VI: s_load_dword s
|
; VI-NOT: _sdwa
|
||||||
|
; VI-NOT: _or_
|
||||||
; VI: s_lshr_b32
|
|
||||||
; VI: v_lshlrev_b16
|
|
||||||
; VI: s_lshr_b32
|
|
||||||
; VI: s_lshr_b32
|
|
||||||
; VI: v_or_b32_sdwa
|
|
||||||
; VI: v_or_b32_sdwa
|
|
||||||
; VI: v_lshlrev_b16
|
|
||||||
; VI: v_lshlrev_b16
|
|
||||||
; VI: v_or_b32_sdwa
|
|
||||||
; VI: v_or_b32_sdwa
|
|
||||||
; VI: v_lshlrev_b16
|
|
||||||
; VI: v_lshlrev_b16
|
|
||||||
; VI: v_or_b32_sdwa
|
|
||||||
; VI: v_or_b32_sdwa
|
|
||||||
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
|
define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
|
||||||
entry:
|
entry:
|
||||||
store <16 x i8> %in, <16 x i8> addrspace(1)* %out
|
store <16 x i8> %in, <16 x i8> addrspace(1)* %out
|
||||||
|
@ -516,27 +496,14 @@ entry:
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
; EG: VTX_READ_16
|
; EG: VTX_READ_16
|
||||||
|
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dwordx8 s
|
||||||
; SI: s_load_dword s
|
; SI-NEXT: s_load_dwordx2 s
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dword s
|
|
||||||
; SI: s_load_dword s
|
|
||||||
|
|
||||||
; SI-NOT: {{buffer|flat|global}}_load
|
; SI-NOT: {{buffer|flat|global}}_load
|
||||||
|
|
||||||
|
|
||||||
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
|
; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
|
||||||
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x4c
|
|
||||||
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x54
|
|
||||||
; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x5c
|
|
||||||
|
|
||||||
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
|
; HSA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
|
||||||
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x28
|
|
||||||
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
|
|
||||||
; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x38
|
|
||||||
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
|
define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
|
||||||
entry:
|
entry:
|
||||||
store <16 x i16> %in, <16 x i16> addrspace(1)* %out
|
store <16 x i16> %in, <16 x i16> addrspace(1)* %out
|
||||||
|
@ -600,22 +567,21 @@ entry:
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}kernel_arg_i64:
|
; FUNC-LABEL: {{^}}kernel_arg_i64:
|
||||||
; MESA-GCN: s_load_dwordx2
|
; MESA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[0:1], 0x24
|
||||||
; MESA-GCN: s_load_dwordx2
|
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
|
||||||
|
|
||||||
; MESA-GCN: buffer_store_dwordx2
|
; MESA-GCN: buffer_store_dwordx2
|
||||||
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
|
|
||||||
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
|
define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
|
||||||
store i64 %a, i64 addrspace(1)* %out, align 8
|
store i64 %a, i64 addrspace(1)* %out, align 8
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}f64_kernel_arg:
|
; FUNC-LABEL: {{^}}f64_kernel_arg:
|
||||||
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
|
; SI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x9
|
||||||
; SI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0xb
|
; MESA-VI-DAG: s_load_dwordx4 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
|
||||||
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x24
|
|
||||||
; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
|
|
||||||
; MESA-GCN: buffer_store_dwordx2
|
; MESA-GCN: buffer_store_dwordx2
|
||||||
; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
|
|
||||||
|
; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x0
|
||||||
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
|
define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double %in) {
|
||||||
entry:
|
entry:
|
||||||
store double %in, double addrspace(1)* %out
|
store double %in, double addrspace(1)* %out
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
; GCN: s_load_dword s[[LO:[0-9]+]]
|
; GCN: s_load_dword s[[LO:[0-9]+]]
|
||||||
; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]]
|
; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[LO]]
|
||||||
; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
|
; GCN: buffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
|
||||||
define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, half %data, i32 %index) {
|
define amdgpu_kernel void @buffer_store_format_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %index) {
|
||||||
main_body:
|
main_body:
|
||||||
call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
|
call void @llvm.amdgcn.buffer.store.format.f16(half %data, <4 x i32> %rsrc, i32 %index, i32 0, i1 0, i1 0)
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -4,8 +4,8 @@ declare half @llvm.fabs.f16(half %a)
|
||||||
declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
|
declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}class_f16:
|
; GCN-LABEL: {{^}}class_f16:
|
||||||
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
|
; GCN-DAG: buffer_load_ushort v[[A_F16:[0-9]+]]
|
||||||
; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
|
; GCN-DAG: buffer_load_dword v[[B_I32:[0-9]+]]
|
||||||
; VI: v_cmp_class_f16_e32 vcc, v[[A_F16]], v[[B_I32]]
|
; VI: v_cmp_class_f16_e32 vcc, v[[A_F16]], v[[B_I32]]
|
||||||
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
|
; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
|
||||||
; GCN: buffer_store_dword v[[R_I32]]
|
; GCN: buffer_store_dword v[[R_I32]]
|
||||||
|
@ -33,7 +33,9 @@ entry:
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @class_f16_fabs(
|
define amdgpu_kernel void @class_f16_fabs(
|
||||||
i32 addrspace(1)* %r,
|
i32 addrspace(1)* %r,
|
||||||
|
[8 x i32],
|
||||||
half %a.val,
|
half %a.val,
|
||||||
|
[8 x i32],
|
||||||
i32 %b.val) {
|
i32 %b.val) {
|
||||||
entry:
|
entry:
|
||||||
%a.val.fabs = call half @llvm.fabs.f16(half %a.val)
|
%a.val.fabs = call half @llvm.fabs.f16(half %a.val)
|
||||||
|
@ -53,7 +55,9 @@ entry:
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @class_f16_fneg(
|
define amdgpu_kernel void @class_f16_fneg(
|
||||||
i32 addrspace(1)* %r,
|
i32 addrspace(1)* %r,
|
||||||
|
[8 x i32],
|
||||||
half %a.val,
|
half %a.val,
|
||||||
|
[8 x i32],
|
||||||
i32 %b.val) {
|
i32 %b.val) {
|
||||||
entry:
|
entry:
|
||||||
%a.val.fneg = fsub half -0.0, %a.val
|
%a.val.fneg = fsub half -0.0, %a.val
|
||||||
|
@ -73,7 +77,9 @@ entry:
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @class_f16_fabs_fneg(
|
define amdgpu_kernel void @class_f16_fabs_fneg(
|
||||||
i32 addrspace(1)* %r,
|
i32 addrspace(1)* %r,
|
||||||
|
[8 x i32],
|
||||||
half %a.val,
|
half %a.val,
|
||||||
|
[8 x i32],
|
||||||
i32 %b.val) {
|
i32 %b.val) {
|
||||||
entry:
|
entry:
|
||||||
%a.val.fabs = call half @llvm.fabs.f16(half %a.val)
|
%a.val.fabs = call half @llvm.fabs.f16(half %a.val)
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s
|
||||||
|
|
||||||
declare i1 @llvm.amdgcn.class.f32(float, i32) #1
|
declare i1 @llvm.amdgcn.class.f32(float, i32) #1
|
||||||
declare i1 @llvm.amdgcn.class.f64(double, i32) #1
|
declare i1 @llvm.amdgcn.class.f64(double, i32) #1
|
||||||
|
@ -7,14 +7,14 @@ declare float @llvm.fabs.f32(float) #1
|
||||||
declare double @llvm.fabs.f64(double) #1
|
declare double @llvm.fabs.f64(double) #1
|
||||||
|
|
||||||
; SI-LABEL: {{^}}test_class_f32:
|
; SI-LABEL: {{^}}test_class_f32:
|
||||||
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||||
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]]
|
; SI: v_cmp_class_f32_e32 vcc, [[SA]], [[VB]]
|
||||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
|
define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
|
||||||
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
|
%result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
|
||||||
%sext = sext i1 %result to i32
|
%sext = sext i1 %result to i32
|
||||||
store i32 %sext, i32 addrspace(1)* %out, align 4
|
store i32 %sext, i32 addrspace(1)* %out, align 4
|
||||||
|
@ -22,14 +22,14 @@ define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}test_class_fabs_f32:
|
; SI-LABEL: {{^}}test_class_fabs_f32:
|
||||||
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||||
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
|
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
|
||||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
||||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
|
define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
|
||||||
%a.fabs = call float @llvm.fabs.f32(float %a) #1
|
%a.fabs = call float @llvm.fabs.f32(float %a) #1
|
||||||
%result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
|
%result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
|
||||||
%sext = sext i1 %result to i32
|
%sext = sext i1 %result to i32
|
||||||
|
@ -38,14 +38,14 @@ define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a,
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}test_class_fneg_f32:
|
; SI-LABEL: {{^}}test_class_fneg_f32:
|
||||||
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||||
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
|
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
|
||||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
||||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
|
define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
|
||||||
%a.fneg = fsub float -0.0, %a
|
%a.fneg = fsub float -0.0, %a
|
||||||
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
|
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
|
||||||
%sext = sext i1 %result to i32
|
%sext = sext i1 %result to i32
|
||||||
|
@ -54,14 +54,14 @@ define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a,
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}test_class_fneg_fabs_f32:
|
; SI-LABEL: {{^}}test_class_fneg_fabs_f32:
|
||||||
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||||
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
|
; SI: v_cmp_class_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
|
||||||
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
||||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
|
define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 {
|
||||||
%a.fabs = call float @llvm.fabs.f32(float %a) #1
|
%a.fabs = call float @llvm.fabs.f32(float %a) #1
|
||||||
%a.fneg.fabs = fsub float -0.0, %a.fabs
|
%a.fneg.fabs = fsub float -0.0, %a.fabs
|
||||||
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
|
%result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
|
||||||
|
@ -183,14 +183,14 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(i32 addrspac
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}test_class_f64:
|
; SI-LABEL: {{^}}test_class_f64:
|
||||||
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
|
||||||
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||||
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
|
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[VB]]
|
||||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
|
define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
|
||||||
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
|
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
|
||||||
%sext = sext i1 %result to i32
|
%sext = sext i1 %result to i32
|
||||||
store i32 %sext, i32 addrspace(1)* %out, align 4
|
store i32 %sext, i32 addrspace(1)* %out, align 4
|
||||||
|
@ -198,14 +198,14 @@ define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}test_class_fabs_f64:
|
; SI-LABEL: {{^}}test_class_fabs_f64:
|
||||||
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
|
||||||
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||||
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
|
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SA]]|, [[VB]]
|
||||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
||||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
|
define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
|
||||||
%a.fabs = call double @llvm.fabs.f64(double %a) #1
|
%a.fabs = call double @llvm.fabs.f64(double %a) #1
|
||||||
%result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
|
%result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
|
||||||
%sext = sext i1 %result to i32
|
%sext = sext i1 %result to i32
|
||||||
|
@ -214,14 +214,14 @@ define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}test_class_fneg_f64:
|
; SI-LABEL: {{^}}test_class_fneg_f64:
|
||||||
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
|
||||||
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||||
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
|
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -[[SA]], [[VB]]
|
||||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
||||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
|
define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
|
||||||
%a.fneg = fsub double -0.0, %a
|
%a.fneg = fsub double -0.0, %a
|
||||||
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
|
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
|
||||||
%sext = sext i1 %result to i32
|
%sext = sext i1 %result to i32
|
||||||
|
@ -230,14 +230,14 @@ define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}test_class_fneg_fabs_f64:
|
; SI-LABEL: {{^}}test_class_fneg_fabs_f64:
|
||||||
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
|
||||||
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||||
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
|
; SI: v_cmp_class_f64_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], -|[[SA]]|, [[VB]]
|
||||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
|
||||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
|
define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 {
|
||||||
%a.fabs = call double @llvm.fabs.f64(double %a) #1
|
%a.fabs = call double @llvm.fabs.f64(double %a) #1
|
||||||
%a.fneg.fabs = fsub double -0.0, %a.fabs
|
%a.fneg.fabs = fsub double -0.0, %a.fabs
|
||||||
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
|
%result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
|
||||||
|
@ -268,14 +268,14 @@ define amdgpu_kernel void @test_class_64_f64(i32 addrspace(1)* %out, double %a)
|
||||||
|
|
||||||
; Set all 9 bits of mask
|
; Set all 9 bits of mask
|
||||||
; SI-LABEL: {{^}}test_class_full_mask_f64:
|
; SI-LABEL: {{^}}test_class_full_mask_f64:
|
||||||
; SI: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dwordx2 [[SA:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
|
; SI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x1ff{{$}}
|
||||||
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
|
; SI: v_cmp_class_f64_e32 vcc, [[SA]], [[MASK]]
|
||||||
; SI-NOT: vcc
|
; SI-NOT: vcc
|
||||||
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
|
||||||
; SI-NEXT: buffer_store_dword [[RESULT]]
|
; SI-NEXT: buffer_store_dword [[RESULT]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
|
define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, [8 x i32], double %a) #0 {
|
||||||
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
|
%result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
|
||||||
%sext = sext i1 %result to i32
|
%sext = sext i1 %result to i32
|
||||||
store i32 %sext, i32 addrspace(1)* %out, align 4
|
store i32 %sext, i32 addrspace(1)* %out, align 4
|
||||||
|
|
|
@ -4,11 +4,10 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_cvt_pk_i16_i32:
|
; GCN-LABEL: {{^}}s_cvt_pk_i16_i32:
|
||||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
|
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
|
||||||
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
|
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
|
||||||
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
|
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
|
||||||
; SI: v_cvt_pk_i16_i32_e32 v{{[0-9]+}}, [[X]], [[VY]]
|
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, s[[SX]], [[VY]]
|
||||||
; VI: v_cvt_pk_i16_i32 v{{[0-9]+}}, [[X]], [[VY]]
|
|
||||||
define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
|
define amdgpu_kernel void @s_cvt_pk_i16_i32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
|
||||||
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y)
|
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.i16(i32 %x, i32 %y)
|
||||||
%r = bitcast <2 x i16> %result to i32
|
%r = bitcast <2 x i16> %result to i32
|
||||||
|
|
|
@ -4,11 +4,10 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_cvt_pk_u16_u32:
|
; GCN-LABEL: {{^}}s_cvt_pk_u16_u32:
|
||||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
|
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
|
||||||
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
|
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
|
||||||
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
|
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
|
||||||
; SI: v_cvt_pk_u16_u32_e32 v{{[0-9]+}}, [[X]], [[VY]]
|
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, s[[SX]], [[VY]]
|
||||||
; VI: v_cvt_pk_u16_u32 v{{[0-9]+}}, [[X]], [[VY]]
|
|
||||||
define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
|
define amdgpu_kernel void @s_cvt_pk_u16_u32(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
|
||||||
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y)
|
%result = call <2 x i16> @llvm.amdgcn.cvt.pk.u16(i32 %x, i32 %y)
|
||||||
%r = bitcast <2 x i16> %result to i32
|
%r = bitcast <2 x i16> %result to i32
|
||||||
|
|
|
@ -4,11 +4,10 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32:
|
; GCN-LABEL: {{^}}s_cvt_pknorm_i16_f32:
|
||||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
|
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
|
||||||
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
|
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
|
||||||
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
|
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
|
||||||
; SI: v_cvt_pknorm_i16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
|
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, s[[SX]], [[VY]]
|
||||||
; VI: v_cvt_pknorm_i16_f32 v{{[0-9]+}}, [[X]], [[VY]]
|
|
||||||
define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
|
define amdgpu_kernel void @s_cvt_pknorm_i16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
|
||||||
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y)
|
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.i16(float %x, float %y)
|
||||||
%r = bitcast <2 x i16> %result to i32
|
%r = bitcast <2 x i16> %result to i32
|
||||||
|
|
|
@ -4,11 +4,10 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32:
|
; GCN-LABEL: {{^}}s_cvt_pknorm_u16_f32:
|
||||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
|
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
|
||||||
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
|
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
|
||||||
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
|
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
|
||||||
; SI: v_cvt_pknorm_u16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
|
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, s[[SX]], [[VY]]
|
||||||
; VI: v_cvt_pknorm_u16_f32 v{{[0-9]+}}, [[X]], [[VY]]
|
|
||||||
define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
|
define amdgpu_kernel void @s_cvt_pknorm_u16_f32(i32 addrspace(1)* %out, float %x, float %y) #0 {
|
||||||
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y)
|
%result = call <2 x i16> @llvm.amdgcn.cvt.pknorm.u16(float %x, float %y)
|
||||||
%r = bitcast <2 x i16> %result to i32
|
%r = bitcast <2 x i16> %result to i32
|
||||||
|
|
|
@ -3,11 +3,10 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
|
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}s_cvt_pkrtz_v2f16_f32:
|
; GCN-LABEL: {{^}}s_cvt_pkrtz_v2f16_f32:
|
||||||
; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
|
; GCN-DAG: s_load_dwordx2 s{{\[}}[[SX:[0-9]+]]:[[SY:[0-9]+]]{{\]}}, s[0:1], 0x{{b|2c}}
|
||||||
; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
|
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], s[[SY]]
|
||||||
; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
|
; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, s[[SX]], [[VY]]
|
||||||
; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
|
; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, s[[SX]], [[VY]]
|
||||||
; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[X]], [[VY]]
|
|
||||||
define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
|
define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
|
||||||
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
|
%result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
|
||||||
store <2 x half> %result, <2 x half> addrspace(1)* %out
|
store <2 x half> %result, <2 x half> addrspace(1)* %out
|
||||||
|
|
|
@ -15,9 +15,9 @@ define amdgpu_kernel void @div_fixup_f16(
|
||||||
half addrspace(1)* %b,
|
half addrspace(1)* %b,
|
||||||
half addrspace(1)* %c) {
|
half addrspace(1)* %c) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%c.val = load half, half addrspace(1)* %c
|
%c.val = load volatile half, half addrspace(1)* %c
|
||||||
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half %c.val)
|
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half %c.val)
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
@ -35,8 +35,8 @@ define amdgpu_kernel void @div_fixup_f16_imm_a(
|
||||||
half addrspace(1)* %b,
|
half addrspace(1)* %b,
|
||||||
half addrspace(1)* %c) {
|
half addrspace(1)* %c) {
|
||||||
entry:
|
entry:
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%c.val = load half, half addrspace(1)* %c
|
%c.val = load volatile half, half addrspace(1)* %c
|
||||||
%r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half %b.val, half %c.val)
|
%r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half %b.val, half %c.val)
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
@ -54,8 +54,8 @@ define amdgpu_kernel void @div_fixup_f16_imm_b(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %c) {
|
half addrspace(1)* %c) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%c.val = load half, half addrspace(1)* %c
|
%c.val = load volatile half, half addrspace(1)* %c
|
||||||
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half 3.0, half %c.val)
|
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half 3.0, half %c.val)
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
@ -73,8 +73,8 @@ define amdgpu_kernel void @div_fixup_f16_imm_c(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half 3.0)
|
%r.val = call half @llvm.amdgcn.div.fixup.f16(half %a.val, half %b.val, half 3.0)
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
@ -90,7 +90,7 @@ define amdgpu_kernel void @div_fixup_f16_imm_a_imm_b(
|
||||||
half addrspace(1)* %r,
|
half addrspace(1)* %r,
|
||||||
half addrspace(1)* %c) {
|
half addrspace(1)* %c) {
|
||||||
entry:
|
entry:
|
||||||
%c.val = load half, half addrspace(1)* %c
|
%c.val = load volatile half, half addrspace(1)* %c
|
||||||
%r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half 3.0, half %c.val)
|
%r.val = call half @llvm.amdgcn.div.fixup.f16(half 3.0, half 3.0, half %c.val)
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -5,18 +5,20 @@ declare float @llvm.amdgcn.div.fixup.f32(float, float, float) nounwind readnone
|
||||||
declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readnone
|
declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readnone
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}test_div_fixup_f32:
|
; GCN-LABEL: {{^}}test_div_fixup_f32:
|
||||||
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
|
||||||
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
|
||||||
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
|
; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
|
||||||
|
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
|
||||||
|
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
|
; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||||
; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
|
; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
|
||||||
; GCN: buffer_store_dword [[RESULT]],
|
; GCN: buffer_store_dword [[RESULT]],
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
|
define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c) nounwind {
|
||||||
%result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
|
%result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
|
||||||
store float %result, float addrspace(1)* %out, align 4
|
store float %result, float addrspace(1)* %out, align 4
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=SI %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s
|
||||||
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN -check-prefix=VI %s
|
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s
|
||||||
|
|
||||||
; FIXME: Enable for VI.
|
; FIXME: Enable for VI.
|
||||||
|
|
||||||
|
@ -8,33 +8,36 @@ declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readno
|
||||||
declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone
|
declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}test_div_fmas_f32:
|
; GCN-LABEL: {{^}}test_div_fmas_f32:
|
||||||
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
|
||||||
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
|
||||||
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
|
; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
|
||||||
|
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
|
||||||
|
|
||||||
|
; GCN-DAG: s_and_b32 [[AND_I1:s[0-9]+]], 1, s{{[0-9]+}}
|
||||||
|
; GCN: v_cmp_eq_u32_e64 vcc, [[AND_I1]], 1
|
||||||
|
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
|
; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||||
; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
|
; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
|
||||||
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]]
|
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]]
|
||||||
; GCN: buffer_store_dword [[RESULT]],
|
; GCN: buffer_store_dword [[RESULT]],
|
||||||
; GCN: s_endpgm
|
define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
|
||||||
define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
|
|
||||||
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
|
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
|
||||||
store float %result, float addrspace(1)* %out, align 4
|
store float %result, float addrspace(1)* %out, align 4
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0:
|
; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_0:
|
||||||
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x25
|
||||||
; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
|
; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
|
||||||
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||||
; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
|
; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
|
||||||
; SI: buffer_store_dword [[RESULT]],
|
; SI: buffer_store_dword [[RESULT]],
|
||||||
; SI: s_endpgm
|
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
|
||||||
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
|
|
||||||
%result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
|
%result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
|
||||||
store float %result, float addrspace(1)* %out, align 4
|
store float %result, float addrspace(1)* %out, align 4
|
||||||
ret void
|
ret void
|
||||||
|
@ -43,26 +46,32 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %o
|
||||||
; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1:
|
; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_1:
|
||||||
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||||
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
; SI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||||
; SI-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
|
|
||||||
; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
|
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
|
; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
|
||||||
; SI: buffer_store_dword [[RESULT]],
|
|
||||||
; SI: s_endpgm
|
; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
|
||||||
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
|
; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
|
||||||
|
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
|
||||||
|
; GCN: buffer_store_dword [[RESULT]],
|
||||||
|
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind {
|
||||||
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
|
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
|
||||||
store float %result, float addrspace(1)* %out, align 4
|
store float %result, float addrspace(1)* %out, align 4
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2:
|
; GCN-LABEL: {{^}}test_div_fmas_f32_inline_imm_2:
|
||||||
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
|
|
||||||
; SI-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
; VI-DAG: s_load_dword [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
|
; VI-DAG: s_load_dword [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
|
||||||
; SI: buffer_store_dword [[RESULT]],
|
|
||||||
; SI: s_endpgm
|
; GCN-DAG: v_mov_b32_e32 [[VA:v[0-9]+]], [[SA]]
|
||||||
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
|
; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||||
|
; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
|
||||||
|
; GCN: buffer_store_dword [[RESULT]],
|
||||||
|
define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind {
|
||||||
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
|
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
|
||||||
store float %result, float addrspace(1)* %out, align 4
|
store float %result, float addrspace(1)* %out, align 4
|
||||||
ret void
|
ret void
|
||||||
|
@ -77,8 +86,8 @@ define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
|
; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
|
||||||
; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
|
; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
|
||||||
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
|
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
|
||||||
define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
|
define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
|
||||||
%cmp = icmp eq i32 %i, 0
|
%cmp = icmp eq i32 %i, 0
|
||||||
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
|
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
|
||||||
|
@ -87,8 +96,8 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %ou
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
|
; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
|
||||||
; SI: s_mov_b64 vcc, 0
|
; GCN: s_mov_b64 vcc, 0
|
||||||
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
|
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
|
||||||
define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
|
define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
|
||||||
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
|
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
|
||||||
store float %result, float addrspace(1)* %out, align 4
|
store float %result, float addrspace(1)* %out, align 4
|
||||||
|
@ -96,8 +105,8 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspa
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
|
; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
|
||||||
; SI: s_mov_b64 vcc, -1
|
; GCN: s_mov_b64 vcc, -1
|
||||||
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
|
; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
|
||||||
define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
|
define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
|
||||||
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
|
%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
|
||||||
store float %result, float addrspace(1)* %out, align 4
|
store float %result, float addrspace(1)* %out, align 4
|
||||||
|
|
|
@ -230,13 +230,13 @@ define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)*
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_1:
|
; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_1:
|
||||||
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
|
; SI: v_mov_b32_e32 [[VA:v[0-9]+]], [[A]]
|
||||||
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
|
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
|
||||||
; SI: buffer_store_dword [[RESULT0]]
|
; SI: buffer_store_dword [[RESULT0]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
|
define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
|
||||||
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
|
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
|
||||||
%result0 = extractvalue { float, i1 } %result, 0
|
%result0 = extractvalue { float, i1 } %result, 0
|
||||||
store float %result0, float addrspace(1)* %out, align 4
|
store float %result0, float addrspace(1)* %out, align 4
|
||||||
|
@ -244,13 +244,13 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_2:
|
; SI-LABEL: {{^}}test_div_scale_f32_all_scalar_2:
|
||||||
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
|
; SI: v_mov_b32_e32 [[VB:v[0-9]+]], [[B]]
|
||||||
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
|
; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
|
||||||
; SI: buffer_store_dword [[RESULT0]]
|
; SI: buffer_store_dword [[RESULT0]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
|
define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
|
||||||
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
|
%result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
|
||||||
%result0 = extractvalue { float, i1 } %result, 0
|
%result0 = extractvalue { float, i1 } %result, 0
|
||||||
store float %result0, float addrspace(1)* %out, align 4
|
store float %result0, float addrspace(1)* %out, align 4
|
||||||
|
@ -258,14 +258,14 @@ define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_1:
|
; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_1:
|
||||||
; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dwordx2 s{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xd
|
; SI-DAG: s_load_dwordx2 [[B:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x1d
|
||||||
; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]]
|
; SI-DAG: v_mov_b32_e32 v[[VA_LO:[0-9]+]], s[[A_LO]]
|
||||||
; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]]
|
; SI-DAG: v_mov_b32_e32 v[[VA_HI:[0-9]+]], s[[A_HI]]
|
||||||
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}}
|
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}}
|
||||||
; SI: buffer_store_dwordx2 [[RESULT0]]
|
; SI: buffer_store_dwordx2 [[RESULT0]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
|
define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind {
|
||||||
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
|
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
|
||||||
%result0 = extractvalue { double, i1 } %result, 0
|
%result0 = extractvalue { double, i1 } %result, 0
|
||||||
store double %result0, double addrspace(1)* %out, align 8
|
store double %result0, double addrspace(1)* %out, align 8
|
||||||
|
@ -273,14 +273,14 @@ define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)*
|
||||||
}
|
}
|
||||||
|
|
||||||
; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_2:
|
; SI-LABEL: {{^}}test_div_scale_f64_all_scalar_2:
|
||||||
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dwordx2 [[A:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xd
|
; SI-DAG: s_load_dwordx2 s{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x1d
|
||||||
; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]]
|
; SI-DAG: v_mov_b32_e32 v[[VB_LO:[0-9]+]], s[[B_LO]]
|
||||||
; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]]
|
; SI-DAG: v_mov_b32_e32 v[[VB_HI:[0-9]+]], s[[B_HI]]
|
||||||
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]]
|
; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]]
|
||||||
; SI: buffer_store_dwordx2 [[RESULT0]]
|
; SI: buffer_store_dwordx2 [[RESULT0]]
|
||||||
; SI: s_endpgm
|
; SI: s_endpgm
|
||||||
define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
|
define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, [8 x i32], double %a, [8 x i32], double %b) nounwind {
|
||||||
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
|
%result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
|
||||||
%result0 = extractvalue { double, i1 } %result, 0
|
%result0 = extractvalue { double, i1 } %result, 0
|
||||||
store double %result0, double addrspace(1)* %out, align 8
|
store double %result0, double addrspace(1)* %out, align 8
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
; GCN: s_load_dword s[[S_LO:[0-9]+]]
|
; GCN: s_load_dword s[[S_LO:[0-9]+]]
|
||||||
; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
|
; GCN: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[S_LO]]
|
||||||
; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
; GCN: tbuffer_store_format_d16_x v[[V_LO]], v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
||||||
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, half %data, i32 %vindex) {
|
define amdgpu_kernel void @tbuffer_store_d16_x(<4 x i32> %rsrc, [8 x i32], half %data, [8 x i32], i32 %vindex) {
|
||||||
main_body:
|
main_body:
|
||||||
call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
|
call void @llvm.amdgcn.tbuffer.store.f16(half %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i32 0, i32 0, i32 1, i32 2, i1 0, i1 0)
|
||||||
ret void
|
ret void
|
||||||
|
@ -41,7 +41,6 @@ main_body:
|
||||||
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
|
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
|
||||||
; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
||||||
|
|
||||||
|
|
||||||
; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
|
; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
|
||||||
; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
|
; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
|
||||||
; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
|
||||||
|
|
|
@ -24,14 +24,14 @@ entry:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}ceil_v2f16:
|
; GCN-LABEL: {{^}}ceil_v2f16:
|
||||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
|
; SI-DAG: v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
; SI-DAG: v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
; SI-NOT: and
|
; SI-NOT: and
|
||||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
|
|
|
@ -24,26 +24,31 @@ entry:
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}cos_v2f16
|
; GCN-LABEL: {{^}}cos_v2f16
|
||||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI-DAG: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
|
; SI-DAG: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}}
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
|
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
|
; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]]
|
||||||
|
; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
|
||||||
|
; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]]
|
||||||
|
; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
|
||||||
|
|
||||||
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||||
; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
|
; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
|
||||||
; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
|
; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
|
||||||
|
; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
|
||||||
|
; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
|
||||||
|
|
||||||
; GCN-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
|
; GCN: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
|
||||||
; GCN-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
|
; GCN: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
|
||||||
; GCN-DAG: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
|
|
||||||
; GCN-DAG: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
|
|
||||||
|
|
||||||
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
|
|
||||||
|
; VI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
|
||||||
; GCN-NOT: and
|
; GCN-NOT: and
|
||||||
|
|
||||||
|
|
|
@ -2,11 +2,9 @@
|
||||||
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,OPT %s
|
; RUN: llc -march=amdgcn -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,OPT %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}test_debug_value:
|
; GCN-LABEL: {{^}}test_debug_value:
|
||||||
; NOOPT: s_load_dwordx2 s[4:5]
|
; NOOPT: .loc 1 1 42 prologue_end ; /tmp/test_debug_value.cl:1:42
|
||||||
|
; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
|
||||||
; FIXME: Why is the SGPR4_SGPR5 reference being removed from DBG_VALUE?
|
; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- $sgpr4_sgpr5
|
||||||
; NOOPT: ; kill: def $sgpr8_sgpr9 killed $sgpr4_sgpr5
|
|
||||||
; NOOPT-NEXT: ;DEBUG_VALUE: test_debug_value:globalptr_arg <- undef
|
|
||||||
|
|
||||||
; GCN: flat_store_dword
|
; GCN: flat_store_dword
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
|
|
|
@ -24,13 +24,13 @@ entry:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}floor_v2f16
|
; GCN-LABEL: {{^}}floor_v2f16
|
||||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
|
; SI-DAG: v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
; SI-DAG: v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
; SI-NOT: and
|
; SI-NOT: and
|
||||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
|
@ -105,16 +105,17 @@ define amdgpu_kernel void @fma_f16_imm_c(
|
||||||
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
|
||||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
|
|
||||||
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||||
|
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
|
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
|
||||||
|
|
||||||
|
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
|
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
|
||||||
|
|
||||||
|
|
||||||
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
|
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
|
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
|
||||||
|
@ -145,8 +146,9 @@ define amdgpu_kernel void @fma_v2f16(
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fma_v2f16_imm_a:
|
; GCN-LABEL: {{^}}fma_v2f16_imm_a:
|
||||||
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
|
||||||
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||||
|
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
|
|
||||||
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||||
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
@ -157,13 +159,14 @@ define amdgpu_kernel void @fma_v2f16(
|
||||||
; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||||
|
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
|
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
|
||||||
; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]]
|
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
|
; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
|
||||||
|
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
|
|
||||||
|
; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
|
||||||
|
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]]
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
|
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
|
|
||||||
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]]
|
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]]
|
||||||
|
@ -186,8 +189,8 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fma_v2f16_imm_b:
|
; GCN-LABEL: {{^}}fma_v2f16_imm_b:
|
||||||
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
|
||||||
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||||
|
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||||
|
@ -195,10 +198,10 @@ define amdgpu_kernel void @fma_v2f16_imm_a(
|
||||||
; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
|
; SI: v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
|
||||||
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
|
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||||
|
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
|
||||||
|
@ -229,8 +232,8 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fma_v2f16_imm_c:
|
; GCN-LABEL: {{^}}fma_v2f16_imm_c:
|
||||||
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
|
||||||
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
@ -238,26 +241,31 @@ define amdgpu_kernel void @fma_v2f16_imm_b(
|
||||||
; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
|
; SI: v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
|
||||||
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
|
; VI: v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
|
||||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
|
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
; SI: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]]
|
|
||||||
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
|
|
||||||
|
; SI: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
|
||||||
|
; SI-DAG: v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]]
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI-DAG: v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
|
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
|
; GCN-NOT: and
|
||||||
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; VI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
|
; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
|
||||||
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
|
; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
|
||||||
|
|
||||||
; GCN-NOT: and
|
; GCN-NOT: and
|
||||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
|
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
|
||||||
|
|
||||||
|
|
||||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @fma_v2f16_imm_c(
|
define amdgpu_kernel void @fma_v2f16_imm_c(
|
||||||
|
|
|
@ -58,8 +58,8 @@ define amdgpu_kernel void @fmuladd_f16_imm_a(
|
||||||
half addrspace(1)* %r,
|
half addrspace(1)* %r,
|
||||||
half addrspace(1)* %b,
|
half addrspace(1)* %b,
|
||||||
half addrspace(1)* %c) {
|
half addrspace(1)* %c) {
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%c.val = load half, half addrspace(1)* %c
|
%c.val = load volatile half, half addrspace(1)* %c
|
||||||
%r.val = call half @llvm.fmuladd.f16(half 3.0, half %b.val, half %c.val)
|
%r.val = call half @llvm.fmuladd.f16(half 3.0, half %b.val, half %c.val)
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
@ -87,56 +87,64 @@ define amdgpu_kernel void @fmuladd_f16_imm_b(
|
||||||
half addrspace(1)* %r,
|
half addrspace(1)* %r,
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %c) {
|
half addrspace(1)* %c) {
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%c.val = load half, half addrspace(1)* %c
|
%c.val = load volatile half, half addrspace(1)* %c
|
||||||
%r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val)
|
%r.val = call half @llvm.fmuladd.f16(half %a.val, half 3.0, half %c.val)
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}fmuladd_v2f16
|
; GCN-LABEL: {{^}}fmuladd_v2f16
|
||||||
; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
|
||||||
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
|
; VI-FLUSH: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
; VI-FLUSH: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||||
|
; VI-FLUSH: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
|
; VI-DENORM: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
; VI-DENORM: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
; VI-DENORM: buffer_load_dword v[[C_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
|
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
|
; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
|
||||||
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
|
||||||
; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
; SI: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
|
||||||
; SI: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
|
; SI-DAG: v_mac_f32_e32 v[[C_F32_0]], v[[A_F32_0]], v[[B_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
|
; SI-DAG: v_mac_f32_e32 v[[C_F32_1]], v[[A_F32_1]], v[[B_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
|
||||||
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
|
||||||
|
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_LO]], v[[R_F16_HI]]
|
||||||
|
|
||||||
; VI-FLUSH: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
|
||||||
; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[B_V2_F16]], v[[C_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
; VI-FLUSH: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||||
; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
|
; VI-FLUSH-DAG: v_mac_f16_sdwa v[[C_F16_1]], v[[A_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||||
; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
|
; VI-FLUSH-DAG: v_mac_f16_e32 v[[C_V2_F16]], v[[A_V2_F16]], v[[B_V2_F16]]
|
||||||
|
; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
|
||||||
; VI-FLUSH-NOT: v_and_b32
|
; VI-FLUSH-NOT: v_and_b32
|
||||||
; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[R_F16_HI]]
|
; VI-FLUSH: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[C_V2_F16]], v[[R_F16_HI]]
|
||||||
|
|
||||||
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
; VI-DENORM-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
|
||||||
; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]], v[[C_V2_F16]]
|
; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
|
||||||
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]], v[[C_F16_1]]
|
; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
|
||||||
; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
|
; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
|
||||||
; VI-DENORM-NOT: v_and_b32
|
; VI-DENORM-NOT: v_and_b32
|
||||||
; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
|
; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[RES0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||||
; GCN: s_endpgm
|
|
||||||
|
|
||||||
define amdgpu_kernel void @fmuladd_v2f16(
|
define amdgpu_kernel void @fmuladd_v2f16(
|
||||||
<2 x half> addrspace(1)* %r,
|
<2 x half> addrspace(1)* %r,
|
||||||
<2 x half> addrspace(1)* %a,
|
<2 x half> addrspace(1)* %a,
|
||||||
|
|
|
@ -22,8 +22,8 @@ define amdgpu_kernel void @maxnum_f16(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
|
%r.val = call half @llvm.maxnum.f16(half %a.val, half %b.val)
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
@ -66,17 +66,16 @@ entry:
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}maxnum_v2f16:
|
; GCN-LABEL: {{^}}maxnum_v2f16:
|
||||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
|
||||||
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
|
||||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
|
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
|
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
|
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
|
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
|
; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
|
||||||
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
|
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
|
@ -89,7 +88,7 @@ entry:
|
||||||
; VI-NOT: and
|
; VI-NOT: and
|
||||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
|
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
|
||||||
|
|
||||||
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
|
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||||
|
|
||||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
|
@ -107,13 +106,13 @@ entry:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
|
; GCN-LABEL: {{^}}maxnum_v2f16_imm_a:
|
||||||
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
|
; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
|
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
|
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
|
||||||
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||||
|
@ -127,7 +126,6 @@ entry:
|
||||||
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
|
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
|
||||||
|
|
||||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||||
; GCN: s_endpgm
|
|
||||||
define amdgpu_kernel void @maxnum_v2f16_imm_a(
|
define amdgpu_kernel void @maxnum_v2f16_imm_a(
|
||||||
<2 x half> addrspace(1)* %r,
|
<2 x half> addrspace(1)* %r,
|
||||||
<2 x half> addrspace(1)* %b) {
|
<2 x half> addrspace(1)* %b) {
|
||||||
|
@ -140,13 +138,13 @@ entry:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
|
; GCN-LABEL: {{^}}maxnum_v2f16_imm_b:
|
||||||
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
|
; SI-DAG: v_max_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
|
; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
|
; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
|
||||||
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
; VI-DAG: v_max_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
|
@ -162,7 +160,6 @@ entry:
|
||||||
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
|
; GFX9: v_pk_max_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
|
||||||
|
|
||||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||||
; GCN: s_endpgm
|
|
||||||
define amdgpu_kernel void @maxnum_v2f16_imm_b(
|
define amdgpu_kernel void @maxnum_v2f16_imm_b(
|
||||||
<2 x half> addrspace(1)* %r,
|
<2 x half> addrspace(1)* %r,
|
||||||
<2 x half> addrspace(1)* %a) {
|
<2 x half> addrspace(1)* %a) {
|
||||||
|
@ -192,8 +189,8 @@ entry:
|
||||||
; GCN-LABEL: {{^}}maxnum_v4f16:
|
; GCN-LABEL: {{^}}maxnum_v4f16:
|
||||||
; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
|
; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
|
||||||
; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
|
; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
|
||||||
; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
|
; GFX9-DAG: v_pk_max_f16 v[[MAX_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
|
||||||
; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
|
; GFX9-DAG: v_pk_max_f16 v[[MAX_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
|
||||||
; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
|
; GFX9: buffer_store_dwordx2 v{{\[}}[[MAX_LO]]:[[MAX_HI]]{{\]}}
|
||||||
define amdgpu_kernel void @maxnum_v4f16(
|
define amdgpu_kernel void @maxnum_v4f16(
|
||||||
<4 x half> addrspace(1)* %r,
|
<4 x half> addrspace(1)* %r,
|
||||||
|
|
|
@ -22,8 +22,8 @@ define amdgpu_kernel void @minnum_f16(
|
||||||
half addrspace(1)* %a,
|
half addrspace(1)* %a,
|
||||||
half addrspace(1)* %b) {
|
half addrspace(1)* %b) {
|
||||||
entry:
|
entry:
|
||||||
%a.val = load half, half addrspace(1)* %a
|
%a.val = load volatile half, half addrspace(1)* %a
|
||||||
%b.val = load half, half addrspace(1)* %b
|
%b.val = load volatile half, half addrspace(1)* %b
|
||||||
%r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
|
%r.val = call half @llvm.minnum.f16(half %a.val, half %b.val)
|
||||||
store half %r.val, half addrspace(1)* %r
|
store half %r.val, half addrspace(1)* %r
|
||||||
ret void
|
ret void
|
||||||
|
@ -66,20 +66,20 @@ entry:
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}minnum_v2f16:
|
; GCN-LABEL: {{^}}minnum_v2f16:
|
||||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
|
||||||
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
|
||||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
|
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
|
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
|
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
|
; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]]
|
||||||
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
|
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
; SI-NOT: and
|
; SI-NOT: and
|
||||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
|
@ -88,10 +88,9 @@ entry:
|
||||||
; VI-NOT: and
|
; VI-NOT: and
|
||||||
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
|
; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
|
||||||
|
|
||||||
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]]
|
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
|
||||||
|
|
||||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||||
; GCN: s_endpgm
|
|
||||||
define amdgpu_kernel void @minnum_v2f16(
|
define amdgpu_kernel void @minnum_v2f16(
|
||||||
<2 x half> addrspace(1)* %r,
|
<2 x half> addrspace(1)* %r,
|
||||||
<2 x half> addrspace(1)* %a,
|
<2 x half> addrspace(1)* %a,
|
||||||
|
@ -106,15 +105,13 @@ entry:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
|
; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
|
||||||
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
; GCN-DAG: buffer_load_dword v[[B_V2_F16:[0-9]+]]
|
||||||
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
|
; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
|
||||||
; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
|
|
||||||
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
|
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
|
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
|
; VI-DAG: v_mov_b32_e32 [[CONST4:v[0-9]+]], 0x4400
|
||||||
; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], [[CONST4]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||||
|
@ -123,6 +120,7 @@ entry:
|
||||||
; SIVI-NOT: and
|
; SIVI-NOT: and
|
||||||
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
|
|
||||||
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
|
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x44004200
|
||||||
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
|
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[B_V2_F16]], [[K]]
|
||||||
|
|
||||||
|
@ -139,26 +137,28 @@ entry:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
|
; GCN-LABEL: {{^}}minnum_v2f16_imm_b:
|
||||||
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; GCN-DAG: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
|
; SI-DAG: v_min_f32_e32 v[[R_F32_0:[0-9]+]], 4.0, v[[A_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
|
; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
|
|
||||||
; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
|
; VI-DAG: v_mov_b32_e32 [[CONST3:v[0-9]+]], 0x4200
|
||||||
; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
; VI-DAG: v_min_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[A_V2_F16]], [[CONST3]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
|
||||||
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||||
|
|
||||||
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
|
|
||||||
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
|
|
||||||
|
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
|
|
||||||
|
|
||||||
; SIVI-NOT: and
|
; SIVI-NOT: and
|
||||||
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
|
; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x42004400
|
||||||
|
; GFX9: v_pk_min_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], [[K]]
|
||||||
|
|
||||||
; GCN: buffer_store_dword v[[R_V2_F16]]
|
; GCN: buffer_store_dword v[[R_V2_F16]]
|
||||||
; GCN: s_endpgm
|
|
||||||
define amdgpu_kernel void @minnum_v2f16_imm_b(
|
define amdgpu_kernel void @minnum_v2f16_imm_b(
|
||||||
<2 x half> addrspace(1)* %r,
|
<2 x half> addrspace(1)* %r,
|
||||||
<2 x half> addrspace(1)* %a) {
|
<2 x half> addrspace(1)* %a) {
|
||||||
|
@ -188,8 +188,8 @@ entry:
|
||||||
; GCN-LABEL: {{^}}minnum_v4f16:
|
; GCN-LABEL: {{^}}minnum_v4f16:
|
||||||
; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
|
; GFX89: buffer_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
|
||||||
; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
|
; GFX89: buffer_load_dwordx2 v{{\[}}[[B_LO:[0-9]+]]:[[B_HI:[0-9]+]]{{\]}}
|
||||||
; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[A_LO]], v[[B_LO]]
|
; GFX9-DAG: v_pk_min_f16 v[[MIN_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
|
||||||
; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[A_HI]], v[[B_HI]]
|
; GFX9-DAG: v_pk_min_f16 v[[MIN_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
|
||||||
; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
|
; GFX9: buffer_store_dwordx2 v{{\[}}[[MIN_LO]]:[[MIN_HI]]{{\]}}
|
||||||
define amdgpu_kernel void @minnum_v4f16(
|
define amdgpu_kernel void @minnum_v4f16(
|
||||||
<4 x half> addrspace(1)* %r,
|
<4 x half> addrspace(1)* %r,
|
||||||
|
|
|
@ -25,13 +25,13 @@ entry:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}rint_v2f16
|
; GCN-LABEL: {{^}}rint_v2f16
|
||||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
|
; SI-DAG: v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
; SI-DAG: v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
; SI-NOT: v_and_b32
|
; SI-NOT: v_and_b32
|
||||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
|
||||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
|
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
|
||||||
|
|
||||||
declare half @llvm.sin.f16(half %a)
|
declare half @llvm.sin.f16(half %a)
|
||||||
declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
|
declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}sin_f16
|
; GCN-LABEL: {{^}}sin_f16:
|
||||||
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
|
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
|
||||||
; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
|
; GCN: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
|
||||||
; GCN: v_mul_f32_e32 v[[M_F32:[0-9]+]], {{0.15915494|0x3e22f983}}, v[[A_F32]]
|
; GCN: v_mul_f32_e32 v[[M_F32:[0-9]+]], {{0.15915494|0x3e22f983}}, v[[A_F32]]
|
||||||
|
@ -23,16 +23,20 @@ entry:
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}sin_v2f16
|
; GCN-LABEL: {{^}}sin_v2f16:
|
||||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI: v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
|
; SI: v_mov_b32_e32 v[[HALF_PI:[0-9]+]], 0x3e22f983{{$}}
|
||||||
|
|
||||||
|
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PI]]
|
||||||
; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[A_F32_0]], v[[HALF_PIE]]
|
; SI: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
|
||||||
; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
|
; SI: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PI]]
|
||||||
; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[A_F32_1]], v[[HALF_PIE]]
|
; SI: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
|
||||||
; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
|
; SI: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
|
||||||
|
; SI: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
|
||||||
|
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
|
|
||||||
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
|
||||||
|
@ -40,12 +44,11 @@ entry:
|
||||||
; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
|
; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
|
||||||
; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
|
; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
|
||||||
; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
|
; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
|
||||||
|
; VI: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
|
||||||
|
; VI: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
|
||||||
|
|
||||||
; GCN-DAG: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
|
|
||||||
; GCN-DAG: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
|
|
||||||
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
|
|
||||||
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
|
||||||
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
||||||
|
|
|
@ -24,13 +24,13 @@ entry:
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}trunc_v2f16
|
; GCN-LABEL: {{^}}trunc_v2f16
|
||||||
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
|
||||||
; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
; SI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
|
||||||
; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
|
||||||
; SI: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
|
; SI-DAG: v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||||
; SI: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
; SI-DAG: v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
|
||||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||||
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||||
; SI-NOT: v_and_b32
|
; SI-NOT: v_and_b32
|
||||||
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_HI]]
|
||||||
|
|
|
@ -17,7 +17,7 @@
|
||||||
; GCN-NOT: load_dword
|
; GCN-NOT: load_dword
|
||||||
|
|
||||||
; GCN: flat_store_dwordx2
|
; GCN: flat_store_dwordx2
|
||||||
define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, i64* %ptr0, i64* %ptr1, i64 addrspace(1)* %ptr2) {
|
define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, [8 x i32], i64* %ptr0, [8 x i32], i64* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
|
||||||
%tmp2 = icmp eq i32 %tmp, 0
|
%tmp2 = icmp eq i32 %tmp, 0
|
||||||
%tmp3 = load i64, i64* %ptr0, align 8
|
%tmp3 = load i64, i64* %ptr0, align 8
|
||||||
%tmp4 = load i64, i64* %ptr1, align 8
|
%tmp4 = load i64, i64* %ptr1, align 8
|
||||||
|
@ -38,7 +38,7 @@ define amdgpu_kernel void @select_ptr_crash_i64_flat(i32 %tmp, i64* %ptr0, i64*
|
||||||
; GCN: v_cndmask_b32
|
; GCN: v_cndmask_b32
|
||||||
; GCN: v_cndmask_b32
|
; GCN: v_cndmask_b32
|
||||||
; GCN: flat_store_dwordx2
|
; GCN: flat_store_dwordx2
|
||||||
define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, i64 addrspace(1)* %ptr0, i64 addrspace(1)* %ptr1, i64 addrspace(1)* %ptr2) {
|
define amdgpu_kernel void @select_ptr_crash_i64_global(i32 %tmp, [8 x i32], i64 addrspace(1)* %ptr0, [8 x i32], i64 addrspace(1)* %ptr1, [8 x i32], i64 addrspace(1)* %ptr2) {
|
||||||
%tmp2 = icmp eq i32 %tmp, 0
|
%tmp2 = icmp eq i32 %tmp, 0
|
||||||
%tmp3 = load i64, i64 addrspace(1)* %ptr0, align 8
|
%tmp3 = load i64, i64 addrspace(1)* %ptr0, align 8
|
||||||
%tmp4 = load i64, i64 addrspace(1)* %ptr1, align 8
|
%tmp4 = load i64, i64 addrspace(1)* %ptr1, align 8
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -8,28 +8,14 @@
|
||||||
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
|
; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
|
||||||
; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
|
; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
|
||||||
|
|
||||||
|
; CIVI: s_load_dword [[LHS:s[0-9]+]]
|
||||||
; VI: s_load_dword [[LHS:s[0-9]+]]
|
; CIVI: s_load_dword [[RHS:s[0-9]+]]
|
||||||
; VI: s_load_dword [[RHS:s[0-9]+]]
|
; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
||||||
; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
; CIVI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
|
||||||
; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
|
; CIVI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
|
||||||
; VI-DAG: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
|
; CIVI-DAG: s_lshl_b32
|
||||||
; VI-DAG: s_lshl_b32
|
; CIVI: v_or_b32_e32
|
||||||
; VI: v_or_b32_e32
|
|
||||||
|
|
||||||
; CI: s_load_dword s
|
|
||||||
; CI-NEXT: s_load_dword s
|
|
||||||
; CI-NOT: {{buffer|flat}}
|
|
||||||
; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
|
|
||||||
; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
|
||||||
; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
|
|
||||||
; CI: s_and_b32
|
|
||||||
; CI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
|
|
||||||
; CI: s_and_b32
|
|
||||||
; CI: v_bfe_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 16
|
|
||||||
; CI: s_lshl_b32
|
|
||||||
; CI: v_or_b32_e32
|
|
||||||
define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
|
define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
|
||||||
%result = lshr <2 x i16> %lhs, %rhs
|
%result = lshr <2 x i16> %lhs, %rhs
|
||||||
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
|
store <2 x i16> %result, <2 x i16> addrspace(1)* %out
|
||||||
|
|
|
@ -206,14 +206,14 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia
|
||||||
; SIFoldOperands should not fold the SGPR copy into the instruction
|
; SIFoldOperands should not fold the SGPR copy into the instruction
|
||||||
; because the implicit immediate already uses the constant bus.
|
; because the implicit immediate already uses the constant bus.
|
||||||
; GCN-LABEL: {{^}}madak_constant_bus_violation:
|
; GCN-LABEL: {{^}}madak_constant_bus_violation:
|
||||||
; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
|
; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x12|0x48}}
|
||||||
; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
|
; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]]
|
||||||
; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
|
; GCN: {{buffer|flat|global}}_load_dword [[VGPR:v[0-9]+]]
|
||||||
; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
|
; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
|
||||||
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
|
; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[MADAK]], [[VGPR]]
|
||||||
; GFX6: buffer_store_dword [[MUL]]
|
; GFX6: buffer_store_dword [[MUL]]
|
||||||
; GFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
|
; GFX8_9: {{flat|global}}_store_dword v[{{[0-9:]+}}], [[MUL]]
|
||||||
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
|
define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, [8 x i32], float %sgpr0, float %sgpr1) #0 {
|
||||||
bb:
|
bb:
|
||||||
%tmp = icmp eq i32 %arg1, 0
|
%tmp = icmp eq i32 %arg1, 0
|
||||||
br i1 %tmp, label %bb3, label %bb4
|
br i1 %tmp, label %bb3, label %bb4
|
||||||
|
|
|
@ -83,7 +83,7 @@ define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out
|
||||||
; GCN-NOT: v_madmk_f32
|
; GCN-NOT: v_madmk_f32
|
||||||
; GCN: v_mac_f32_e32
|
; GCN: v_mac_f32_e32
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
|
define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, [8 x i32], float %a, [8 x i32], float %b) nounwind {
|
||||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
|
||||||
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
|
%out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
|
||||||
|
|
||||||
|
|
|
@ -216,14 +216,14 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %ou
|
||||||
|
|
||||||
; Make sure redundant and removed
|
; Make sure redundant and removed
|
||||||
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16:
|
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umax_ugt_i16:
|
||||||
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]]
|
; SI: s_max_u32 [[MAX:s[0-9]+]], [[A]], [[B]]
|
||||||
; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
|
; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
|
||||||
; SI: buffer_store_dword [[VMAX]]
|
; SI: buffer_store_dword [[VMAX]]
|
||||||
|
|
||||||
; EG: MAX_UINT
|
; EG: MAX_UINT
|
||||||
define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
|
define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) nounwind {
|
||||||
%a.ext = zext i16 %a to i32
|
%a.ext = zext i16 %a to i32
|
||||||
%b.ext = zext i16 %b to i32
|
%b.ext = zext i16 %b to i32
|
||||||
%cmp = icmp ugt i32 %a.ext, %b.ext
|
%cmp = icmp ugt i32 %a.ext, %b.ext
|
||||||
|
@ -236,14 +236,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspac
|
||||||
; Make sure redundant sign_extend_inreg removed.
|
; Make sure redundant sign_extend_inreg removed.
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_max_slt_i16:
|
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_max_slt_i16:
|
||||||
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]]
|
; SI: s_max_i32 [[MAX:s[0-9]+]], [[A]], [[B]]
|
||||||
; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
|
; SI: v_mov_b32_e32 [[VMAX:v[0-9]+]], [[MAX]]
|
||||||
; SI: buffer_store_dword [[VMAX]]
|
; SI: buffer_store_dword [[VMAX]]
|
||||||
|
|
||||||
; EG: MAX_INT
|
; EG: MAX_INT
|
||||||
define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
|
define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) nounwind {
|
||||||
%a.ext = sext i16 %a to i32
|
%a.ext = sext i16 %a to i32
|
||||||
%b.ext = sext i16 %b to i32
|
%b.ext = sext i16 %b to i32
|
||||||
%cmp = icmp sgt i32 %a.ext, %b.ext
|
%cmp = icmp sgt i32 %a.ext, %b.ext
|
||||||
|
@ -262,7 +262,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace
|
||||||
; SI: s_max_i32
|
; SI: s_max_i32
|
||||||
|
|
||||||
; EG: MAX_INT
|
; EG: MAX_INT
|
||||||
define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
|
define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, [8 x i32], i16 %a, [8 x i32], i16 %b) nounwind {
|
||||||
%cmp = icmp sge i16 %a, %b
|
%cmp = icmp sge i16 %a, %b
|
||||||
%val = select i1 %cmp, i16 %a, i16 %b
|
%val = select i1 %cmp, i16 %a, i16 %b
|
||||||
store i16 %val, i16 addrspace(1)* %out
|
store i16 %val, i16 addrspace(1)* %out
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
|
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
|
||||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
|
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
|
||||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
|
; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
|
||||||
; RUN: llc -march=r600 -mtriple=r600---amdgiz -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
; RUN: llc -march=r600 -mtriple=r600-- -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
|
; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
|
||||||
; GCN: v_min_i32_e32
|
; GCN: v_min_i32_e32
|
||||||
|
@ -65,16 +65,14 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <
|
||||||
; GCN: s_sext_i32_i8
|
; GCN: s_sext_i32_i8
|
||||||
; GCN: s_sext_i32_i8
|
; GCN: s_sext_i32_i8
|
||||||
; GCN: s_min_i32
|
; GCN: s_min_i32
|
||||||
define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 {
|
define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, [8 x i32], i8 %a, [8 x i32], i8 %b) #0 {
|
||||||
%cmp = icmp sle i8 %a, %b
|
%cmp = icmp sle i8 %a, %b
|
||||||
%val = select i1 %cmp, i8 %a, i8 %b
|
%val = select i1 %cmp, i8 %a, i8 %b
|
||||||
store i8 %val, i8 addrspace(1)* %out
|
store i8 %val, i8 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; XXX - should be able to use s_min if we stop unnecessarily doing
|
; FIXME: Why vector and sdwa for last element?
|
||||||
; extloads with mubuf instructions.
|
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8:
|
; FUNC-LABEL: {{^}}s_test_imin_sle_v4i8:
|
||||||
; GCN: s_load_dword s
|
; GCN: s_load_dword s
|
||||||
; GCN: s_load_dword s
|
; GCN: s_load_dword s
|
||||||
|
@ -88,7 +86,7 @@ define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %
|
||||||
; VI: s_min_i32
|
; VI: s_min_i32
|
||||||
; VI: s_min_i32
|
; VI: s_min_i32
|
||||||
; VI: s_min_i32
|
; VI: s_min_i32
|
||||||
; VI: s_min_i32
|
; VI: v_min_i32_sdwa
|
||||||
|
|
||||||
; GFX9: v_min_i16
|
; GFX9: v_min_i16
|
||||||
; GFX9: v_min_i16
|
; GFX9: v_min_i16
|
||||||
|
@ -99,7 +97,7 @@ define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %
|
||||||
; EG: MIN_INT
|
; EG: MIN_INT
|
||||||
; EG: MIN_INT
|
; EG: MIN_INT
|
||||||
; EG: MIN_INT
|
; EG: MIN_INT
|
||||||
define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 {
|
define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, [8 x i32], <4 x i8> %a, [8 x i32], <4 x i8> %b) #0 {
|
||||||
%cmp = icmp sle <4 x i8> %a, %b
|
%cmp = icmp sle <4 x i8> %a, %b
|
||||||
%val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
|
%val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
|
||||||
store <4 x i8> %val, <4 x i8> addrspace(1)* %out
|
store <4 x i8> %val, <4 x i8> addrspace(1)* %out
|
||||||
|
@ -110,9 +108,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4
|
||||||
; GCN: s_load_dword s
|
; GCN: s_load_dword s
|
||||||
; GCN: s_load_dword s
|
; GCN: s_load_dword s
|
||||||
|
|
||||||
; SI: s_ashr_i32
|
|
||||||
; SI: s_ashr_i32
|
; SI: s_ashr_i32
|
||||||
; SI: s_sext_i32_i16
|
; SI: s_sext_i32_i16
|
||||||
|
; SI: s_ashr_i32
|
||||||
; SI: s_sext_i32_i16
|
; SI: s_sext_i32_i16
|
||||||
; SI: s_min_i32
|
; SI: s_min_i32
|
||||||
; SI: s_min_i32
|
; SI: s_min_i32
|
||||||
|
@ -346,8 +344,8 @@ define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrs
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}v_test_umin_ult_i8:
|
; FUNC-LABEL: {{^}}v_test_umin_ult_i8:
|
||||||
; SI: buffer_load_ubyte
|
; SI: {{buffer|flat|global}}_load_ubyte
|
||||||
; SI: buffer_load_ubyte
|
; SI: {{buffer|flat|global}}_load_ubyte
|
||||||
; SI: v_min_u32_e32
|
; SI: v_min_u32_e32
|
||||||
|
|
||||||
; GFX89: {{flat|global}}_load_ubyte
|
; GFX89: {{flat|global}}_load_ubyte
|
||||||
|
@ -490,14 +488,14 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <
|
||||||
|
|
||||||
; Make sure redundant and removed
|
; Make sure redundant and removed
|
||||||
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16:
|
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_umin_ult_i16:
|
||||||
; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
|
||||||
; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
|
; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
|
||||||
; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
|
; GCN: s_min_u32 [[MIN:s[0-9]+]], [[A]], [[B]]
|
||||||
; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
|
; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
|
||||||
; GCN: buffer_store_dword [[VMIN]]
|
; GCN: buffer_store_dword [[VMIN]]
|
||||||
|
|
||||||
; EG: MIN_UINT
|
; EG: MIN_UINT
|
||||||
define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
|
define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, [8 x i32], i16 zeroext %a, [8 x i32], i16 zeroext %b) #0 {
|
||||||
%a.ext = zext i16 %a to i32
|
%a.ext = zext i16 %a to i32
|
||||||
%b.ext = zext i16 %b to i32
|
%b.ext = zext i16 %b to i32
|
||||||
%cmp = icmp ult i32 %a.ext, %b.ext
|
%cmp = icmp ult i32 %a.ext, %b.ext
|
||||||
|
@ -510,14 +508,17 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspac
|
||||||
; Make sure redundant sign_extend_inreg removed.
|
; Make sure redundant sign_extend_inreg removed.
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16:
|
; FUNC-LABEL: {{^}}simplify_demanded_bits_test_min_slt_i16:
|
||||||
; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
; GCN-DAG: s_load_dword [[A:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}}
|
||||||
; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0xc|0x30}}
|
; GCN-DAG: s_load_dword [[B:s[0-9]+]], {{s\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
|
||||||
; GCN: s_min_i32 [[MIN:s[0-9]+]], [[A]], [[B]]
|
; GCN-DAG: s_sext_i32_i16 [[EXT_A:s[0-9]+]], [[A]]
|
||||||
|
; GCN-DAG: s_sext_i32_i16 [[EXT_B:s[0-9]+]], [[B]]
|
||||||
|
|
||||||
|
; GCN: s_min_i32 [[MIN:s[0-9]+]], [[EXT_A]], [[EXT_B]]
|
||||||
; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
|
; GCN: v_mov_b32_e32 [[VMIN:v[0-9]+]], [[MIN]]
|
||||||
; GCN: buffer_store_dword [[VMIN]]
|
; GCN: buffer_store_dword [[VMIN]]
|
||||||
|
|
||||||
; EG: MIN_INT
|
; EG: MIN_INT
|
||||||
define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 {
|
define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, [8 x i32], i16 signext %a, [8 x i32], i16 signext %b) #0 {
|
||||||
%a.ext = sext i16 %a to i32
|
%a.ext = sext i16 %a to i32
|
||||||
%b.ext = sext i16 %b to i32
|
%b.ext = sext i16 %b to i32
|
||||||
%cmp = icmp slt i32 %a.ext, %b.ext
|
%cmp = icmp slt i32 %a.ext, %b.ext
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
; resulting in losing the store to gptr
|
; resulting in losing the store to gptr
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}missing_store_reduced:
|
; FUNC-LABEL: {{^}}missing_store_reduced:
|
||||||
; SI: s_load_dwordx2
|
; SI: s_load_dwordx4
|
||||||
; SI: ds_read_b64
|
; SI: ds_read_b64
|
||||||
; SI-DAG: buffer_store_dword
|
; SI-DAG: buffer_store_dword
|
||||||
; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
|
; SI-DAG: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
|
||||||
|
|
|
@ -19,7 +19,7 @@
|
||||||
; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
|
; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
|
||||||
; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}},
|
; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}},
|
||||||
|
|
||||||
define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
|
define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, [8 x i32], i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
|
||||||
bb:
|
bb:
|
||||||
%tmp = icmp sgt i32 %arg3, 0
|
%tmp = icmp sgt i32 %arg3, 0
|
||||||
br i1 %tmp, label %bb4, label %bb17
|
br i1 %tmp, label %bb4, label %bb17
|
||||||
|
|
|
@ -16,7 +16,8 @@ define i16 @v_mul_i16(i16 %a, i16 %b) {
|
||||||
|
|
||||||
; FIXME: Should emit scalar mul or maybe i16 v_mul here
|
; FIXME: Should emit scalar mul or maybe i16 v_mul here
|
||||||
; GCN-LABEL: {{^}}s_mul_i16:
|
; GCN-LABEL: {{^}}s_mul_i16:
|
||||||
; GCN: v_mul_u32_u24
|
; SI: v_mul_u32_u24
|
||||||
|
; VI: s_mul_i16
|
||||||
define amdgpu_kernel void @s_mul_i16(i16 %a, i16 %b) {
|
define amdgpu_kernel void @s_mul_i16(i16 %a, i16 %b) {
|
||||||
%r.val = mul i16 %a, %b
|
%r.val = mul i16 %a, %b
|
||||||
store volatile i16 %r.val, i16 addrspace(1)* null
|
store volatile i16 %r.val, i16 addrspace(1)* null
|
||||||
|
|
|
@ -114,7 +114,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 a
|
||||||
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
|
; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
|
||||||
; GCN: buffer_store_dword [[VRESULT]],
|
; GCN: buffer_store_dword [[VRESULT]],
|
||||||
; GCN: s_endpgm
|
; GCN: s_endpgm
|
||||||
define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
|
define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) nounwind {
|
||||||
%mul = mul i32 %a, %b
|
%mul = mul i32 %a, %b
|
||||||
store i32 %mul, i32 addrspace(1)* %out, align 4
|
store i32 %mul, i32 addrspace(1)* %out, align 4
|
||||||
ret void
|
ret void
|
||||||
|
@ -201,10 +201,8 @@ endif:
|
||||||
|
|
||||||
; FIXME: Load dwordx4
|
; FIXME: Load dwordx4
|
||||||
; FUNC-LABEL: {{^}}s_mul_i128:
|
; FUNC-LABEL: {{^}}s_mul_i128:
|
||||||
; GCN: s_load_dwordx2
|
; GCN: s_load_dwordx4
|
||||||
; GCN: s_load_dwordx2
|
; GCN: s_load_dwordx4
|
||||||
; GCN: s_load_dwordx2
|
|
||||||
; GCN: s_load_dwordx2
|
|
||||||
|
|
||||||
; SI: v_mul_hi_u32
|
; SI: v_mul_hi_u32
|
||||||
; SI: v_mul_hi_u32
|
; SI: v_mul_hi_u32
|
||||||
|
@ -220,18 +218,23 @@ endif:
|
||||||
; SI-DAG: s_mul_i32
|
; SI-DAG: s_mul_i32
|
||||||
; SI-DAG: v_mul_hi_u32
|
; SI-DAG: v_mul_hi_u32
|
||||||
|
|
||||||
; VI: s_mul_i32
|
|
||||||
; VI: v_mul_hi_u32
|
; VI: v_mul_hi_u32
|
||||||
; VI: s_mul_i32
|
; VI: s_mul_i32
|
||||||
|
; VI: s_mul_i32
|
||||||
; VI: v_mul_hi_u32
|
; VI: v_mul_hi_u32
|
||||||
|
; VI: v_mul_hi_u32
|
||||||
|
; VI: s_mul_i32
|
||||||
; VI: v_mad_u64_u32
|
; VI: v_mad_u64_u32
|
||||||
|
; VI: s_mul_i32
|
||||||
; VI: v_mad_u64_u32
|
; VI: v_mad_u64_u32
|
||||||
|
; VI: s_mul_i32
|
||||||
|
; VI: s_mul_i32
|
||||||
; VI: v_mad_u64_u32
|
; VI: v_mad_u64_u32
|
||||||
|
; VI: s_mul_i32
|
||||||
|
|
||||||
|
|
||||||
; GCN: buffer_store_dwordx4
|
; GCN: buffer_store_dwordx4
|
||||||
define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
|
define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, [8 x i32], i128 %a, [8 x i32], i128 %b) nounwind #0 {
|
||||||
%mul = mul i128 %a, %b
|
%mul = mul i128 %a, %b
|
||||||
store i128 %mul, i128 addrspace(1)* %out
|
store i128 %mul, i128 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -70,7 +70,7 @@ entry:
|
||||||
; GCN-DAG: v_mul_i32_i24_e32
|
; GCN-DAG: v_mul_i32_i24_e32
|
||||||
|
|
||||||
; GCN: buffer_store_dwordx2
|
; GCN: buffer_store_dwordx2
|
||||||
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
|
define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, [8 x i32], i32 %a, [8 x i32], i32 %b) #0 {
|
||||||
%shl.i = shl i32 %a, 8
|
%shl.i = shl i32 %a, 8
|
||||||
%shr.i = ashr i32 %shl.i, 8
|
%shr.i = ashr i32 %shl.i, 8
|
||||||
%conv.i = sext i32 %shr.i to i64
|
%conv.i = sext i32 %shr.i to i64
|
||||||
|
|
|
@ -18,8 +18,11 @@ entry:
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}test_umul24_i16_sext:
|
; FUNC-LABEL: {{^}}test_umul24_i16_sext:
|
||||||
; GCN: v_mul_u32_u24_e{{(32|64)}} [[VI_MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
; SI: v_mul_u32_u24_e{{(32|64)}} [[VI_MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
|
||||||
; GCN: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16
|
; SI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16
|
||||||
|
|
||||||
|
; VI: s_mul_i32 [[MUL:s[0-9]+]]
|
||||||
|
; VI: s_sext_i32_i16 s{{[0-9]+}}, [[MUL]]
|
||||||
define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
|
define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
|
||||||
entry:
|
entry:
|
||||||
%mul = mul i16 %a, %b
|
%mul = mul i16 %a, %b
|
||||||
|
@ -46,9 +49,12 @@ define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}test_umul24_i16:
|
; FUNC-LABEL: {{^}}test_umul24_i16:
|
||||||
; GCN: s_and_b32
|
; SI: s_and_b32
|
||||||
; GCN: v_mul_u32_u24_e32
|
; SI: v_mul_u32_u24_e32
|
||||||
; GCN: v_and_b32_e32
|
; SI: v_and_b32_e32
|
||||||
|
|
||||||
|
; VI: s_mul_i32
|
||||||
|
; VI: s_and_b32
|
||||||
define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
|
define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
|
||||||
entry:
|
entry:
|
||||||
%mul = mul i16 %a, %b
|
%mul = mul i16 %a, %b
|
||||||
|
@ -147,7 +153,7 @@ entry:
|
||||||
; GCN-NOT: s_and_b32
|
; GCN-NOT: s_and_b32
|
||||||
; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
|
; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
|
||||||
; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
|
; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
|
||||||
define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) {
|
define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
|
||||||
entry:
|
entry:
|
||||||
%tmp0 = shl i64 %a, 40
|
%tmp0 = shl i64 %a, 40
|
||||||
%a.24 = lshr i64 %tmp0, 40
|
%a.24 = lshr i64 %tmp0, 40
|
||||||
|
|
|
@ -70,14 +70,14 @@
|
||||||
; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
|
; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]
|
||||||
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
|
; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
|
||||||
|
|
||||||
; GCN: ; %Flow1
|
; GCN: ; %Flow5
|
||||||
; GCN-NEXT: s_or_b64 exec, exec
|
; GCN-NEXT: s_or_b64 exec, exec
|
||||||
; GCN: v_cmp_ne_u32_e32 vcc, 0
|
; GCN: v_cmp_ne_u32_e32 vcc, 0
|
||||||
|
|
||||||
; GCN: ; %exit1
|
; GCN: ; %exit1
|
||||||
; GCN: ds_write_b32
|
; GCN: ds_write_b32
|
||||||
|
|
||||||
; GCN: %Flow2
|
; GCN: %Flow6
|
||||||
; GCN-NEXT: s_or_b64 exec, exec
|
; GCN-NEXT: s_or_b64 exec, exec
|
||||||
; GCN: v_cmp_ne_u32_e32 vcc, 0
|
; GCN: v_cmp_ne_u32_e32 vcc, 0
|
||||||
; GCN-NEXT: s_and_saveexec_b64
|
; GCN-NEXT: s_and_saveexec_b64
|
||||||
|
|
|
@ -78,7 +78,7 @@ define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out,
|
||||||
; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
|
; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dword s
|
||||||
; SI: buffer_store_dword v
|
; SI: buffer_store_dword v
|
||||||
define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
|
define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
|
||||||
%trunc = trunc i64 %arg to i32
|
%trunc = trunc i64 %arg to i32
|
||||||
store i32 %trunc, i32 addrspace(1)* %out
|
store i32 %trunc, i32 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
@ -100,7 +100,7 @@ define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %ou
|
||||||
; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
|
; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dword s
|
||||||
; SI: buffer_store_dword v
|
; SI: buffer_store_dword v
|
||||||
define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
|
define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
|
||||||
%srl = lshr i64 %arg, 32
|
%srl = lshr i64 %arg, 32
|
||||||
%trunc = trunc i64 %srl to i32
|
%trunc = trunc i64 %srl to i32
|
||||||
store i32 %trunc, i32 addrspace(1)* %out
|
store i32 %trunc, i32 addrspace(1)* %out
|
||||||
|
@ -147,7 +147,7 @@ define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out,
|
||||||
; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
|
; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dword s
|
||||||
; SI: buffer_store_byte v
|
; SI: buffer_store_byte v
|
||||||
define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
|
define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
|
||||||
%srl = lshr i64 %arg, 32
|
%srl = lshr i64 %arg, 32
|
||||||
%trunc = trunc i64 %srl to i8
|
%trunc = trunc i64 %srl to i8
|
||||||
store i8 %trunc, i8 addrspace(1)* %out
|
store i8 %trunc, i8 addrspace(1)* %out
|
||||||
|
@ -171,7 +171,7 @@ define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64
|
||||||
; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
|
; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
|
||||||
; SI: s_load_dword s
|
; SI: s_load_dword s
|
||||||
; SI: buffer_store_byte v
|
; SI: buffer_store_byte v
|
||||||
define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
|
define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, [8 x i32], i64 %arg) nounwind {
|
||||||
%trunc = trunc i64 %arg to i8
|
%trunc = trunc i64 %arg to i8
|
||||||
store i8 %trunc, i8 addrspace(1)* %out
|
store i8 %trunc, i8 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -6,7 +6,7 @@
|
||||||
; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
|
; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
|
||||||
; GCN: flat_load_dword v{{[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
|
; GCN: flat_load_dword v{{[0-9]+}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
|
||||||
|
|
||||||
define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, i32 addrspace(1)* nocapture %arg1) {
|
define amdgpu_kernel void @volatile_load(i32 addrspace(1)* %arg, [8 x i32], i32 addrspace(1)* nocapture %arg1) {
|
||||||
bb:
|
bb:
|
||||||
%tmp18 = load volatile i32, i32 addrspace(1)* %arg, align 4
|
%tmp18 = load volatile i32, i32 addrspace(1)* %arg, align 4
|
||||||
%tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 5
|
%tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 5
|
||||||
|
|
|
@ -4,14 +4,18 @@
|
||||||
; Make sure there isn't an extra space between the instruction name and first operands.
|
; Make sure there isn't an extra space between the instruction name and first operands.
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}add_f32:
|
; GCN-LABEL: {{^}}add_f32:
|
||||||
; SI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1c
|
||||||
; SI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
; SI: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
|
||||||
; VI-DAG: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
; SI: v_mov_b32_e32 [[VREGA:v[0-9]+]], [[SREGA]]
|
||||||
; VI-DAG: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x30
|
; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGB]], [[VREGA]]
|
||||||
; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
|
|
||||||
; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
|
; VI: s_load_dword [[SREGA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
|
||||||
|
; VI: s_load_dword [[SREGB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x70
|
||||||
|
; VI: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
|
||||||
|
; VI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
|
||||||
|
|
||||||
; GCN: buffer_store_dword [[RESULT]],
|
; GCN: buffer_store_dword [[RESULT]],
|
||||||
define amdgpu_kernel void @add_f32(float addrspace(1)* %out, float %a, float %b) {
|
define amdgpu_kernel void @add_f32(float addrspace(1)* %out, [8 x i32], float %a, [8 x i32], float %b) {
|
||||||
%result = fadd float %a, %b
|
%result = fadd float %a, %b
|
||||||
store float %result, float addrspace(1)* %out
|
store float %result, float addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
|
|
@ -63,26 +63,26 @@ define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a)
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}scalar_or_literal_i64:
|
; FUNC-LABEL: {{^}}scalar_or_literal_i64:
|
||||||
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
|
||||||
; SI-DAG: s_or_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b
|
; SI-DAG: s_or_b32 s[[RES_HI:[0-9]+]], s[[HI]], 0xf237b
|
||||||
; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
|
; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
|
||||||
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
|
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
|
||||||
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
|
; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
|
||||||
define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) {
|
define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
|
||||||
%or = or i64 %a, 4261135838621753
|
%or = or i64 %a, 4261135838621753
|
||||||
store i64 %or, i64 addrspace(1)* %out
|
store i64 %or, i64 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}scalar_or_literal_multi_use_i64:
|
; FUNC-LABEL: {{^}}scalar_or_literal_multi_use_i64:
|
||||||
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
; SI: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
|
||||||
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b
|
; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xf237b
|
||||||
; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039
|
; SI-DAG: s_movk_i32 s[[K_LO:[0-9]+]], 0x3039
|
||||||
; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
|
; SI: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
|
||||||
|
|
||||||
; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
|
; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
|
||||||
; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
|
; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
|
||||||
define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
|
define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
|
||||||
%or = or i64 %a, 4261135838621753
|
%or = or i64 %a, 4261135838621753
|
||||||
store i64 %or, i64 addrspace(1)* %out
|
store i64 %or, i64 addrspace(1)* %out
|
||||||
|
|
||||||
|
@ -92,7 +92,7 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %ou
|
||||||
}
|
}
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}scalar_or_inline_imm_i64:
|
; FUNC-LABEL: {{^}}scalar_or_inline_imm_i64:
|
||||||
; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
|
; SI: s_load_dwordx2 s{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, {{0x13|0x4c}}
|
||||||
; SI-NOT: or_b32
|
; SI-NOT: or_b32
|
||||||
; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63
|
; SI: s_or_b32 s[[VAL_LO]], s[[VAL_LO]], 63
|
||||||
; SI-NOT: or_b32
|
; SI-NOT: or_b32
|
||||||
|
@ -101,7 +101,7 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %ou
|
||||||
; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
|
; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
|
||||||
; SI-NOT: or_b32
|
; SI-NOT: or_b32
|
||||||
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
|
; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
|
||||||
define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
|
define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
|
||||||
%or = or i64 %a, 63
|
%or = or i64 %a, 63
|
||||||
store i64 %or, i64 addrspace(1)* %out
|
store i64 %or, i64 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
@ -125,7 +125,7 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)*
|
||||||
; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}}
|
; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}}
|
||||||
; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]]
|
; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]]
|
||||||
; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
|
; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
|
||||||
define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
|
define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, [8 x i32], i64 %a) {
|
||||||
%or = or i64 %a, -8
|
%or = or i64 %a, -8
|
||||||
store i64 %or, i64 addrspace(1)* %out
|
store i64 %or, i64 addrspace(1)* %out
|
||||||
ret void
|
ret void
|
||||||
|
@ -239,7 +239,7 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64
|
||||||
; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
|
; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
|
||||||
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
|
; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
|
||||||
; SI: buffer_store_dword [[VRESULT]],
|
; SI: buffer_store_dword [[VRESULT]],
|
||||||
define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
|
define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, [8 x i32], i64 %a, [8 x i32], i64 %b) {
|
||||||
%add = or i64 %b, %a
|
%add = or i64 %b, %a
|
||||||
%trunc = trunc i64 %add to i32
|
%trunc = trunc i64 %add to i32
|
||||||
store i32 %trunc, i32 addrspace(1)* %out, align 8
|
store i32 %trunc, i32 addrspace(1)* %out, align 8
|
||||||
|
@ -249,7 +249,7 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i
|
||||||
; FUNC-LABEL: {{^}}or_i1:
|
; FUNC-LABEL: {{^}}or_i1:
|
||||||
; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}}
|
; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}}
|
||||||
|
|
||||||
; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
|
; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], vcc
|
||||||
define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
|
define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
|
||||||
%a = load float, float addrspace(1)* %in0
|
%a = load float, float addrspace(1)* %in0
|
||||||
%b = load float, float addrspace(1)* %in1
|
%b = load float, float addrspace(1)* %in1
|
||||||
|
|
|
@ -10,26 +10,26 @@
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}spill_sgprs_to_multiple_vgprs:
|
; GCN-LABEL: {{^}}spill_sgprs_to_multiple_vgprs:
|
||||||
|
|
||||||
; GCN: def s[8:15]
|
; GCN: def s[4:11]
|
||||||
; GCN: def s[16:23]
|
; GCN: def s[12:19]
|
||||||
; GCN: def s[24:31]
|
; GCN: def s[20:27]
|
||||||
; GCN: def s[32:39]
|
; GCN: def s[28:35]
|
||||||
; GCN: def s[40:47]
|
; GCN: def s[36:43]
|
||||||
; GCN: def s[48:55]
|
; GCN: def s[44:51]
|
||||||
; GCN: def s[56:63]
|
; GCN: def s[52:59]
|
||||||
; GCN: def s[64:71]
|
; GCN: def s[60:67]
|
||||||
; GCN: def s[72:79]
|
; GCN: def s[68:75]
|
||||||
; GCN: def s[80:87]
|
; GCN: def s[76:83]
|
||||||
; GCN: def s[88:95]
|
; GCN: def s[84:91]
|
||||||
|
|
||||||
; GCN: v_writelane_b32 v0, s8, 0
|
; GCN: v_writelane_b32 v0, s4, 0
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s9, 1
|
; GCN-NEXT: v_writelane_b32 v0, s5, 1
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s10, 2
|
; GCN-NEXT: v_writelane_b32 v0, s6, 2
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s11, 3
|
; GCN-NEXT: v_writelane_b32 v0, s7, 3
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s12, 4
|
; GCN-NEXT: v_writelane_b32 v0, s8, 4
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s13, 5
|
; GCN-NEXT: v_writelane_b32 v0, s9, 5
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s14, 6
|
; GCN-NEXT: v_writelane_b32 v0, s10, 6
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s15, 7
|
; GCN-NEXT: v_writelane_b32 v0, s11, 7
|
||||||
|
|
||||||
; GCN: def s{{\[}}[[TMP_LO:[0-9]+]]:[[TMP_HI:[0-9]+]]{{\]}}
|
; GCN: def s{{\[}}[[TMP_LO:[0-9]+]]:[[TMP_HI:[0-9]+]]{{\]}}
|
||||||
; GCN: v_writelane_b32 v0, s[[TMP_LO]], 8
|
; GCN: v_writelane_b32 v0, s[[TMP_LO]], 8
|
||||||
|
@ -37,8 +37,8 @@
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 10
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 10
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 11
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 11
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 12
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 12
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s13, 13
|
; GCN-NEXT: v_writelane_b32 v0, s9, 13
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s14, 14
|
; GCN-NEXT: v_writelane_b32 v0, s10, 14
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 15
|
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 15
|
||||||
|
|
||||||
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
|
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
|
||||||
|
@ -47,8 +47,8 @@
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 18
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 18
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 19
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 19
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 20
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 20
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s13, 21
|
; GCN-NEXT: v_writelane_b32 v0, s9, 21
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s14, 22
|
; GCN-NEXT: v_writelane_b32 v0, s10, 22
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 23
|
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 23
|
||||||
|
|
||||||
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
|
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
|
||||||
|
@ -57,8 +57,8 @@
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 26
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 26
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 27
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 27
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 28
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 28
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s13, 29
|
; GCN-NEXT: v_writelane_b32 v0, s9, 29
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s14, 30
|
; GCN-NEXT: v_writelane_b32 v0, s10, 30
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 31
|
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 31
|
||||||
|
|
||||||
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
|
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
|
||||||
|
@ -67,8 +67,8 @@
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 34
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 34
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 35
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 35
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 36
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 36
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s13, 37
|
; GCN-NEXT: v_writelane_b32 v0, s9, 37
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s14, 38
|
; GCN-NEXT: v_writelane_b32 v0, s10, 38
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 39
|
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 39
|
||||||
|
|
||||||
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
|
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
|
||||||
|
@ -77,8 +77,8 @@
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 42
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 42
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 43
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 43
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 44
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 44
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s13, 45
|
; GCN-NEXT: v_writelane_b32 v0, s9, 45
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s14, 46
|
; GCN-NEXT: v_writelane_b32 v0, s10, 46
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47
|
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47
|
||||||
|
|
||||||
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
|
; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
|
||||||
|
@ -87,90 +87,90 @@
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52
|
; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s13, 53
|
; GCN-NEXT: v_writelane_b32 v0, s9, 53
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s14, 54
|
; GCN-NEXT: v_writelane_b32 v0, s10, 54
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55
|
; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55
|
||||||
|
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s88, 56
|
; GCN-NEXT: v_writelane_b32 v0, s84, 56
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s89, 57
|
; GCN-NEXT: v_writelane_b32 v0, s85, 57
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s90, 58
|
; GCN-NEXT: v_writelane_b32 v0, s86, 58
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s91, 59
|
; GCN-NEXT: v_writelane_b32 v0, s87, 59
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s92, 60
|
; GCN-NEXT: v_writelane_b32 v0, s88, 60
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s93, 61
|
; GCN-NEXT: v_writelane_b32 v0, s89, 61
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s94, 62
|
; GCN-NEXT: v_writelane_b32 v0, s90, 62
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s95, 63
|
; GCN-NEXT: v_writelane_b32 v0, s91, 63
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s16, 0
|
; GCN-NEXT: v_writelane_b32 v1, s12, 0
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s17, 1
|
; GCN-NEXT: v_writelane_b32 v1, s13, 1
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s18, 2
|
; GCN-NEXT: v_writelane_b32 v1, s14, 2
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s19, 3
|
; GCN-NEXT: v_writelane_b32 v1, s15, 3
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s20, 4
|
; GCN-NEXT: v_writelane_b32 v1, s16, 4
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s21, 5
|
; GCN-NEXT: v_writelane_b32 v1, s17, 5
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s22, 6
|
; GCN-NEXT: v_writelane_b32 v1, s18, 6
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s23, 7
|
; GCN-NEXT: v_writelane_b32 v1, s19, 7
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s24, 8
|
; GCN-NEXT: v_writelane_b32 v1, s20, 8
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s25, 9
|
; GCN-NEXT: v_writelane_b32 v1, s21, 9
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s26, 10
|
; GCN-NEXT: v_writelane_b32 v1, s22, 10
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s27, 11
|
; GCN-NEXT: v_writelane_b32 v1, s23, 11
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s28, 12
|
; GCN-NEXT: v_writelane_b32 v1, s24, 12
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s29, 13
|
; GCN-NEXT: v_writelane_b32 v1, s25, 13
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s30, 14
|
; GCN-NEXT: v_writelane_b32 v1, s26, 14
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s31, 15
|
; GCN-NEXT: v_writelane_b32 v1, s27, 15
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s32, 16
|
; GCN-NEXT: v_writelane_b32 v1, s28, 16
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s33, 17
|
; GCN-NEXT: v_writelane_b32 v1, s29, 17
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s34, 18
|
; GCN-NEXT: v_writelane_b32 v1, s30, 18
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s35, 19
|
; GCN-NEXT: v_writelane_b32 v1, s31, 19
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s36, 20
|
; GCN-NEXT: v_writelane_b32 v1, s32, 20
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s37, 21
|
; GCN-NEXT: v_writelane_b32 v1, s33, 21
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s38, 22
|
; GCN-NEXT: v_writelane_b32 v1, s34, 22
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s39, 23
|
; GCN-NEXT: v_writelane_b32 v1, s35, 23
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s40, 24
|
; GCN-NEXT: v_writelane_b32 v1, s36, 24
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s41, 25
|
; GCN-NEXT: v_writelane_b32 v1, s37, 25
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s42, 26
|
; GCN-NEXT: v_writelane_b32 v1, s38, 26
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s43, 27
|
; GCN-NEXT: v_writelane_b32 v1, s39, 27
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s44, 28
|
; GCN-NEXT: v_writelane_b32 v1, s40, 28
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s45, 29
|
; GCN-NEXT: v_writelane_b32 v1, s41, 29
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s46, 30
|
; GCN-NEXT: v_writelane_b32 v1, s42, 30
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s47, 31
|
; GCN-NEXT: v_writelane_b32 v1, s43, 31
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s48, 32
|
; GCN-NEXT: v_writelane_b32 v1, s44, 32
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s49, 33
|
; GCN-NEXT: v_writelane_b32 v1, s45, 33
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s50, 34
|
; GCN-NEXT: v_writelane_b32 v1, s46, 34
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s51, 35
|
; GCN-NEXT: v_writelane_b32 v1, s47, 35
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s52, 36
|
; GCN-NEXT: v_writelane_b32 v1, s48, 36
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s53, 37
|
; GCN-NEXT: v_writelane_b32 v1, s49, 37
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s54, 38
|
; GCN-NEXT: v_writelane_b32 v1, s50, 38
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s55, 39
|
; GCN-NEXT: v_writelane_b32 v1, s51, 39
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s56, 40
|
; GCN-NEXT: v_writelane_b32 v1, s52, 40
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s57, 41
|
; GCN-NEXT: v_writelane_b32 v1, s53, 41
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s58, 42
|
; GCN-NEXT: v_writelane_b32 v1, s54, 42
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s59, 43
|
; GCN-NEXT: v_writelane_b32 v1, s55, 43
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s60, 44
|
; GCN-NEXT: v_writelane_b32 v1, s56, 44
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s61, 45
|
; GCN-NEXT: v_writelane_b32 v1, s57, 45
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s62, 46
|
; GCN-NEXT: v_writelane_b32 v1, s58, 46
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s63, 47
|
; GCN-NEXT: v_writelane_b32 v1, s59, 47
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s64, 48
|
; GCN-NEXT: v_writelane_b32 v1, s60, 48
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s65, 49
|
; GCN-NEXT: v_writelane_b32 v1, s61, 49
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s66, 50
|
; GCN-NEXT: v_writelane_b32 v1, s62, 50
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s67, 51
|
; GCN-NEXT: v_writelane_b32 v1, s63, 51
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s68, 52
|
; GCN-NEXT: v_writelane_b32 v1, s64, 52
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s69, 53
|
; GCN-NEXT: v_writelane_b32 v1, s65, 53
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s70, 54
|
; GCN-NEXT: v_writelane_b32 v1, s66, 54
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s71, 55
|
; GCN-NEXT: v_writelane_b32 v1, s67, 55
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s72, 56
|
; GCN-NEXT: v_writelane_b32 v1, s68, 56
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s73, 57
|
; GCN-NEXT: v_writelane_b32 v1, s69, 57
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s74, 58
|
; GCN-NEXT: v_writelane_b32 v1, s70, 58
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s75, 59
|
; GCN-NEXT: v_writelane_b32 v1, s71, 59
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s76, 60
|
; GCN-NEXT: v_writelane_b32 v1, s72, 60
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s77, 61
|
; GCN-NEXT: v_writelane_b32 v1, s73, 61
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s78, 62
|
; GCN-NEXT: v_writelane_b32 v1, s74, 62
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s79, 63
|
; GCN-NEXT: v_writelane_b32 v1, s75, 63
|
||||||
; GCN-NEXT: v_writelane_b32 v2, s80, 0
|
; GCN-NEXT: v_writelane_b32 v2, s76, 0
|
||||||
; GCN-NEXT: v_writelane_b32 v2, s81, 1
|
; GCN-NEXT: v_writelane_b32 v2, s77, 1
|
||||||
; GCN-NEXT: v_writelane_b32 v2, s82, 2
|
; GCN-NEXT: v_writelane_b32 v2, s78, 2
|
||||||
; GCN-NEXT: v_writelane_b32 v2, s83, 3
|
; GCN-NEXT: v_writelane_b32 v2, s79, 3
|
||||||
; GCN-NEXT: v_writelane_b32 v2, s84, 4
|
; GCN-NEXT: v_writelane_b32 v2, s80, 4
|
||||||
; GCN-NEXT: v_writelane_b32 v2, s85, 5
|
; GCN-NEXT: v_writelane_b32 v2, s81, 5
|
||||||
; GCN-NEXT: v_writelane_b32 v2, s86, 6
|
; GCN-NEXT: v_writelane_b32 v2, s82, 6
|
||||||
; GCN-NEXT: v_writelane_b32 v2, s87, 7
|
; GCN-NEXT: v_writelane_b32 v2, s83, 7
|
||||||
; GCN: s_cbranch_scc1
|
; GCN: s_cbranch_scc1
|
||||||
|
|
||||||
|
|
||||||
|
@ -393,24 +393,25 @@ ret:
|
||||||
; into the next available VGPR.
|
; into the next available VGPR.
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}split_sgpr_spill_2_vgprs:
|
; GCN-LABEL: {{^}}split_sgpr_spill_2_vgprs:
|
||||||
; GCN: def s[24:39]
|
; GCN: def s[4:19]
|
||||||
|
; GCN: def s[20:35]
|
||||||
|
|
||||||
; GCN: v_writelane_b32 v0, s24, 50
|
; GCN: v_writelane_b32 v0, s4, 50
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s25, 51
|
; GCN-NEXT: v_writelane_b32 v0, s5, 51
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s26, 52
|
; GCN-NEXT: v_writelane_b32 v0, s6, 52
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s27, 53
|
; GCN-NEXT: v_writelane_b32 v0, s7, 53
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s28, 54
|
; GCN-NEXT: v_writelane_b32 v0, s8, 54
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s29, 55
|
; GCN-NEXT: v_writelane_b32 v0, s9, 55
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s30, 56
|
; GCN-NEXT: v_writelane_b32 v0, s10, 56
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s31, 57
|
; GCN-NEXT: v_writelane_b32 v0, s11, 57
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s32, 58
|
; GCN-NEXT: v_writelane_b32 v0, s12, 58
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s33, 59
|
; GCN-NEXT: v_writelane_b32 v0, s13, 59
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s34, 60
|
; GCN-NEXT: v_writelane_b32 v0, s14, 60
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s35, 61
|
; GCN-NEXT: v_writelane_b32 v0, s15, 61
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s36, 62
|
; GCN-NEXT: v_writelane_b32 v0, s16, 62
|
||||||
; GCN-NEXT: v_writelane_b32 v0, s37, 63
|
; GCN-NEXT: v_writelane_b32 v0, s17, 63
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s38, 0
|
; GCN-NEXT: v_writelane_b32 v1, s18, 0
|
||||||
; GCN-NEXT: v_writelane_b32 v1, s39, 1
|
; GCN-NEXT: v_writelane_b32 v1, s19, 1
|
||||||
|
|
||||||
; GCN: v_readlane_b32 s4, v0, 50
|
; GCN: v_readlane_b32 s4, v0, 50
|
||||||
; GCN-NEXT: v_readlane_b32 s5, v0, 51
|
; GCN-NEXT: v_readlane_b32 s5, v0, 51
|
||||||
|
|
|
@ -40,8 +40,7 @@ define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)*
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
|
; GCN-LABEL: {{^}}store_v4i16_as_v2i32_align_4:
|
||||||
; GCN: s_load_dword s
|
; GCN: s_load_dword s
|
||||||
; GCN-NEXT: s_load_dword s
|
; GCN-NEXT: s_load_dwordx2 s
|
||||||
; GCN-NEXT: s_load_dword s
|
|
||||||
; GCN-NOT: {{buffer|flat|global}}
|
; GCN-NOT: {{buffer|flat|global}}
|
||||||
|
|
||||||
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
|
; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_sad_u32_pat1:
|
; GCN-LABEL: {{^}}v_sad_u32_pat1:
|
||||||
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
|
@ -203,8 +203,11 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i1
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
|
; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
|
||||||
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) {
|
define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out) {
|
||||||
|
%a = load volatile i16, i16 addrspace(1)* undef
|
||||||
|
%b = load volatile i16, i16 addrspace(1)* undef
|
||||||
|
%c = load volatile i16, i16 addrspace(1)* undef
|
||||||
%icmp0 = icmp ugt i16 %a, %b
|
%icmp0 = icmp ugt i16 %a, %b
|
||||||
%sub0 = sub i16 %a, %b
|
%sub0 = sub i16 %a, %b
|
||||||
%sub1 = sub i16 %b, %a
|
%sub1 = sub i16 %b, %a
|
||||||
|
@ -233,8 +236,31 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b
|
||||||
}
|
}
|
||||||
|
|
||||||
; GCN-LABEL: {{^}}v_sad_u32_i8_pat2:
|
; GCN-LABEL: {{^}}v_sad_u32_i8_pat2:
|
||||||
; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
; GCN: v_sad_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||||
define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
|
define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out) {
|
||||||
|
%a = load volatile i8, i8 addrspace(1)* undef
|
||||||
|
%b = load volatile i8, i8 addrspace(1)* undef
|
||||||
|
%c = load volatile i8, i8 addrspace(1)* undef
|
||||||
|
%icmp0 = icmp ugt i8 %a, %b
|
||||||
|
%sub0 = sub i8 %a, %b
|
||||||
|
%sub1 = sub i8 %b, %a
|
||||||
|
%ret0 = select i1 %icmp0, i8 %sub0, i8 %sub1
|
||||||
|
|
||||||
|
%ret = add i8 %ret0, %c
|
||||||
|
|
||||||
|
store i8 %ret, i8 addrspace(1)* %out
|
||||||
|
ret void
|
||||||
|
}
|
||||||
|
|
||||||
|
; GCN-LABEL: {{^}}s_sad_u32_i8_pat2:
|
||||||
|
; GCN: s_load_dword
|
||||||
|
; GCN: s_bfe_u32
|
||||||
|
; GCN: s_sub_i32
|
||||||
|
; GCN: s_and_b32
|
||||||
|
; GCN: s_sub_i32
|
||||||
|
; GCN: s_lshr_b32
|
||||||
|
; GCN: v_add_i32_e32
|
||||||
|
define amdgpu_kernel void @s_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
|
||||||
%icmp0 = icmp ugt i8 %a, %b
|
%icmp0 = icmp ugt i8 %a, %b
|
||||||
%sub0 = sub i8 %a, %b
|
%sub0 = sub i8 %a, %b
|
||||||
%sub1 = sub i8 %b, %a
|
%sub1 = sub i8 %b, %a
|
||||||
|
|
|
@ -2,14 +2,11 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
|
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=VI -check-prefix=GCN %s
|
||||||
|
|
||||||
; FUNC-LABEL: {{^}}cluster_arg_loads:
|
; FUNC-LABEL: {{^}}cluster_arg_loads:
|
||||||
; SI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
|
; SI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x9
|
||||||
; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
; SI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||||
; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
|
||||||
; SI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xe
|
; VI: s_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
|
||||||
; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
|
; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
||||||
; VI-NEXT: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
|
||||||
; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
|
|
||||||
; VI-NEXT: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
|
|
||||||
define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
|
define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
|
||||||
store i32 %x, i32 addrspace(1)* %out0, align 4
|
store i32 %x, i32 addrspace(1)* %out0, align 4
|
||||||
store i32 %y, i32 addrspace(1)* %out1, align 4
|
store i32 %y, i32 addrspace(1)* %out1, align 4
|
||||||
|
@ -42,7 +39,7 @@ define amdgpu_kernel void @same_base_ptr_crash(i64 addrspace(1)* %out,
|
||||||
i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119,
|
i64 %arg112, i64 %arg113, i64 %arg114, i64 %arg115, i64 %arg116, i64 %arg117, i64 %arg118, i64 %arg119,
|
||||||
i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) {
|
i64 %arg120, i64 %arg121, i64 %arg122, i64 %arg123, i64 %arg124, i64 %arg125, i64 %arg126) {
|
||||||
entry:
|
entry:
|
||||||
%value = add i64 %arg125, %arg126
|
%value = add i64 %arg124, %arg126
|
||||||
store i64 %value, i64 addrspace(1)* %out, align 8
|
store i64 %value, i64 addrspace(1)* %out, align 8
|
||||||
ret void
|
ret void
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,10 +1,13 @@
|
||||||
; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
|
; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MINREG %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
|
; RUN: llc -march=amdgcn -mcpu=tahiti -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=SI,SI-MAXOCC %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
|
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MINREG %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
|
; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,VI-MAXOCC %s
|
||||||
|
|
||||||
; SI: NumSgprs: {{[1-9]$}}
|
; SI-MINREG: NumSgprs: {{[1-9]$}}
|
||||||
; SI: NumVgprs: {{[1-9]$}}
|
; SI-MINREG: NumVgprs: {{[1-9]$}}
|
||||||
|
|
||||||
|
; SI-MAXOCC: NumSgprs: {{[0-4][0-9]$}}
|
||||||
|
; SI-MAXOCC: NumVgprs: {{[0-4][0-9]$}}
|
||||||
|
|
||||||
; stores may alias loads
|
; stores may alias loads
|
||||||
; VI: NumSgprs: {{[0-9]$}}
|
; VI: NumSgprs: {{[0-9]$}}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
||||||
; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
|
||||||
|
|
||||||
; FIXME: This should go in existing select.ll test, except the current testcase there is broken on GCN
|
; FIXME: This should go in existing select.ll test, except the current testcase there is broken on GCN
|
||||||
|
|
||||||
|
@ -18,12 +18,12 @@ define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1
|
||||||
; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8
|
; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8
|
||||||
; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16
|
; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16
|
||||||
; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]]
|
; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]]
|
||||||
; GCN-DAG: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
|
; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
|
||||||
; GCN-DAG: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
|
; GCN: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
|
||||||
; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1
|
; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1
|
||||||
; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]]
|
; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]]
|
||||||
; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]]
|
; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]]
|
||||||
define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
|
define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, [8 x i32], i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
|
||||||
%cmp = icmp slt i1 %cond, false
|
%cmp = icmp slt i1 %cond, false
|
||||||
%sel = select i1 %cmp, i1 %a, i1 %b
|
%sel = select i1 %cmp, i1 %a, i1 %b
|
||||||
store i1 %sel, i1 addrspace(1)* %out, align 4
|
store i1 %sel, i1 addrspace(1)* %out, align 4
|
||||||
|
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue