forked from OSchip/llvm-project
AMDGPU: Fix handling of 16-bit immediates
Since 32-bit instructions with 32-bit input immediate behavior are used to materialize 16-bit constants in 32-bit registers for 16-bit instructions, determining the legality of an immediate based on the register size is incorrect. Change operands to have the size specified in the operand type. Also add a workaround for a disassembler bug that produces an immediate MCOperand for an operand that is supposed to be OPERAND_REGISTER. The assembler appears to accept out-of-bounds immediates and truncate them, but this already seems to be an issue for 32-bit operands. llvm-svn: 289306
This commit is contained in:
parent
86581e496b
commit
4bd7236193
|
@ -215,6 +215,10 @@ public:
|
|||
return isRegKind() || isInlinableImm(type);
|
||||
}
|
||||
|
||||
bool isRegOrImmWithInt16InputMods() const {
|
||||
return isRegOrImmWithInputMods(MVT::i16);
|
||||
}
|
||||
|
||||
bool isRegOrImmWithInt32InputMods() const {
|
||||
return isRegOrImmWithInputMods(MVT::i32);
|
||||
}
|
||||
|
@ -223,6 +227,10 @@ public:
|
|||
return isRegOrImmWithInputMods(MVT::i64);
|
||||
}
|
||||
|
||||
bool isRegOrImmWithFP16InputMods() const {
|
||||
return isRegOrImmWithInputMods(MVT::f16);
|
||||
}
|
||||
|
||||
bool isRegOrImmWithFP32InputMods() const {
|
||||
return isRegOrImmWithInputMods(MVT::f32);
|
||||
}
|
||||
|
@ -282,6 +290,10 @@ public:
|
|||
|
||||
bool isRegClass(unsigned RCID) const;
|
||||
|
||||
bool isSCSrcB16() const {
|
||||
return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i16);
|
||||
}
|
||||
|
||||
bool isSCSrcB32() const {
|
||||
return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i32);
|
||||
}
|
||||
|
@ -290,6 +302,10 @@ public:
|
|||
return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::i64);
|
||||
}
|
||||
|
||||
bool isSCSrcF16() const {
|
||||
return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f16);
|
||||
}
|
||||
|
||||
bool isSCSrcF32() const {
|
||||
return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f32);
|
||||
}
|
||||
|
@ -302,6 +318,10 @@ public:
|
|||
return isSCSrcB32() || isLiteralImm(MVT::i32) || isExpr();
|
||||
}
|
||||
|
||||
bool isSSrcB16() const {
|
||||
return isSCSrcB16() || isLiteralImm(MVT::i16);
|
||||
}
|
||||
|
||||
bool isSSrcB64() const {
|
||||
// TODO: Find out how SALU supports extension of 32-bit literals to 64 bits.
|
||||
// See isVSrc64().
|
||||
|
@ -316,6 +336,10 @@ public:
|
|||
return isSCSrcB64() || isLiteralImm(MVT::f64);
|
||||
}
|
||||
|
||||
bool isSSrcF16() const {
|
||||
return isSCSrcB16() || isLiteralImm(MVT::f16);
|
||||
}
|
||||
|
||||
bool isVCSrcB32() const {
|
||||
return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i32);
|
||||
}
|
||||
|
@ -324,6 +348,10 @@ public:
|
|||
return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::i64);
|
||||
}
|
||||
|
||||
bool isVCSrcB16() const {
|
||||
return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i16);
|
||||
}
|
||||
|
||||
bool isVCSrcF32() const {
|
||||
return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f32);
|
||||
}
|
||||
|
@ -332,6 +360,10 @@ public:
|
|||
return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::f64);
|
||||
}
|
||||
|
||||
bool isVCSrcF16() const {
|
||||
return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f16);
|
||||
}
|
||||
|
||||
bool isVSrcB32() const {
|
||||
return isVCSrcF32() || isLiteralImm(MVT::i32);
|
||||
}
|
||||
|
@ -340,6 +372,10 @@ public:
|
|||
return isVCSrcF64() || isLiteralImm(MVT::i64);
|
||||
}
|
||||
|
||||
bool isVSrcB16() const {
|
||||
return isVCSrcF16() || isLiteralImm(MVT::i16);
|
||||
}
|
||||
|
||||
bool isVSrcF32() const {
|
||||
return isVCSrcF32() || isLiteralImm(MVT::f32);
|
||||
}
|
||||
|
@ -348,10 +384,18 @@ public:
|
|||
return isVCSrcF64() || isLiteralImm(MVT::f64);
|
||||
}
|
||||
|
||||
bool isVSrcF16() const {
|
||||
return isVCSrcF16() || isLiteralImm(MVT::f16);
|
||||
}
|
||||
|
||||
bool isKImmFP32() const {
|
||||
return isLiteralImm(MVT::f32);
|
||||
}
|
||||
|
||||
bool isKImmFP16() const {
|
||||
return isLiteralImm(MVT::f16);
|
||||
}
|
||||
|
||||
/// Override that unconditionally answers "not a memory operand".
bool isMem() const override { return false; }
|
||||
|
@ -439,7 +483,16 @@ public:
|
|||
|
||||
void addLiteralImmOperand(MCInst &Inst, int64_t Val) const;
|
||||
|
||||
void addKImmFP32Operands(MCInst &Inst, unsigned N) const;
|
||||
template <unsigned Bitwidth>
|
||||
void addKImmFPOperands(MCInst &Inst, unsigned N) const;
|
||||
|
||||
/// 16-bit entry point for KImm operands: instantiates addKImmFPOperands with
/// a bit width of 16 and forwards Inst and N unchanged.
void addKImmFP16Operands(MCInst &Inst, unsigned N) const {
  const unsigned HalfBits = 16;
  addKImmFPOperands<HalfBits>(Inst, N);
}
|
||||
|
||||
/// 32-bit entry point for KImm operands: instantiates addKImmFPOperands with
/// a bit width of 32 and forwards Inst and N unchanged.
void addKImmFP32Operands(MCInst &Inst, unsigned N) const {
  const unsigned SingleBits = 32;
  addKImmFPOperands<SingleBits>(Inst, N);
}
|
||||
|
||||
void addRegOperands(MCInst &Inst, unsigned N) const;
|
||||
|
||||
|
@ -826,19 +879,23 @@ struct OptionalOperand {
|
|||
} // end anonymous namespace
|
||||
|
||||
// NOTE(review): this span is a diff rendering in which the pre-change and
// post-change lines are interleaved. Presumably the MVT-keyed switch (cases
// 32 / 64 / 16, i.e. sizes in bits) is the old form and the `unsigned Size`
// overload (cases 4 / 8 / 2, i.e. sizes in bytes) is its replacement.
//
// getFltSemantics(unsigned Size): maps 4 -> IEEEsingle, 8 -> IEEEdouble,
// 2 -> IEEEhalf; any other size hits llvm_unreachable.
// getFltSemantics(MVT VT): forwards VT.getSizeInBits() / 8 to the overload
// above.
// May be called with integer type with equivalent bitwidth.
|
||||
static const fltSemantics *getFltSemantics(MVT VT) {
|
||||
switch (VT.getSizeInBits()) {
|
||||
case 32:
|
||||
static const fltSemantics *getFltSemantics(unsigned Size) {
|
||||
switch (Size) {
|
||||
case 4:
|
||||
return &APFloat::IEEEsingle;
|
||||
case 64:
|
||||
case 8:
|
||||
return &APFloat::IEEEdouble;
|
||||
case 16:
|
||||
case 2:
|
||||
return &APFloat::IEEEhalf;
|
||||
default:
|
||||
llvm_unreachable("unsupported fp type");
|
||||
}
|
||||
}
|
||||
|
||||
static const fltSemantics *getFltSemantics(MVT VT) {
|
||||
return getFltSemantics(VT.getSizeInBits() / 8);
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Operand
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -895,6 +952,12 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
|
|||
AsmParser->hasInv2PiInlineImm());
|
||||
}
|
||||
|
||||
if (type.getScalarSizeInBits() == 16) {
|
||||
return AMDGPU::isInlinableLiteral16(
|
||||
static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()),
|
||||
AsmParser->hasInv2PiInlineImm());
|
||||
}
|
||||
|
||||
return AMDGPU::isInlinableLiteral32(
|
||||
static_cast<int32_t>(Literal.getLoBits(32).getZExtValue()),
|
||||
AsmParser->hasInv2PiInlineImm());
|
||||
|
@ -909,9 +972,13 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
|
|||
if (!Imm.IsFPImm) {
|
||||
// We got int literal token.
|
||||
|
||||
unsigned Size = type.getSizeInBits();
|
||||
if (Size == 64)
|
||||
Size = 32;
|
||||
|
||||
// FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP
|
||||
// types.
|
||||
return isUInt<32>(Imm.Val) || isInt<32>(Imm.Val);
|
||||
return isUIntN(Size, Imm.Val) || isIntN(Size, Imm.Val);
|
||||
}
|
||||
|
||||
// We got fp literal token
|
||||
|
@ -947,7 +1014,8 @@ void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers
|
|||
}
|
||||
}
|
||||
|
||||
if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()), Inst.getNumOperands())) {
|
||||
if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()),
|
||||
Inst.getNumOperands())) {
|
||||
addLiteralImmOperand(Inst, Val);
|
||||
} else {
|
||||
Inst.addOperand(MCOperand::createImm(Val));
|
||||
|
@ -960,69 +1028,112 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
|
|||
// Check that this operand accepts literals
|
||||
assert(AMDGPU::isSISrcOperand(InstDesc, OpNum));
|
||||
|
||||
APInt Literal(64, Val);
|
||||
auto OpSize = AMDGPU::getRegOperandSize(AsmParser->getMRI(), InstDesc, OpNum); // expected operand size
|
||||
auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size
|
||||
|
||||
if (Imm.IsFPImm) { // We got fp literal token
|
||||
if (OpSize == 8) { // Expected 64-bit operand
|
||||
// Check if literal is inlinable
|
||||
APInt Literal(64, Val);
|
||||
|
||||
switch (OpSize) {
|
||||
case 8: {
|
||||
if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
|
||||
AsmParser->hasInv2PiInlineImm())) {
|
||||
Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
|
||||
} else if (AMDGPU::isSISrcFPOperand(InstDesc, OpNum)) { // Expected 64-bit fp operand
|
||||
return;
|
||||
}
|
||||
|
||||
// Non-inlineable
|
||||
if (AMDGPU::isSISrcFPOperand(InstDesc, OpNum)) { // Expected 64-bit fp operand
|
||||
// For fp operands we check if low 32 bits are zeros
|
||||
if (Literal.getLoBits(32) != 0) {
|
||||
const_cast<AMDGPUAsmParser *>(AsmParser)->Warning(Inst.getLoc(),
|
||||
"Can't encode literal as exact 64-bit"
|
||||
" floating-point operand. Low 32-bits will be"
|
||||
" set to zero");
|
||||
"Can't encode literal as exact 64-bit floating-point operand. "
|
||||
"Low 32-bits will be set to zero");
|
||||
}
|
||||
|
||||
Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue()));
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
|
||||
// We don't allow fp literals in 64-bit integer instructions. It is
|
||||
// unclear how we should encode them. This case should be checked earlier
|
||||
// in predicate methods (isLiteralImm())
|
||||
llvm_unreachable("fp literal in 64-bit integer instruction.");
|
||||
}
|
||||
} else { // Expected 32-bit operand
|
||||
case 4:
|
||||
case 2: {
|
||||
bool lost;
|
||||
APFloat FPLiteral(APFloat::IEEEdouble, Literal);
|
||||
// Convert literal to single precision
|
||||
FPLiteral.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &lost);
|
||||
FPLiteral.convert(*getFltSemantics(OpSize),
|
||||
APFloat::rmNearestTiesToEven, &lost);
|
||||
// We allow precision lost but not overflow or underflow. This should be
|
||||
// checked earlier in isLiteralImm()
|
||||
Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
|
||||
}
|
||||
} else { // We got int literal token
|
||||
if (OpSize == 8) { // Expected 64-bit operand
|
||||
auto LiteralVal = Literal.getZExtValue();
|
||||
if (AMDGPU::isInlinableLiteral64(LiteralVal,
|
||||
AsmParser->hasInv2PiInlineImm())) {
|
||||
Inst.addOperand(MCOperand::createImm(LiteralVal));
|
||||
return;
|
||||
}
|
||||
} else { // Expected 32-bit operand
|
||||
auto LiteralVal = static_cast<int32_t>(Literal.getLoBits(32).getZExtValue());
|
||||
if (AMDGPU::isInlinableLiteral32(LiteralVal,
|
||||
AsmParser->hasInv2PiInlineImm())) {
|
||||
Inst.addOperand(MCOperand::createImm(LiteralVal));
|
||||
default:
|
||||
llvm_unreachable("invalid operand size");
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
// We got int literal token.
|
||||
// Only sign extend inline immediates.
|
||||
// FIXME: No errors on truncation
|
||||
switch (OpSize) {
|
||||
case 4: {
|
||||
if (isInt<32>(Val) &&
|
||||
AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
|
||||
AsmParser->hasInv2PiInlineImm())) {
|
||||
Inst.addOperand(MCOperand::createImm(Val));
|
||||
return;
|
||||
}
|
||||
Inst.addOperand(MCOperand::createImm(Literal.getLoBits(32).getZExtValue()));
|
||||
|
||||
Inst.addOperand(MCOperand::createImm(Val & 0xffffffff));
|
||||
return;
|
||||
}
|
||||
case 8: {
|
||||
if (AMDGPU::isInlinableLiteral64(Val,
|
||||
AsmParser->hasInv2PiInlineImm())) {
|
||||
Inst.addOperand(MCOperand::createImm(Val));
|
||||
return;
|
||||
}
|
||||
|
||||
Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
|
||||
return;
|
||||
}
|
||||
case 2: {
|
||||
if (isInt<16>(Val) &&
|
||||
AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
|
||||
AsmParser->hasInv2PiInlineImm())) {
|
||||
Inst.addOperand(MCOperand::createImm(Val));
|
||||
return;
|
||||
}
|
||||
|
||||
Inst.addOperand(MCOperand::createImm(Val & 0xffff));
|
||||
return;
|
||||
}
|
||||
default:
|
||||
llvm_unreachable("invalid operand size");
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(review): this span is a diff rendering in which the pre-change and
// post-change lines are interleaved. Presumably the first signature
// (addKImmFP32Operands) with its if/else body is the old form and the
// template addKImmFPOperands<Bitwidth> is its replacement.
//
// Template form: wraps Imm.Val in a 64-bit APInt. For an integer literal token
// it adds the low Bitwidth bits as an immediate operand and returns. Otherwise
// it treats the bits as an IEEE double, converts to the semantics chosen by
// getFltSemantics(Bitwidth / 8) with round-to-nearest-even, and adds the
// converted bit pattern as the immediate. The `Lost` flag written by convert()
// is not inspected. N is not used in the lines shown.
void AMDGPUOperand::addKImmFP32Operands(MCInst &Inst, unsigned N) const {
|
||||
template <unsigned Bitwidth>
|
||||
void AMDGPUOperand::addKImmFPOperands(MCInst &Inst, unsigned N) const {
|
||||
APInt Literal(64, Imm.Val);
|
||||
if (Imm.IsFPImm) { // We got fp literal
|
||||
bool lost;
|
||||
APFloat FPLiteral(APFloat::IEEEdouble, Literal);
|
||||
FPLiteral.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &lost);
|
||||
Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
|
||||
} else { // We got int literal token
|
||||
Inst.addOperand(MCOperand::createImm(Literal.getLoBits(32).getZExtValue()));
|
||||
|
||||
if (!Imm.IsFPImm) {
|
||||
// We got int literal token.
|
||||
Inst.addOperand(MCOperand::createImm(Literal.getLoBits(Bitwidth).getZExtValue()));
|
||||
return;
|
||||
}
|
||||
|
||||
bool Lost;
|
||||
APFloat FPLiteral(APFloat::IEEEdouble, Literal);
|
||||
FPLiteral.convert(*getFltSemantics(Bitwidth / 8),
|
||||
APFloat::rmNearestTiesToEven, &Lost);
|
||||
Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
|
||||
}
|
||||
|
||||
void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const {
|
||||
|
|
|
@ -88,6 +88,15 @@ DECODE_OPERAND(SReg_128)
|
|||
DECODE_OPERAND(SReg_256)
|
||||
DECODE_OPERAND(SReg_512)
|
||||
|
||||
|
||||
/// Decoder callback for 16-bit VSrc operands. Casts the opaque Decoder
/// pointer back to the AMDGPUDisassembler, lets it decode the raw encoding
/// Imm, and passes the resulting operand to addOperand for Inst. Addr is not
/// used.
static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
                                         unsigned Imm,
                                         uint64_t Addr,
                                         const void *Decoder) {
  const AMDGPUDisassembler *Disassembler =
      static_cast<const AMDGPUDisassembler *>(Decoder);
  return addOperand(Inst, Disassembler->decodeOperand_VSrc16(Imm));
}
|
||||
|
||||
#define GET_SUBTARGETINFO_ENUM
|
||||
#include "AMDGPUGenSubtargetInfo.inc"
|
||||
#undef GET_SUBTARGETINFO_ENUM
|
||||
|
@ -250,6 +259,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const {
|
|||
return decodeSrcOp(OPW64, Val);
|
||||
}
|
||||
|
||||
/// Decodes the source-operand encoding Val by running decodeSrcOp at OPW16
/// width.
MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const {
  const OpWidthTy Width = OPW16;
  return decodeSrcOp(Width, Val);
}
|
||||
|
||||
MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
|
||||
// Some instructions have operand restrictions beyond what the encoding
|
||||
// allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
|
||||
|
@ -324,28 +337,96 @@ MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) {
|
|||
// Cast prevents negative overflow.
|
||||
}
|
||||
|
||||
MCOperand AMDGPUDisassembler::decodeFPImmed(bool Is32, unsigned Imm) {
|
||||
static int64_t getInlineImmVal32(unsigned Imm) {
|
||||
switch (Imm) {
|
||||
case 240:
|
||||
return FloatToBits(0.5f);
|
||||
case 241:
|
||||
return FloatToBits(-0.5f);
|
||||
case 242:
|
||||
return FloatToBits(1.0f);
|
||||
case 243:
|
||||
return FloatToBits(-1.0f);
|
||||
case 244:
|
||||
return FloatToBits(2.0f);
|
||||
case 245:
|
||||
return FloatToBits(-2.0f);
|
||||
case 246:
|
||||
return FloatToBits(4.0f);
|
||||
case 247:
|
||||
return FloatToBits(-4.0f);
|
||||
case 248: // 1 / (2 * PI)
|
||||
return 0x3e22f983;
|
||||
default:
|
||||
llvm_unreachable("invalid fp inline imm");
|
||||
}
|
||||
}
|
||||
|
||||
static int64_t getInlineImmVal64(unsigned Imm) {
|
||||
switch (Imm) {
|
||||
case 240:
|
||||
return DoubleToBits(0.5);
|
||||
case 241:
|
||||
return DoubleToBits(-0.5);
|
||||
case 242:
|
||||
return DoubleToBits(1.0);
|
||||
case 243:
|
||||
return DoubleToBits(-1.0);
|
||||
case 244:
|
||||
return DoubleToBits(2.0);
|
||||
case 245:
|
||||
return DoubleToBits(-2.0);
|
||||
case 246:
|
||||
return DoubleToBits(4.0);
|
||||
case 247:
|
||||
return DoubleToBits(-4.0);
|
||||
case 248: // 1 / (2 * PI)
|
||||
return 0x3fc45f306dc9c882;
|
||||
default:
|
||||
llvm_unreachable("invalid fp inline imm");
|
||||
}
|
||||
}
|
||||
|
||||
static int64_t getInlineImmVal16(unsigned Imm) {
|
||||
switch (Imm) {
|
||||
case 240:
|
||||
return 0x3800;
|
||||
case 241:
|
||||
return 0xB800;
|
||||
case 242:
|
||||
return 0x3C00;
|
||||
case 243:
|
||||
return 0xBC00;
|
||||
case 244:
|
||||
return 0x4000;
|
||||
case 245:
|
||||
return 0xC000;
|
||||
case 246:
|
||||
return 0x4400;
|
||||
case 247:
|
||||
return 0xC400;
|
||||
case 248: // 1 / (2 * PI)
|
||||
return 0x3118;
|
||||
default:
|
||||
llvm_unreachable("invalid fp inline imm");
|
||||
}
|
||||
}
|
||||
|
||||
// NOTE(review): this span is a diff rendering in which the pre-change and
// post-change lines are interleaved. Presumably the `float V` / `switch (Imm)`
// / `Is32` lines are the old body and the `switch (Width)` lines are its
// replacement.
//
// Width-keyed form: asserts that Imm lies in the floating-point inline
// constant range, then returns an immediate operand holding
// getInlineImmVal32/64/16(Imm) for OPW32/OPW64/OPW16. Any other width reaches
// llvm_unreachable.
MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
|
||||
assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN
|
||||
&& Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX);
|
||||
|
||||
// ToDo: case 248: 1/(2*PI) - is allowed only on VI
|
||||
// ToDo: AMDGPUInstPrinter does not support 1/(2*PI). It considers 1/(2*PI) a
|
||||
// literal constant.
|
||||
float V = 0.0f;
|
||||
switch (Imm) {
|
||||
case 240: V = 0.5f; break;
|
||||
case 241: V = -0.5f; break;
|
||||
case 242: V = 1.0f; break;
|
||||
case 243: V = -1.0f; break;
|
||||
case 244: V = 2.0f; break;
|
||||
case 245: V = -2.0f; break;
|
||||
case 246: V = 4.0f; break;
|
||||
case 247: V = -4.0f; break;
|
||||
case 248: return MCOperand::createImm(Is32 ? // 1/(2*PI)
|
||||
0x3e22f983 :
|
||||
0x3fc45f306dc9c882);
|
||||
default: break;
|
||||
switch (Width) {
|
||||
case OPW32:
|
||||
return MCOperand::createImm(getInlineImmVal32(Imm));
|
||||
case OPW64:
|
||||
return MCOperand::createImm(getInlineImmVal64(Imm));
|
||||
case OPW16:
|
||||
return MCOperand::createImm(getInlineImmVal16(Imm));
|
||||
default:
|
||||
llvm_unreachable("implement me");
|
||||
}
|
||||
return MCOperand::createImm(Is32? FloatToBits(V) : DoubleToBits(V));
|
||||
}
|
||||
|
||||
unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
|
||||
|
@ -353,7 +434,9 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
|
|||
assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
|
||||
switch (Width) {
|
||||
default: // fall
|
||||
case OPW32: return VGPR_32RegClassID;
|
||||
case OPW32:
|
||||
case OPW16:
|
||||
return VGPR_32RegClassID;
|
||||
case OPW64: return VReg_64RegClassID;
|
||||
case OPW128: return VReg_128RegClassID;
|
||||
}
|
||||
|
@ -364,7 +447,9 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
|
|||
assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
|
||||
switch (Width) {
|
||||
default: // fall
|
||||
case OPW32: return SGPR_32RegClassID;
|
||||
case OPW32:
|
||||
case OPW16:
|
||||
return SGPR_32RegClassID;
|
||||
case OPW64: return SGPR_64RegClassID;
|
||||
case OPW128: return SGPR_128RegClassID;
|
||||
}
|
||||
|
@ -375,7 +460,9 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
|
|||
assert(OPW_FIRST_ <= Width && Width < OPW_LAST_);
|
||||
switch (Width) {
|
||||
default: // fall
|
||||
case OPW32: return TTMP_32RegClassID;
|
||||
case OPW32:
|
||||
case OPW16:
|
||||
return TTMP_32RegClassID;
|
||||
case OPW64: return TTMP_64RegClassID;
|
||||
case OPW128: return TTMP_128RegClassID;
|
||||
}
|
||||
|
@ -396,19 +483,26 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
|
|||
return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN);
|
||||
}
|
||||
|
||||
assert(Width == OPW32 || Width == OPW64);
|
||||
const bool Is32 = (Width == OPW32);
|
||||
assert(Width == OPW16 || Width == OPW32 || Width == OPW64);
|
||||
|
||||
if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX)
|
||||
return decodeIntImmed(Val);
|
||||
|
||||
if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX)
|
||||
return decodeFPImmed(Is32, Val);
|
||||
return decodeFPImmed(Width, Val);
|
||||
|
||||
if (Val == LITERAL_CONST)
|
||||
return decodeLiteralConstant();
|
||||
|
||||
return Is32 ? decodeSpecialReg32(Val) : decodeSpecialReg64(Val);
|
||||
switch (Width) {
|
||||
case OPW32:
|
||||
case OPW16:
|
||||
return decodeSpecialReg32(Val);
|
||||
case OPW64:
|
||||
return decodeSpecialReg64(Val);
|
||||
default:
|
||||
llvm_unreachable("unexpected immediate type");
|
||||
}
|
||||
}
|
||||
|
||||
MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
|
||||
|
|
|
@ -66,6 +66,7 @@ public:
|
|||
MCOperand decodeOperand_VGPR_32(unsigned Val) const;
|
||||
MCOperand decodeOperand_VS_32(unsigned Val) const;
|
||||
MCOperand decodeOperand_VS_64(unsigned Val) const;
|
||||
MCOperand decodeOperand_VSrc16(unsigned Val) const;
|
||||
|
||||
MCOperand decodeOperand_VReg_64(unsigned Val) const;
|
||||
MCOperand decodeOperand_VReg_96(unsigned Val) const;
|
||||
|
@ -83,6 +84,7 @@ public:
|
|||
OPW32,
|
||||
OPW64,
|
||||
OPW128,
|
||||
OPW16,
|
||||
OPW_LAST_,
|
||||
OPW_FIRST_ = OPW32
|
||||
};
|
||||
|
@ -92,7 +94,7 @@ public:
|
|||
unsigned getTtmpClassId(const OpWidthTy Width) const;
|
||||
|
||||
static MCOperand decodeIntImmed(unsigned Imm);
|
||||
static MCOperand decodeFPImmed(bool Is32, unsigned Imm);
|
||||
static MCOperand decodeFPImmed(OpWidthTy Width, unsigned Imm);
|
||||
MCOperand decodeLiteralConstant() const;
|
||||
|
||||
MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const;
|
||||
|
|
|
@ -47,7 +47,13 @@ void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
|
|||
// NOTE(review): this span is a diff rendering in which the pre-change and
// post-change lines are interleaved. Presumably the single
// `O << formatHex(... & 0xffff);` statement right after the signature is the
// old body and the lines after it are its replacement.
//
// Replacement form: reads operand OpNo as a 64-bit immediate. If the value
// fits in 16 bits (signed or unsigned) its low 16 bits are printed in hex;
// otherwise printing is delegated to printU32ImmOperand.
void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
|
||||
const MCSubtargetInfo &STI,
|
||||
raw_ostream &O) {
|
||||
O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff);
|
||||
// It's possible to end up with a 32-bit literal used with a 16-bit operand
|
||||
// with ignored high bits. Print as 32-bit anyway in that case.
|
||||
int64_t Imm = MI->getOperand(OpNo).getImm();
|
||||
if (isInt<16>(Imm) || isUInt<16>(Imm))
|
||||
O << formatHex(static_cast<uint64_t>(Imm & 0xffff));
|
||||
else
|
||||
printU32ImmOperand(MI, OpNo, STI, O);
|
||||
}
|
||||
|
||||
void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo,
|
||||
|
@ -336,6 +342,38 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
|
|||
printOperand(MI, OpNo, STI, O);
|
||||
}
|
||||
|
||||
void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
|
||||
const MCSubtargetInfo &STI,
|
||||
raw_ostream &O) {
|
||||
int16_t SImm = static_cast<int16_t>(Imm);
|
||||
if (SImm >= -16 && SImm <= 64) {
|
||||
O << SImm;
|
||||
return;
|
||||
}
|
||||
|
||||
if (Imm == 0x3C00)
|
||||
O<< "1.0";
|
||||
else if (Imm == 0xBC00)
|
||||
O<< "-1.0";
|
||||
else if (Imm == 0x3800)
|
||||
O<< "0.5";
|
||||
else if (Imm == 0xB800)
|
||||
O<< "-0.5";
|
||||
else if (Imm == 0x4000)
|
||||
O<< "2.0";
|
||||
else if (Imm == 0xC000)
|
||||
O<< "-2.0";
|
||||
else if (Imm == 0x4400)
|
||||
O<< "4.0";
|
||||
else if (Imm == 0xC400)
|
||||
O<< "-4.0";
|
||||
else if (Imm == 0x3118) {
|
||||
assert(STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]);
|
||||
O << "0.15915494";
|
||||
} else
|
||||
O << formatHex(static_cast<uint64_t>(Imm));
|
||||
}
|
||||
|
||||
void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
|
||||
const MCSubtargetInfo &STI,
|
||||
raw_ostream &O) {
|
||||
|
@ -431,22 +469,39 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
|
|||
}
|
||||
} else if (Op.isImm()) {
|
||||
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
|
||||
int RCID = Desc.OpInfo[OpNo].RegClass;
|
||||
if (RCID != -1) {
|
||||
unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID));
|
||||
if (RCBits == 32)
|
||||
switch (Desc.OpInfo[OpNo].OperandType) {
|
||||
case AMDGPU::OPERAND_REG_IMM_INT32:
|
||||
case AMDGPU::OPERAND_REG_IMM_FP32:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
|
||||
case MCOI::OPERAND_IMMEDIATE:
|
||||
printImmediate32(Op.getImm(), STI, O);
|
||||
else if (RCBits == 64)
|
||||
break;
|
||||
case AMDGPU::OPERAND_REG_IMM_INT64:
|
||||
case AMDGPU::OPERAND_REG_IMM_FP64:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
|
||||
printImmediate64(Op.getImm(), STI, O);
|
||||
else
|
||||
llvm_unreachable("Invalid register class size");
|
||||
} else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) {
|
||||
printImmediate32(Op.getImm(), STI, O);
|
||||
} else {
|
||||
break;
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
|
||||
case AMDGPU::OPERAND_REG_IMM_INT16:
|
||||
case AMDGPU::OPERAND_REG_IMM_FP16:
|
||||
printImmediate16(Op.getImm(), STI, O);
|
||||
break;
|
||||
case MCOI::OPERAND_UNKNOWN:
|
||||
case MCOI::OPERAND_PCREL:
|
||||
O << formatDec(Op.getImm());
|
||||
break;
|
||||
case MCOI::OPERAND_REGISTER:
|
||||
// FIXME: This should be removed and handled somewhere else. Seems to come
|
||||
// from a disassembler bug.
|
||||
O << "/*invalid immediate*/";
|
||||
break;
|
||||
default:
|
||||
// We hit this for the immediate instruction bits that don't yet have a
|
||||
// custom printer.
|
||||
// TODO: Eventually this should be unnecessary.
|
||||
O << formatDec(Op.getImm());
|
||||
llvm_unreachable("unexpected immediate operand type");
|
||||
}
|
||||
} else if (Op.isFPImm()) {
|
||||
// We special case 0.0 because otherwise it will be printed as an integer.
|
||||
|
|
|
@ -88,6 +88,8 @@ private:
|
|||
void printRegOperand(unsigned RegNo, raw_ostream &O);
|
||||
void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
|
||||
raw_ostream &O);
|
||||
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
|
||||
raw_ostream &O);
|
||||
void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
|
||||
raw_ostream &O);
|
||||
void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
|
||||
|
|
|
@ -39,7 +39,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
|
|||
const MCRegisterInfo &MRI;
|
||||
|
||||
/// \brief Encode an fp or int literal
|
||||
uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize,
|
||||
uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo,
|
||||
const MCSubtargetInfo &STI) const;
|
||||
|
||||
public:
|
||||
|
@ -87,6 +87,42 @@ static uint32_t getIntInlineImmEncoding(IntTy Imm) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) {
|
||||
uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val));
|
||||
if (IntImm != 0)
|
||||
return IntImm;
|
||||
|
||||
if (Val == 0x3800) // 0.5
|
||||
return 240;
|
||||
|
||||
if (Val == 0xB800) // -0.5
|
||||
return 241;
|
||||
|
||||
if (Val == 0x3C00) // 1.0
|
||||
return 242;
|
||||
|
||||
if (Val == 0xBC00) // -1.0
|
||||
return 243;
|
||||
|
||||
if (Val == 0x4000) // 2.0
|
||||
return 244;
|
||||
|
||||
if (Val == 0xC000) // -2.0
|
||||
return 245;
|
||||
|
||||
if (Val == 0x4400) // 4.0
|
||||
return 246;
|
||||
|
||||
if (Val == 0xC400) // -4.0
|
||||
return 247;
|
||||
|
||||
if (Val == 0x3118 && // 1.0 / (2.0 * pi)
|
||||
STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm])
|
||||
return 248;
|
||||
|
||||
return 255;
|
||||
}
|
||||
|
||||
static uint32_t getLit32Encoding(uint32_t Val, const MCSubtargetInfo &STI) {
|
||||
uint32_t IntImm = getIntInlineImmEncoding(static_cast<int32_t>(Val));
|
||||
if (IntImm != 0)
|
||||
|
@ -160,7 +196,7 @@ static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI) {
|
|||
}
|
||||
|
||||
uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
|
||||
unsigned OpSize,
|
||||
const MCOperandInfo &OpInfo,
|
||||
const MCSubtargetInfo &STI) const {
|
||||
|
||||
int64_t Imm;
|
||||
|
@ -180,12 +216,16 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
|
|||
Imm = MO.getImm();
|
||||
}
|
||||
|
||||
if (OpSize == 4)
|
||||
switch (AMDGPU::getOperandSize(OpInfo)) {
|
||||
case 4:
|
||||
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
|
||||
|
||||
assert(OpSize == 8);
|
||||
|
||||
case 8:
|
||||
return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
|
||||
case 2:
|
||||
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
|
||||
default:
|
||||
llvm_unreachable("invalid operand size");
|
||||
}
|
||||
}
|
||||
|
||||
void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
|
||||
|
@ -212,12 +252,9 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
|
|||
if (!AMDGPU::isSISrcOperand(Desc, i))
|
||||
continue;
|
||||
|
||||
int RCID = Desc.OpInfo[i].RegClass;
|
||||
const MCRegisterClass &RC = MRI.getRegClass(RCID);
|
||||
|
||||
// Is this operand a literal immediate?
|
||||
const MCOperand &Op = MI.getOperand(i);
|
||||
if (getLitEncoding(Op, AMDGPU::getRegBitWidth(RC) / 8, STI) != 255)
|
||||
if (getLitEncoding(Op, Desc.OpInfo[i], STI) != 255)
|
||||
continue;
|
||||
|
||||
// Yes! Encode it
|
||||
|
@ -282,9 +319,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
|
|||
|
||||
const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
|
||||
if (AMDGPU::isSISrcOperand(Desc, OpNo)) {
|
||||
uint32_t Enc = getLitEncoding(MO,
|
||||
AMDGPU::getRegOperandSize(&MRI, Desc, OpNo),
|
||||
STI);
|
||||
uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI);
|
||||
if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4))
|
||||
return Enc;
|
||||
|
||||
|
|
|
@ -88,17 +88,36 @@ enum ClassFlags {
|
|||
// NOTE(review): this span is a diff rendering in which the pre-change and
// post-change lines are interleaved. Presumably OPERAND_REG_IMM32_INT/_FP,
// OPERAND_REG_INLINE_C_INT/_FP and the comma-less OPERAND_KIMM32 are the old
// enumerators, and the width-suffixed names are their replacements.
namespace AMDGPU {
|
||||
enum OperandType {
|
||||
/// Operands with register or immediate; in the width-suffixed names the
|
||||
/// suffix gives the int/fp kind and the 16-, 32- or 64-bit size.
|
||||
OPERAND_REG_IMM32_INT = MCOI::OPERAND_FIRST_TARGET,
|
||||
OPERAND_REG_IMM32_FP,
|
||||
OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET,
|
||||
OPERAND_REG_IMM_INT64,
|
||||
OPERAND_REG_IMM_INT16,
|
||||
OPERAND_REG_IMM_FP32,
|
||||
OPERAND_REG_IMM_FP64,
|
||||
OPERAND_REG_IMM_FP16,
|
||||
|
||||
/// Operands with register or inline constant
|
||||
OPERAND_REG_INLINE_C_INT,
|
||||
OPERAND_REG_INLINE_C_FP,
|
||||
OPERAND_REG_INLINE_C_INT16,
|
||||
OPERAND_REG_INLINE_C_INT32,
|
||||
OPERAND_REG_INLINE_C_INT64,
|
||||
OPERAND_REG_INLINE_C_FP16,
|
||||
OPERAND_REG_INLINE_C_FP32,
|
||||
OPERAND_REG_INLINE_C_FP64,
|
||||
|
||||
// Range markers: aliases for the first and last enumerator of the
// register-or-immediate group.
OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
|
||||
OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16,
|
||||
|
||||
// Range markers for the register-or-inline-constant group.
OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
|
||||
OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64,
|
||||
|
||||
// Range markers spanning both groups above.
OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
|
||||
OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
|
||||
|
||||
// Operand for source modifiers for VOP instructions
|
||||
OPERAND_INPUT_MODS,
|
||||
|
||||
/// Operand with 32-bit immediate that uses the constant bus.
|
||||
/// (OPERAND_KIMM16 is presumably the 16-bit counterpart -- TODO confirm.)
|
||||
OPERAND_KIMM32
|
||||
OPERAND_KIMM32,
|
||||
OPERAND_KIMM16
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
@ -315,12 +315,14 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
|
|||
return;
|
||||
}
|
||||
|
||||
APInt Imm(64, OpToFold.getImm());
|
||||
|
||||
const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc();
|
||||
const TargetRegisterClass *FoldRC =
|
||||
TRI.getRegClass(FoldDesc.OpInfo[0].RegClass);
|
||||
|
||||
APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
|
||||
OpToFold.getImm());
|
||||
|
||||
// Split 64-bit constants into 32-bits for folding.
|
||||
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
|
||||
unsigned UseReg = UseOp.getReg();
|
||||
|
@ -329,6 +331,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI,
|
|||
MRI.getRegClass(UseReg) :
|
||||
TRI.getPhysRegClass(UseReg);
|
||||
|
||||
assert(Imm.getBitWidth() == 64);
|
||||
|
||||
if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
|
||||
return;
|
||||
|
||||
|
@ -505,7 +509,6 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
|
|||
if (!isSafeToFold(MI))
|
||||
continue;
|
||||
|
||||
unsigned OpSize = TII->getOpSize(MI, 1);
|
||||
MachineOperand &OpToFold = MI.getOperand(1);
|
||||
bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
|
||||
|
||||
|
@ -559,14 +562,15 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
|
|||
Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end();
|
||||
Use != E; ++Use) {
|
||||
MachineInstr *UseMI = Use->getParent();
|
||||
unsigned OpNo = Use.getOperandNo();
|
||||
|
||||
if (TII->isInlineConstant(OpToFold, OpSize)) {
|
||||
foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList,
|
||||
if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) {
|
||||
foldOperand(OpToFold, UseMI, OpNo, FoldList,
|
||||
CopiesToReplace, TII, TRI, MRI);
|
||||
} else {
|
||||
if (++NumLiteralUses == 1) {
|
||||
NonInlineUse = &*Use;
|
||||
NonInlineUseOpNo = Use.getOperandNo();
|
||||
NonInlineUseOpNo = OpNo;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1415,10 +1415,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
|
|||
// If this is a free constant, there's no reason to do this.
|
||||
// TODO: We could fold this here instead of letting SIFoldOperands do it
|
||||
// later.
|
||||
if (isInlineConstant(ImmOp, 4))
|
||||
MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
|
||||
|
||||
// Any src operand can be used for the legality check.
|
||||
if (isInlineConstant(UseMI, *Src0, ImmOp))
|
||||
return false;
|
||||
|
||||
MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0);
|
||||
MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
|
||||
MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
|
||||
|
||||
|
@ -1620,8 +1622,10 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
|
|||
case AMDGPU::V_MAC_F16_e32:
|
||||
IsF16 = true;
|
||||
case AMDGPU::V_MAC_F32_e32: {
|
||||
const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
|
||||
if (Src0->isImm() && !isInlineConstant(*Src0, 4))
|
||||
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
|
||||
AMDGPU::OpName::src0);
|
||||
const MachineOperand *Src0 = &MI.getOperand(Src0Idx);
|
||||
if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0))
|
||||
return nullptr;
|
||||
break;
|
||||
}
|
||||
|
@ -1682,46 +1686,55 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
|
|||
case 64:
|
||||
return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
|
||||
ST.hasInv2PiInlineImm());
|
||||
case 16:
|
||||
return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
|
||||
ST.hasInv2PiInlineImm());
|
||||
default:
|
||||
llvm_unreachable("invalid bitwidth");
|
||||
}
|
||||
}
|
||||
|
||||
bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
|
||||
unsigned OpSize) const {
|
||||
if (MO.isImm()) {
|
||||
// MachineOperand provides no way to tell the true operand size, since it
|
||||
// only records a 64-bit value. We need to know the size to determine if a
|
||||
// 32-bit floating point immediate bit pattern is legal for an integer
|
||||
// immediate. It would be for any 32-bit integer operand, but would not be
|
||||
// for a 64-bit one.
|
||||
switch (OpSize) {
|
||||
case 4:
|
||||
return AMDGPU::isInlinableLiteral32(static_cast<int32_t>(MO.getImm()),
|
||||
ST.hasInv2PiInlineImm());
|
||||
case 8:
|
||||
uint8_t OperandType) const {
|
||||
if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET)
|
||||
return false;
|
||||
|
||||
// MachineOperand provides no way to tell the true operand size, since it only
|
||||
// records a 64-bit value. We need to know the size to determine if a 32-bit
|
||||
// floating point immediate bit pattern is legal for an integer immediate. It
|
||||
// would be for any 32-bit integer operand, but would not be for a 64-bit one.
|
||||
|
||||
int64_t Imm = MO.getImm();
|
||||
switch (operandBitWidth(OperandType)) {
|
||||
case 32: {
|
||||
int32_t Trunc = static_cast<int32_t>(Imm);
|
||||
return Trunc == Imm &&
|
||||
AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
|
||||
}
|
||||
case 64: {
|
||||
return AMDGPU::isInlinableLiteral64(MO.getImm(),
|
||||
ST.hasInv2PiInlineImm());
|
||||
default:
|
||||
llvm_unreachable("invalid bitwidth");
|
||||
}
|
||||
case 16: {
|
||||
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
|
||||
int16_t Trunc = static_cast<int16_t>(Imm);
|
||||
return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO,
|
||||
unsigned OpSize) const {
|
||||
return MO.isImm() && !isInlineConstant(MO, OpSize);
|
||||
}
|
||||
default:
|
||||
llvm_unreachable("invalid bitwidth");
|
||||
}
|
||||
}
|
||||
|
||||
bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO,
|
||||
unsigned OpSize) const {
|
||||
const MCOperandInfo &OpInfo) const {
|
||||
switch (MO.getType()) {
|
||||
case MachineOperand::MO_Register:
|
||||
return false;
|
||||
case MachineOperand::MO_Immediate:
|
||||
return !isInlineConstant(MO, OpSize);
|
||||
return !isInlineConstant(MO, OpInfo);
|
||||
case MachineOperand::MO_FrameIndex:
|
||||
case MachineOperand::MO_MachineBasicBlock:
|
||||
case MachineOperand::MO_ExternalSymbol:
|
||||
|
@ -1760,11 +1773,10 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
|
|||
if (OpInfo.RegClass < 0)
|
||||
return false;
|
||||
|
||||
unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize();
|
||||
if (isLiteralConstant(MO, OpSize))
|
||||
return RI.opCanUseLiteralConstant(OpInfo.OperandType);
|
||||
|
||||
if (MO.isImm() && isInlineConstant(MO, OpInfo))
|
||||
return RI.opCanUseInlineConstant(OpInfo.OperandType);
|
||||
|
||||
return RI.opCanUseLiteralConstant(OpInfo.OperandType);
|
||||
}
|
||||
|
||||
bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const {
|
||||
|
@ -1791,12 +1803,17 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
|
|||
|
||||
bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
|
||||
const MachineOperand &MO,
|
||||
unsigned OpSize) const {
|
||||
const MCOperandInfo &OpInfo) const {
|
||||
// Literal constants use the constant bus.
|
||||
if (isLiteralConstant(MO, OpSize))
|
||||
return true;
|
||||
//if (isLiteralConstantLike(MO, OpInfo))
|
||||
// return true;
|
||||
if (MO.isImm())
|
||||
return !isInlineConstant(MO, OpInfo);
|
||||
|
||||
if (!MO.isReg() || !MO.isUse())
|
||||
if (!MO.isReg())
|
||||
return true; // Misc other operands like FrameIndex
|
||||
|
||||
if (!MO.isUse())
|
||||
return false;
|
||||
|
||||
if (TargetRegisterInfo::isVirtualRegister(MO.getReg()))
|
||||
|
@ -1925,17 +1942,22 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
|
|||
return false;
|
||||
}
|
||||
break;
|
||||
case AMDGPU::OPERAND_REG_IMM32_INT:
|
||||
case AMDGPU::OPERAND_REG_IMM32_FP:
|
||||
case AMDGPU::OPERAND_REG_IMM_INT32:
|
||||
case AMDGPU::OPERAND_REG_IMM_FP32:
|
||||
break;
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP:
|
||||
if (isLiteralConstant(MI.getOperand(i),
|
||||
RI.getRegClass(RegClass)->getSize())) {
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
|
||||
const MachineOperand &MO = MI.getOperand(i);
|
||||
if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) {
|
||||
ErrInfo = "Illegal immediate value for operand.";
|
||||
return false;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case MCOI::OPERAND_IMMEDIATE:
|
||||
case AMDGPU::OPERAND_KIMM32:
|
||||
// Check if this operand is an immediate.
|
||||
|
@ -1987,7 +2009,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
|
|||
if (OpIdx == -1)
|
||||
break;
|
||||
const MachineOperand &MO = MI.getOperand(OpIdx);
|
||||
if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) {
|
||||
if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
|
||||
if (MO.isReg()) {
|
||||
if (MO.getReg() != SGPRUsed)
|
||||
++ConstantBusCount;
|
||||
|
@ -2330,7 +2352,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
|
|||
if (!MO)
|
||||
MO = &MI.getOperand(OpIdx);
|
||||
|
||||
if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) {
|
||||
if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) {
|
||||
|
||||
RegSubRegPair SGPRUsed;
|
||||
if (MO->isReg())
|
||||
|
@ -2342,7 +2364,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
|
|||
const MachineOperand &Op = MI.getOperand(i);
|
||||
if (Op.isReg()) {
|
||||
if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) &&
|
||||
usesConstantBus(MRI, Op, getOpSize(MI, i))) {
|
||||
usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) {
|
||||
return false;
|
||||
}
|
||||
} else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) {
|
||||
|
@ -3539,14 +3561,14 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
|
|||
if (Src0Idx == -1)
|
||||
return 4; // No operands.
|
||||
|
||||
if (isLiteralConstantLike(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx)))
|
||||
if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx]))
|
||||
return 8;
|
||||
|
||||
int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1);
|
||||
if (Src1Idx == -1)
|
||||
return 4;
|
||||
|
||||
if (isLiteralConstantLike(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx)))
|
||||
if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx]))
|
||||
return 8;
|
||||
|
||||
return 4;
|
||||
|
|
|
@ -462,15 +462,96 @@ public:
|
|||
return !RI.isSGPRReg(MRI, Dest);
|
||||
}
|
||||
|
||||
static int operandBitWidth(uint8_t OperandType) {
|
||||
switch (OperandType) {
|
||||
case AMDGPU::OPERAND_REG_IMM_INT32:
|
||||
case AMDGPU::OPERAND_REG_IMM_FP32:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
|
||||
return 32;
|
||||
case AMDGPU::OPERAND_REG_IMM_INT64:
|
||||
case AMDGPU::OPERAND_REG_IMM_FP64:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
|
||||
return 64;
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
|
||||
case AMDGPU::OPERAND_REG_IMM_INT16:
|
||||
case AMDGPU::OPERAND_REG_IMM_FP16:
|
||||
return 16;
|
||||
default:
|
||||
llvm_unreachable("unexpected operand type");
|
||||
}
|
||||
}
|
||||
|
||||
bool isInlineConstant(const APInt &Imm) const;
|
||||
bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const;
|
||||
bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const;
|
||||
|
||||
bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
|
||||
|
||||
bool isInlineConstant(const MachineOperand &MO,
|
||||
const MCOperandInfo &OpInfo) const {
|
||||
return isInlineConstant(MO, OpInfo.OperandType);
|
||||
}
|
||||
|
||||
/// \p returns true if \p UseMO is substituted with \p DefMO in \p MI it would
|
||||
/// be an inline immediate.
|
||||
bool isInlineConstant(const MachineInstr &MI,
|
||||
const MachineOperand &UseMO,
|
||||
const MachineOperand &DefMO) const {
|
||||
assert(UseMO.getParent() == &MI);
|
||||
int OpIdx = MI.getOperandNo(&UseMO);
|
||||
if (!MI.getDesc().OpInfo || OpIdx > MI.getDesc().NumOperands) {
|
||||
return false;
|
||||
}
|
||||
|
||||
return isInlineConstant(DefMO, MI.getDesc().OpInfo[OpIdx]);
|
||||
}
|
||||
|
||||
/// \p returns true if the operand \p OpIdx in \p MI is a valid inline
|
||||
/// immediate.
|
||||
bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx) const {
|
||||
const MachineOperand &MO = MI.getOperand(OpIdx);
|
||||
return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType);
|
||||
}
|
||||
|
||||
bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx,
|
||||
const MachineOperand &MO) const {
|
||||
if (!MI.getDesc().OpInfo || OpIdx > MI.getDesc().NumOperands)
|
||||
return false;
|
||||
|
||||
if (MI.isCopy()) {
|
||||
unsigned Size = getOpSize(MI, OpIdx);
|
||||
assert(Size == 8 || Size == 4);
|
||||
|
||||
uint8_t OpType = (Size == 8) ?
|
||||
AMDGPU::OPERAND_REG_IMM_INT64 : AMDGPU::OPERAND_REG_IMM_INT32;
|
||||
return isInlineConstant(MO, OpType);
|
||||
}
|
||||
|
||||
return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType);
|
||||
}
|
||||
|
||||
bool isInlineConstant(const MachineOperand &MO) const {
|
||||
const MachineInstr *Parent = MO.getParent();
|
||||
return isInlineConstant(*Parent, Parent->getOperandNo(&MO));
|
||||
}
|
||||
|
||||
bool isLiteralConstant(const MachineOperand &MO,
|
||||
const MCOperandInfo &OpInfo) const {
|
||||
return MO.isImm() && !isInlineConstant(MO, OpInfo.OperandType);
|
||||
}
|
||||
|
||||
bool isLiteralConstant(const MachineInstr &MI, int OpIdx) const {
|
||||
const MachineOperand &MO = MI.getOperand(OpIdx);
|
||||
return MO.isImm() && !isInlineConstant(MI, OpIdx);
|
||||
}
|
||||
|
||||
// Returns true if this operand could potentially require a 32-bit literal
|
||||
// operand, but not necessarily. A FrameIndex for example could resolve to an
|
||||
// inline immediate value that will not require an additional 4-bytes; this
|
||||
// assumes that it will.
|
||||
bool isLiteralConstantLike(const MachineOperand &MO, unsigned OpSize) const;
|
||||
bool isLiteralConstantLike(const MachineOperand &MO,
|
||||
const MCOperandInfo &OpInfo) const;
|
||||
|
||||
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
|
||||
const MachineOperand &MO) const;
|
||||
|
@ -482,7 +563,7 @@ public:
|
|||
/// \brief Returns true if this operand uses the constant bus.
|
||||
bool usesConstantBus(const MachineRegisterInfo &MRI,
|
||||
const MachineOperand &MO,
|
||||
unsigned OpSize) const;
|
||||
const MCOperandInfo &OpInfo) const;
|
||||
|
||||
/// \brief Return true if this instruction has any modifiers.
|
||||
/// e.g. src[012]_mod, omod, clamp.
|
||||
|
|
|
@ -445,21 +445,29 @@ def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
|
|||
|
||||
} // End OperandType = "OPERAND_IMMEDIATE"
|
||||
|
||||
// Assembler match class for a KImm operand, parameterized by the immediate
// size. The class name, predicate method and render method names are all
// formed by concatenating `size` into the "KImmFP<size>" pattern; parsing
// always goes through "parseImm".
class KImmMatchClass<int size> : AsmOperandClass {
|
||||
let Name = "KImmFP"#size;
|
||||
let PredicateMethod = "isKImmFP"#size;
|
||||
let ParserMethod = "parseImm";
|
||||
let RenderMethod = "addKImmFP"#size#"Operands";
|
||||
}
|
||||
|
||||
// KImm operand of value type `vt` in the AMDGPU operand namespace. The
// operand type ("OPERAND_KIMM<size>"), print method ("printU<size>ImmOperand")
// and parser match class ("KImmFP<size>MatchClass") are all selected by
// concatenating vt.Size into the respective name, so a def with that match
// class name must exist for every size this class is instantiated with.
class kimmOperand<ValueType vt> : Operand<vt> {
|
||||
let OperandNamespace = "AMDGPU";
|
||||
let OperandType = "OPERAND_KIMM"#vt.Size;
|
||||
let PrintMethod = "printU"#vt.Size#"ImmOperand";
|
||||
let ParserMatchClass = !cast<AsmOperandClass>("KImmFP"#vt.Size#"MatchClass");
|
||||
}
|
||||
|
||||
// 32-bit VALU immediate operand that uses the constant bus.
|
||||
def KImmFP32MatchClass : AsmOperandClass {
|
||||
let Name = "KImmFP32";
|
||||
let PredicateMethod = "isKImmFP32";
|
||||
let ParserMethod = "parseImm";
|
||||
let RenderMethod = "addKImmFP32Operands";
|
||||
}
|
||||
def KImmFP32MatchClass : KImmMatchClass<32>;
|
||||
def f32kimm : kimmOperand<i32>;
|
||||
|
||||
// 32-bit VALU immediate operand with a 16-bit value that uses the
|
||||
// constant bus.
|
||||
def KImmFP16MatchClass : KImmMatchClass<16>;
|
||||
def f16kimm : kimmOperand<i16>;
|
||||
|
||||
def f32kimm : Operand<i32> {
|
||||
let OperandNamespace = "AMDGPU";
|
||||
let OperandType = "OPERAND_KIMM32";
|
||||
let PrintMethod = "printU32ImmOperand";
|
||||
let ParserMatchClass = KImmFP32MatchClass;
|
||||
}
|
||||
|
||||
def VOPDstS64 : VOPDstOperand <SReg_64>;
|
||||
|
||||
|
@ -468,6 +476,7 @@ class FPInputModsMatchClass <int opSize> : AsmOperandClass {
|
|||
let ParserMethod = "parseRegOrImmWithFPInputMods";
|
||||
let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods";
|
||||
}
|
||||
def FP16InputModsMatchClass : FPInputModsMatchClass<16>;
|
||||
def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
|
||||
def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
|
||||
|
||||
|
@ -480,6 +489,8 @@ class InputMods <AsmOperandClass matchClass> : Operand <i32> {
|
|||
class FPInputMods <FPInputModsMatchClass matchClass> : InputMods <matchClass> {
|
||||
let PrintMethod = "printOperandAndFPInputMods";
|
||||
}
|
||||
|
||||
def FP16InputMods : FPInputMods<FP16InputModsMatchClass>;
|
||||
def FP32InputMods : FPInputMods<FP32InputModsMatchClass>;
|
||||
def FP64InputMods : FPInputMods<FP64InputModsMatchClass>;
|
||||
|
||||
|
@ -629,8 +640,8 @@ class getVOPSrc0ForVT<ValueType VT> {
|
|||
!if(!eq(VT.Value, f64.Value), 1,
|
||||
0)));
|
||||
RegisterOperand ret = !if(isFP,
|
||||
!if(!eq(VT.Size, 64), VSrc_f64, VSrc_f32),
|
||||
!if(!eq(VT.Size, 64), VSrc_b64, VSrc_b32));
|
||||
!if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)),
|
||||
!if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32)));
|
||||
}
|
||||
|
||||
// Returns the vreg register class to use for source operand given VT
|
||||
|
@ -657,8 +668,9 @@ class getVOP3SrcForVT<ValueType VT> {
|
|||
!if(!eq(VT.Value, i1.Value),
|
||||
SCSrc_b64,
|
||||
!if(isFP,
|
||||
VCSrc_f32,
|
||||
VCSrc_b32)
|
||||
!if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32),
|
||||
!if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32)
|
||||
)
|
||||
)
|
||||
)
|
||||
);
|
||||
|
@ -691,7 +703,13 @@ class getSrcMod <ValueType VT> {
|
|||
0)));
|
||||
Operand ret = !if(!eq(VT.Size, 64),
|
||||
!if(isFP, FP64InputMods, Int64InputMods),
|
||||
!if(isFP, FP32InputMods, Int32InputMods));
|
||||
!if(isFP,
|
||||
!if(!eq(VT.Value, f16.Value),
|
||||
FP16InputMods,
|
||||
FP32InputMods
|
||||
),
|
||||
Int32InputMods)
|
||||
);
|
||||
}
|
||||
|
||||
// Returns the input arguments for VOP[12C] instructions for the given SrcVT.
|
||||
|
|
|
@ -107,9 +107,8 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
|
|||
|
||||
// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
|
||||
// pass to enable folding of inline immediates.
|
||||
def V_MOV_B64_PSEUDO : PseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)> {
|
||||
let VALU = 1;
|
||||
}
|
||||
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
|
||||
(ins VSrc_b64:$src0)>;
|
||||
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
|
||||
|
||||
let usesCustomInserter = 1, SALU = 1 in {
|
||||
|
|
|
@ -1085,19 +1085,6 @@ bool SIRegisterInfo::shouldRewriteCopySrc(
|
|||
return getCommonSubClass(DefRC, SrcRC) != nullptr;
|
||||
}
|
||||
|
||||
bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const {
|
||||
return OpType == AMDGPU::OPERAND_REG_IMM32_INT ||
|
||||
OpType == AMDGPU::OPERAND_REG_IMM32_FP;
|
||||
}
|
||||
|
||||
bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
|
||||
if (opCanUseLiteralConstant(OpType))
|
||||
return true;
|
||||
|
||||
return OpType == AMDGPU::OPERAND_REG_INLINE_C_INT ||
|
||||
OpType == AMDGPU::OPERAND_REG_INLINE_C_FP;
|
||||
}
|
||||
|
||||
// FIXME: Most of these are flexible with HSA and we don't need to reserve them
|
||||
// as input registers if unused. Whether the dispatch ptr is necessary should be
|
||||
// easy to detect from used intrinsics. Scratch setup is harder to know.
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
|
||||
|
||||
#include "AMDGPURegisterInfo.h"
|
||||
#include "SIDefines.h"
|
||||
#include "llvm/CodeGen/MachineRegisterInfo.h"
|
||||
|
||||
namespace llvm {
|
||||
|
@ -138,12 +139,19 @@ public:
|
|||
|
||||
/// \returns True if operands defined with this operand type can accept
|
||||
/// a literal constant (i.e. any 32-bit immediate).
|
||||
bool opCanUseLiteralConstant(unsigned OpType) const;
|
||||
bool opCanUseLiteralConstant(unsigned OpType) const {
|
||||
// TODO: 64-bit operands have extending behavior from 32-bit literal.
|
||||
return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST &&
|
||||
OpType <= AMDGPU::OPERAND_REG_IMM_LAST;
|
||||
}
|
||||
|
||||
/// \returns True if operands defined with this operand type can accept
|
||||
/// an inline constant. i.e. An integer value in the range (-16, 64) or
|
||||
/// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
|
||||
bool opCanUseInlineConstant(unsigned OpType) const;
|
||||
bool opCanUseInlineConstant(unsigned OpType) const {
|
||||
return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
|
||||
OpType <= AMDGPU::OPERAND_SRC_LAST;
|
||||
}
|
||||
|
||||
enum PreloadedValue {
|
||||
// SGPRS:
|
||||
|
|
|
@ -384,31 +384,43 @@ class RegImmMatcher<string name> : AsmOperandClass {
|
|||
|
||||
multiclass SIRegOperand <string rc, string MatchName, string opType> {
|
||||
let OperandNamespace = "AMDGPU" in {
|
||||
def _b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
|
||||
let OperandType = opType#"_INT16";
|
||||
let ParserMatchClass = RegImmMatcher<MatchName#"B16">;
|
||||
let DecoderMethod = "decodeOperand_VSrc16";
|
||||
}
|
||||
|
||||
def _f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
|
||||
let OperandType = opType#"_FP16";
|
||||
let ParserMatchClass = RegImmMatcher<MatchName#"F16">;
|
||||
let DecoderMethod = "decodeOperand_VSrc16";
|
||||
}
|
||||
|
||||
def _b32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
|
||||
let OperandType = opType#"_INT";
|
||||
let OperandType = opType#"_INT32";
|
||||
let ParserMatchClass = RegImmMatcher<MatchName#"B32">;
|
||||
}
|
||||
|
||||
def _f32 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
|
||||
let OperandType = opType#"_FP";
|
||||
let OperandType = opType#"_FP32";
|
||||
let ParserMatchClass = RegImmMatcher<MatchName#"F32">;
|
||||
}
|
||||
|
||||
def _b64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
|
||||
let OperandType = opType#"_INT";
|
||||
let OperandType = opType#"_INT64";
|
||||
let ParserMatchClass = RegImmMatcher<MatchName#"B64">;
|
||||
}
|
||||
|
||||
def _f64 : RegisterOperand<!cast<RegisterClass>(rc#"_64")> {
|
||||
let OperandType = opType#"_FP";
|
||||
let OperandType = opType#"_FP64";
|
||||
let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: 64-bit sources can sometimes use 32-bit constants.
|
||||
multiclass RegImmOperand <string rc, string MatchName>
|
||||
: SIRegOperand<rc, MatchName, "OPERAND_REG_IMM32">;
|
||||
: SIRegOperand<rc, MatchName, "OPERAND_REG_IMM">;
|
||||
|
||||
multiclass RegInlineOperand <string rc, string MatchName>
|
||||
: SIRegOperand<rc, MatchName, "OPERAND_REG_INLINE_C">;
|
||||
|
|
|
@ -134,15 +134,14 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII,
|
|||
assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI));
|
||||
|
||||
int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
|
||||
MachineOperand &Src0 = MI.getOperand(Src0Idx);
|
||||
|
||||
// Only one literal constant is allowed per instruction, so if src0 is a
|
||||
// literal constant then we can't do any folding.
|
||||
if (Src0.isImm() &&
|
||||
TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx)))
|
||||
if (TII->isLiteralConstant(MI, Src0Idx))
|
||||
return;
|
||||
|
||||
// Try to fold Src0
|
||||
MachineOperand &Src0 = MI.getOperand(Src0Idx);
|
||||
if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) {
|
||||
unsigned Reg = Src0.getReg();
|
||||
MachineInstr *Def = MRI.getUniqueVRegDef(Reg);
|
||||
|
@ -184,11 +183,15 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
|
|||
}
|
||||
|
||||
static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
|
||||
return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
|
||||
return isInt<16>(Src.getImm()) &&
|
||||
!TII->isInlineConstant(*Src.getParent(),
|
||||
Src.getParent()->getOperandNo(&Src));
|
||||
}
|
||||
|
||||
static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) {
|
||||
return isUInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4);
|
||||
return isUInt<16>(Src.getImm()) &&
|
||||
!TII->isInlineConstant(*Src.getParent(),
|
||||
Src.getParent()->getOperandNo(&Src));
|
||||
}
|
||||
|
||||
static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
|
||||
|
@ -196,12 +199,12 @@ static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
|
|||
bool &IsUnsigned) {
|
||||
if (isInt<16>(Src.getImm())) {
|
||||
IsUnsigned = false;
|
||||
return !TII->isInlineConstant(Src, 4);
|
||||
return !TII->isInlineConstant(Src);
|
||||
}
|
||||
|
||||
if (isUInt<16>(Src.getImm())) {
|
||||
IsUnsigned = true;
|
||||
return !TII->isInlineConstant(Src, 4);
|
||||
return !TII->isInlineConstant(Src);
|
||||
}
|
||||
|
||||
return false;
|
||||
|
@ -212,7 +215,7 @@ static bool isKImmOrKUImmOperand(const SIInstrInfo *TII,
|
|||
static bool isReverseInlineImm(const SIInstrInfo *TII,
|
||||
const MachineOperand &Src,
|
||||
int32_t &ReverseImm) {
|
||||
if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src, 4))
|
||||
if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src))
|
||||
return false;
|
||||
|
||||
ReverseImm = reverseBits<int32_t>(static_cast<int32_t>(Src.getImm()));
|
||||
|
|
|
@ -329,25 +329,29 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
|
|||
|
||||
bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
|
||||
unsigned OpType = Desc.OpInfo[OpNo].OperandType;
|
||||
|
||||
return OpType == AMDGPU::OPERAND_REG_IMM32_INT ||
|
||||
OpType == AMDGPU::OPERAND_REG_IMM32_FP ||
|
||||
OpType == AMDGPU::OPERAND_REG_INLINE_C_INT ||
|
||||
OpType == AMDGPU::OPERAND_REG_INLINE_C_FP;
|
||||
return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
|
||||
OpType <= AMDGPU::OPERAND_SRC_LAST;
|
||||
}
|
||||
|
||||
bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
|
||||
unsigned OpType = Desc.OpInfo[OpNo].OperandType;
|
||||
|
||||
return OpType == AMDGPU::OPERAND_REG_IMM32_FP ||
|
||||
OpType == AMDGPU::OPERAND_REG_INLINE_C_FP;
|
||||
switch (OpType) {
|
||||
case AMDGPU::OPERAND_REG_IMM_FP32:
|
||||
case AMDGPU::OPERAND_REG_IMM_FP64:
|
||||
case AMDGPU::OPERAND_REG_IMM_FP16:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
|
||||
unsigned OpType = Desc.OpInfo[OpNo].OperandType;
|
||||
|
||||
return OpType == AMDGPU::OPERAND_REG_INLINE_C_INT ||
|
||||
OpType == AMDGPU::OPERAND_REG_INLINE_C_FP;
|
||||
return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST &&
|
||||
OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST;
|
||||
}
|
||||
|
||||
// Avoid using MCRegisterClass::getSize, since that function will go away
|
||||
|
@ -413,6 +417,15 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
|
|||
if (Literal >= -16 && Literal <= 64)
|
||||
return true;
|
||||
|
||||
// The actual type of the operand does not seem to matter as long
|
||||
// as the bits match one of the inline immediate values. For example:
|
||||
//
|
||||
// -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal,
|
||||
// so it is a legal inline immediate.
|
||||
//
|
||||
// 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in
|
||||
// floating-point, so it is a legal inline immediate.
|
||||
|
||||
uint32_t Val = static_cast<uint32_t>(Literal);
|
||||
return (Val == FloatToBits(0.0f)) ||
|
||||
(Val == FloatToBits(1.0f)) ||
|
||||
|
@ -426,6 +439,23 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
|
|||
(Val == 0x3e22f983 && HasInv2Pi);
|
||||
}
|
||||
|
||||
/// \returns true if the 16-bit pattern \p Literal can be encoded as an inline
/// immediate: either an integer in [-16, 64] or the half-precision bit
/// pattern of +-0.5, +-1.0, +-2.0, +-4.0 or 1/(2*pi).
///
/// When \p HasInv2Pi is false, nothing is reported as inlinable. This used to
/// be an assert, which disappears in builds without assertions and then let
/// the 1/(2*pi) pattern be accepted unconditionally; answering "not
/// inlinable" is the conservative result.
/// NOTE(review): presumably subtargets without the 1/(2*pi) inline immediate
/// have no 16-bit operands at all -- confirm against the callers.
bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
  if (!HasInv2Pi)
    return false;

  // Small integers are always available as inline constants.
  if (Literal >= -16 && Literal <= 64)
    return true;

  // Otherwise only the raw bit pattern matters, not the nominal type.
  switch (static_cast<uint16_t>(Literal)) {
  case 0x3C00: // 1.0
  case 0xBC00: // -1.0
  case 0x3800: // 0.5
  case 0xB800: // -0.5
  case 0x4000: // 2.0
  case 0xC000: // -2.0
  case 0x4400: // 4.0
  case 0xC400: // -4.0
  case 0x3118: // 1/2pi
    return true;
  default:
    return false;
  }
}
|
||||
|
||||
} // End namespace AMDGPU
|
||||
} // End namespace llvm
|
||||
|
|
|
@ -13,6 +13,8 @@
|
|||
#include "AMDKernelCodeT.h"
|
||||
#include "llvm/IR/CallingConv.h"
|
||||
|
||||
#include "SIDefines.h"
|
||||
|
||||
#define GET_INSTRINFO_OPERAND_ENUM
|
||||
#include "AMDGPUGenInstrInfo.inc"
|
||||
#undef GET_INSTRINFO_OPERAND_ENUM
|
||||
|
@ -167,6 +169,37 @@ unsigned getRegBitWidth(const MCRegisterClass &RC);
|
|||
unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
|
||||
unsigned OpNo);
|
||||
|
||||
LLVM_READNONE
|
||||
inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
|
||||
switch (OpInfo.OperandType) {
|
||||
case AMDGPU::OPERAND_REG_IMM_INT32:
|
||||
case AMDGPU::OPERAND_REG_IMM_FP32:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP32:
|
||||
return 4;
|
||||
|
||||
case AMDGPU::OPERAND_REG_IMM_INT64:
|
||||
case AMDGPU::OPERAND_REG_IMM_FP64:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT64:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP64:
|
||||
return 8;
|
||||
|
||||
case AMDGPU::OPERAND_REG_IMM_INT16:
|
||||
case AMDGPU::OPERAND_REG_IMM_FP16:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
|
||||
case AMDGPU::OPERAND_REG_INLINE_C_FP16:
|
||||
return 2;
|
||||
|
||||
default:
|
||||
llvm_unreachable("unhandled operand type");
|
||||
}
|
||||
}
|
||||
|
||||
LLVM_READNONE
|
||||
inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
|
||||
return getOperandSize(Desc.OpInfo[OpNo]);
|
||||
}
|
||||
|
||||
/// \brief Is this literal inlinable
|
||||
LLVM_READNONE
|
||||
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
|
||||
|
@ -174,6 +207,8 @@ bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
|
|||
LLVM_READNONE
|
||||
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi);
|
||||
|
||||
LLVM_READNONE
|
||||
bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
|
||||
|
||||
} // end namespace AMDGPU
|
||||
} // end namespace llvm
|
||||
|
|
|
@ -134,7 +134,8 @@ multiclass VOP2eInst <string opName,
|
|||
}
|
||||
|
||||
class VOP_MADAK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
|
||||
field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, f32kimm:$imm);
|
||||
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
|
||||
field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm);
|
||||
field string Asm32 = "$vdst, $src0, $src1, $imm";
|
||||
field bit HasExt = 0;
|
||||
}
|
||||
|
@ -143,7 +144,8 @@ def VOP_MADAK_F16 : VOP_MADAK <f16>;
|
|||
def VOP_MADAK_F32 : VOP_MADAK <f32>;
|
||||
|
||||
class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
|
||||
field dag Ins32 = (ins VCSrc_f32:$src0, f32kimm:$imm, VGPR_32:$src1);
|
||||
field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm);
|
||||
field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1);
|
||||
field string Asm32 = "$vdst, $src0, $imm, $src1";
|
||||
field bit HasExt = 0;
|
||||
}
|
||||
|
|
|
@ -41,7 +41,7 @@ two:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}br_cc_f16_imm_a
|
||||
; GCN: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x380{{0|1}}{{$}}
|
||||
; SI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}}
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
|
||||
; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
|
||||
|
||||
|
@ -49,7 +49,7 @@ two:
|
|||
; SI: v_cmp_ngt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
|
||||
; SI: s_cbranch_vccz
|
||||
|
||||
; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
|
||||
; VI: v_cmp_nlt_f16_e32 vcc, 0.5, v[[B_F16]]
|
||||
; VI: s_cbranch_vccnz
|
||||
|
||||
; VI: one{{$}}
|
||||
|
@ -80,13 +80,13 @@ two:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}br_cc_f16_imm_b
|
||||
; GCN: v_mov_b32_e32 v[[B_F16:[0-9]+]], {{0x37ff|0x3800}}{{$}}
|
||||
; SI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
|
||||
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
|
||||
; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
|
||||
|
||||
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
|
||||
; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
|
||||
; VI: v_cmp_ngt_f16_e32 vcc, v[[B_F16]], v[[A_F16]]
|
||||
; VI: v_cmp_ngt_f16_e32 vcc, 0.5, v[[A_F16]]
|
||||
; GCN: s_cbranch_vccnz
|
||||
|
||||
; GCN: one{{$}}
|
||||
|
|
|
@ -693,11 +693,16 @@ define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
|
|||
ret void
|
||||
}
|
||||
|
||||
|
||||
; FIXME: Should be able to fold this frameindex
|
||||
; Without commuting the frame index in the pre-regalloc run of
|
||||
; SIShrinkInstructions, this was using the VOP3 compare.
|
||||
|
||||
; GCN-LABEL: {{^}}commute_frameindex:
|
||||
; GCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
|
||||
; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
|
||||
|
||||
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
|
||||
; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
|
||||
define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
|
||||
entry:
|
||||
%stack0 = alloca i32
|
||||
|
|
|
@ -29,7 +29,7 @@ entry:
|
|||
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
|
||||
; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
|
||||
; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0x3c00, v[[B_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define void @fadd_f16_imm_a(
|
||||
|
@ -48,7 +48,7 @@ entry:
|
|||
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
|
||||
; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
|
||||
; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0x4000, v[[A_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define void @fadd_f16_imm_b(
|
||||
|
@ -104,8 +104,8 @@ entry:
|
|||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0x3c00, v[[B_V2_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0x4000, v[[B_F16_1]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
|
||||
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
@ -132,8 +132,8 @@ entry:
|
|||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0x4000, v[[A_V2_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0x3c00, v[[A_F16_1]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 2.0, v[[A_V2_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 1.0, v[[A_F16_1]]
|
||||
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
|
|
@ -48,7 +48,7 @@ entry:
|
|||
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
|
||||
; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define void @fmul_f16_imm_b(
|
||||
|
@ -105,7 +105,7 @@ entry:
|
|||
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
|
||||
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
@ -132,7 +132,7 @@ entry:
|
|||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||
; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
|
||||
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
|
|
|
@ -29,7 +29,7 @@ entry:
|
|||
; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
|
||||
; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
|
||||
; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 0x3c00, v[[B_F16]]
|
||||
; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define void @fsub_f16_imm_a(
|
||||
|
@ -48,7 +48,7 @@ entry:
|
|||
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
|
||||
; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
|
||||
; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0xc000, v[[A_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define void @fsub_f16_imm_b(
|
||||
|
@ -104,8 +104,8 @@ entry:
|
|||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 0x3c00, v[[B_V2_F16]]
|
||||
; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 0x4000, v[[B_F16_1]]
|
||||
; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
|
||||
; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
|
||||
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
@ -132,8 +132,8 @@ entry:
|
|||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0xc000, v[[A_V2_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0xbc00, v[[A_F16_1]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
|
||||
; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
|
||||
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
|
|
@ -0,0 +1,316 @@
|
|||
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
|
||||
; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
|
||||
|
||||
; FIXME: Merge into imm.ll
|
||||
|
||||
; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_i16:
|
||||
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
|
||||
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
|
||||
store volatile i16 -32768, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}store_inline_imm_0.0_f16:
|
||||
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
|
||||
store half 0.0, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}store_imm_neg_0.0_f16:
|
||||
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
|
||||
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
|
||||
store half -0.0, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}store_inline_imm_0.5_f16:
|
||||
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3800{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
|
||||
store half 0.5, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}store_inline_imm_m_0.5_f16:
|
||||
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800{{$}}
|
||||
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb800{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
|
||||
store half -0.5, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}store_inline_imm_1.0_f16:
|
||||
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
|
||||
store half 1.0, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}store_inline_imm_m_1.0_f16:
|
||||
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00{{$}}
|
||||
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
|
||||
store half -1.0, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}store_inline_imm_2.0_f16:
|
||||
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
|
||||
store half 2.0, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}store_inline_imm_m_2.0_f16:
|
||||
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000{{$}}
|
||||
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc000{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
|
||||
store half -2.0, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}store_inline_imm_4.0_f16:
|
||||
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4400{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
|
||||
store half 4.0, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}store_inline_imm_m_4.0_f16:
|
||||
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400{{$}}
|
||||
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc400{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
|
||||
store half -4.0, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_f16:
|
||||
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3118{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
|
||||
store half 0xH3118, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_f16:
|
||||
; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118{{$}}
|
||||
; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb118{{$}}
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
|
||||
store half 0xHB118, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}store_literal_imm_f16:
|
||||
; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c00
|
||||
; GCN: buffer_store_short [[REG]]
|
||||
define void @store_literal_imm_f16(half addrspace(1)* %out) {
|
||||
store half 4096.0, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_0.0_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 0.0
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_0.5_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 0.5
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, -0.5
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_1.0_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 1.0
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, -1.0
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_2.0_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 2.0
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, -2.0
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_4.0_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 4.0
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, -4.0
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
|
||||
%x = load half, half addrspace(1)* %in
|
||||
%y = fadd half %x, 0.5
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}commute_add_literal_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0x6400, [[VAL]]
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
|
||||
%x = load half, half addrspace(1)* %in
|
||||
%y = fadd half %x, 1024.0
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_1_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 0xH0001
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_2_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 0xH0002
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_16_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 0xH0010
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_neg_1_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 0xHFFFF
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_neg_2_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 0xHFFFE
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_neg_16_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], -16, [[VAL]]{{$}}
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 0xHFFF0
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_63_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]]
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 0xH003F
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}add_inline_imm_64_f16:
|
||||
; VI: buffer_load_ushort [[VAL:v[0-9]+]]
|
||||
; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]]
|
||||
; VI: buffer_store_short [[REG]]
|
||||
define void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
|
||||
%y = fadd half %x, 0xH0040
|
||||
store half %y, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
|
@ -20,7 +20,7 @@ define void @ldexp_f16(
|
|||
|
||||
; GCN-LABEL: {{^}}ldexp_f16_imm_a
|
||||
; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
|
||||
; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 0x4000, v[[B_I32]]
|
||||
; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
define void @ldexp_f16_imm_a(
|
||||
half addrspace(1)* %r,
|
||||
|
|
|
@ -51,7 +51,7 @@ entry:
|
|||
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
|
||||
; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
|
||||
; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]]
|
||||
; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define void @maxnum_f16_imm_b(
|
||||
|
@ -108,7 +108,7 @@ entry:
|
|||
; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||
; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]]
|
||||
; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
|
||||
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
@ -135,7 +135,7 @@ entry:
|
|||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]]
|
||||
; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||
; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
|
||||
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
|
|
|
@ -51,7 +51,7 @@ entry:
|
|||
; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
|
||||
; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
|
||||
; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]]
|
||||
; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
define void @minnum_f16_imm_b(
|
||||
|
@ -108,7 +108,7 @@ entry:
|
|||
; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
|
||||
; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]]
|
||||
; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
|
||||
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
|
||||
|
@ -135,7 +135,7 @@ entry:
|
|||
; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
|
||||
; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
|
||||
; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]]
|
||||
; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
|
||||
; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
|
||||
; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
|
||||
; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
|
||||
|
|
|
@ -45,8 +45,7 @@ entry:
|
|||
; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
|
||||
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
|
||||
; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}}
|
||||
; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
|
||||
; VI: v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]]
|
||||
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
@ -76,8 +75,7 @@ entry:
|
|||
; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]]
|
||||
; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]]
|
||||
; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
|
||||
; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
|
||||
; VI: v_cmp_gt_f16_e32 vcc, v[[B_F16]], v[[A_F16]]
|
||||
; VI: v_cmp_gt_f16_e32 vcc, 0.5, v[[A_F16]]
|
||||
; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
|
||||
; GCN: buffer_store_short v[[R_F16]]
|
||||
; GCN: s_endpgm
|
||||
|
|
|
@ -0,0 +1,709 @@
|
|||
# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s
|
||||
--- |
|
||||
define void @add_f32_1.0_one_f16_use() #0 {
|
||||
%f16.val0 = load volatile half, half addrspace(1)* undef
|
||||
%f16.val1 = load volatile half, half addrspace(1)* undef
|
||||
%f32.val = load volatile float, float addrspace(1)* undef
|
||||
%f16.add0 = fadd half %f16.val0, 0xH3C00
|
||||
%f32.add = fadd float %f32.val, 1.000000e+00
|
||||
store volatile half %f16.add0, half addrspace(1)* undef
|
||||
store volatile float %f32.add, float addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @add_f32_1.0_multi_f16_use() #0 {
|
||||
%f16.val0 = load volatile half, half addrspace(1)* undef
|
||||
%f16.val1 = load volatile half, half addrspace(1)* undef
|
||||
%f32.val = load volatile float, float addrspace(1)* undef
|
||||
%f16.add0 = fadd half %f16.val0, 0xH3C00
|
||||
%f32.add = fadd float %f32.val, 1.000000e+00
|
||||
store volatile half %f16.add0, half addrspace(1)* undef
|
||||
store volatile float %f32.add, float addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @add_f32_1.0_one_f32_use_one_f16_use () #0 {
|
||||
%f16.val0 = load volatile half, half addrspace(1)* undef
|
||||
%f16.val1 = load volatile half, half addrspace(1)* undef
|
||||
%f32.val = load volatile float, float addrspace(1)* undef
|
||||
%f16.add0 = fadd half %f16.val0, 0xH3C00
|
||||
%f32.add = fadd float %f32.val, 1.000000e+00
|
||||
store volatile half %f16.add0, half addrspace(1)* undef
|
||||
store volatile float %f32.add, float addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @add_f32_1.0_one_f32_use_multi_f16_use () #0 {
|
||||
%f16.val0 = load volatile half, half addrspace(1)* undef
|
||||
%f16.val1 = load volatile half, half addrspace(1)* undef
|
||||
%f32.val = load volatile float, float addrspace(1)* undef
|
||||
%f16.add0 = fadd half %f16.val0, 0xH3C00
|
||||
%f16.add1 = fadd half %f16.val1, 0xH3C00
|
||||
%f32.add = fadd float %f32.val, 1.000000e+00
|
||||
store volatile half %f16.add0, half addrspace(1)* undef
|
||||
store volatile half %f16.add1, half addrspace(1)* undef
|
||||
store volatile float %f32.add, float addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @add_i32_1_multi_f16_use() #0 {
|
||||
%f16.val0 = load volatile half, half addrspace(1)* undef
|
||||
%f16.val1 = load volatile half, half addrspace(1)* undef
|
||||
%f16.add0 = fadd half %f16.val0, 0xH0001
|
||||
%f16.add1 = fadd half %f16.val1, 0xH0001
|
||||
store volatile half %f16.add0, half addrspace(1)* undef
|
||||
store volatile half %f16.add1,half addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @add_i32_m2_one_f32_use_multi_f16_use () #0 {
|
||||
%f16.val0 = load volatile half, half addrspace(1)* undef
|
||||
%f16.val1 = load volatile half, half addrspace(1)* undef
|
||||
%f32.val = load volatile float, float addrspace(1)* undef
|
||||
%f16.add0 = fadd half %f16.val0, 0xHFFFE
|
||||
%f16.add1 = fadd half %f16.val1, 0xHFFFE
|
||||
%f32.add = fadd float %f32.val, 0xffffffffc0000000
|
||||
store volatile half %f16.add0, half addrspace(1)* undef
|
||||
store volatile half %f16.add1, half addrspace(1)* undef
|
||||
store volatile float %f32.add, float addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @add_f16_1.0_multi_f32_use() #0 {
|
||||
%f32.val0 = load volatile float, float addrspace(1)* undef
|
||||
%f32.val1 = load volatile float, float addrspace(1)* undef
|
||||
%f32.val = load volatile float, float addrspace(1)* undef
|
||||
%f32.add0 = fadd float %f32.val0, 1.0
|
||||
%f32.add1 = fadd float %f32.val1, 1.0
|
||||
store volatile float %f32.add0, float addrspace(1)* undef
|
||||
store volatile float %f32.add1, float addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @add_f16_1.0_other_high_bits_multi_f16_use() #0 {
|
||||
%f16.val0 = load volatile half, half addrspace(1)* undef
|
||||
%f16.val1 = load volatile half, half addrspace(1)* undef
|
||||
%f32.val = load volatile half, half addrspace(1)* undef
|
||||
%f16.add0 = fadd half %f16.val0, 0xH3C00
|
||||
%f32.add = fadd half %f32.val, 1.000000e+00
|
||||
store volatile half %f16.add0, half addrspace(1)* undef
|
||||
store volatile half %f32.add, half addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
define void @add_f16_1.0_other_high_bits_use_f16_f32() #0 {
|
||||
%f16.val0 = load volatile half, half addrspace(1)* undef
|
||||
%f16.val1 = load volatile half, half addrspace(1)* undef
|
||||
%f32.val = load volatile half, half addrspace(1)* undef
|
||||
%f16.add0 = fadd half %f16.val0, 0xH3C00
|
||||
%f32.add = fadd half %f32.val, 1.000000e+00
|
||||
store volatile half %f16.add0, half addrspace(1)* undef
|
||||
store volatile half %f32.add, half addrspace(1)* undef
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
||||
...
|
||||
---
|
||||
|
||||
# f32 1.0 with a single use should be folded as the low 32-bits of a
|
||||
# literal constant.
|
||||
|
||||
# CHECK-LABEL: name: add_f32_1.0_one_f16_use
|
||||
# CHECK: %13 = V_ADD_F16_e32 1065353216, killed %11, implicit %exec
|
||||
|
||||
name: add_f32_1.0_one_f16_use
|
||||
alignment: 0
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: sreg_64 }
|
||||
- { id: 1, class: sreg_32 }
|
||||
- { id: 2, class: sgpr_32 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: sreg_64 }
|
||||
- { id: 5, class: sreg_32 }
|
||||
- { id: 6, class: sreg_64 }
|
||||
- { id: 7, class: sreg_32 }
|
||||
- { id: 8, class: sreg_32 }
|
||||
- { id: 9, class: sreg_32 }
|
||||
- { id: 10, class: sreg_128 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
- { id: 12, class: vgpr_32 }
|
||||
- { id: 13, class: vgpr_32 }
|
||||
frameInfo:
|
||||
isFrameAddressTaken: false
|
||||
isReturnAddressTaken: false
|
||||
hasStackMap: false
|
||||
hasPatchPoint: false
|
||||
stackSize: 0
|
||||
offsetAdjustment: 0
|
||||
maxAlignment: 0
|
||||
adjustsStack: false
|
||||
hasCalls: false
|
||||
maxCallFrameSize: 0
|
||||
hasOpaqueSPAdjustment: false
|
||||
hasVAStart: false
|
||||
hasMustTailInVarArgFunc: false
|
||||
body: |
|
||||
bb.0 (%ir-block.0):
|
||||
%4 = IMPLICIT_DEF
|
||||
%5 = COPY %4.sub1
|
||||
%6 = IMPLICIT_DEF
|
||||
%7 = COPY %6.sub0
|
||||
%8 = S_MOV_B32 61440
|
||||
%9 = S_MOV_B32 -1
|
||||
%10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
|
||||
%11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
|
||||
%12 = V_MOV_B32_e32 1065353216, implicit %exec
|
||||
%13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit %exec
|
||||
BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
S_ENDPGM
|
||||
|
||||
...
|
||||
---
|
||||
# Materialized f32 inline immediate should not be folded into the f16
|
||||
# operands
|
||||
|
||||
# CHECK-LABEL: name: add_f32_1.0_multi_f16_use
|
||||
# CHECK: %13 = V_MOV_B32_e32 1065353216, implicit %exec
|
||||
# CHECK: %14 = V_ADD_F16_e32 %13, killed %11, implicit %exec
|
||||
# CHECK: %15 = V_ADD_F16_e32 killed %13, killed %12, implicit %exec
|
||||
|
||||
|
||||
name: add_f32_1.0_multi_f16_use
|
||||
alignment: 0
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: sreg_64 }
|
||||
- { id: 1, class: sreg_32 }
|
||||
- { id: 2, class: sgpr_32 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: sreg_64 }
|
||||
- { id: 5, class: sreg_32 }
|
||||
- { id: 6, class: sreg_64 }
|
||||
- { id: 7, class: sreg_32 }
|
||||
- { id: 8, class: sreg_32 }
|
||||
- { id: 9, class: sreg_32 }
|
||||
- { id: 10, class: sreg_128 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
- { id: 12, class: vgpr_32 }
|
||||
- { id: 13, class: vgpr_32 }
|
||||
- { id: 14, class: vgpr_32 }
|
||||
- { id: 15, class: vgpr_32 }
|
||||
frameInfo:
|
||||
isFrameAddressTaken: false
|
||||
isReturnAddressTaken: false
|
||||
hasStackMap: false
|
||||
hasPatchPoint: false
|
||||
stackSize: 0
|
||||
offsetAdjustment: 0
|
||||
maxAlignment: 0
|
||||
adjustsStack: false
|
||||
hasCalls: false
|
||||
maxCallFrameSize: 0
|
||||
hasOpaqueSPAdjustment: false
|
||||
hasVAStart: false
|
||||
hasMustTailInVarArgFunc: false
|
||||
body: |
|
||||
bb.0 (%ir-block.0):
|
||||
%4 = IMPLICIT_DEF
|
||||
%5 = COPY %4.sub1
|
||||
%6 = IMPLICIT_DEF
|
||||
%7 = COPY %6.sub0
|
||||
%8 = S_MOV_B32 61440
|
||||
%9 = S_MOV_B32 -1
|
||||
%10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
|
||||
%11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
|
||||
%12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
|
||||
%13 = V_MOV_B32_e32 1065353216, implicit %exec
|
||||
%14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit %exec
|
||||
%15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit %exec
|
||||
BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
S_ENDPGM
|
||||
|
||||
...
|
||||
---
|
||||
|
||||
# f32 1.0 should be folded into the single f32 use as an inline
|
||||
# immediate, and folded into the single f16 use as a literal constant
|
||||
|
||||
# CHECK-LABEL: name: add_f32_1.0_one_f32_use_one_f16_use
|
||||
# CHECK: %15 = V_ADD_F16_e32 1065353216, %11, implicit %exec
|
||||
# CHECK: %16 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
|
||||
|
||||
name: add_f32_1.0_one_f32_use_one_f16_use
|
||||
alignment: 0
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: sreg_64 }
|
||||
- { id: 1, class: sreg_32 }
|
||||
- { id: 2, class: sgpr_32 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: sreg_64 }
|
||||
- { id: 5, class: sreg_32 }
|
||||
- { id: 6, class: sreg_64 }
|
||||
- { id: 7, class: sreg_32 }
|
||||
- { id: 8, class: sreg_32 }
|
||||
- { id: 9, class: sreg_32 }
|
||||
- { id: 10, class: sreg_128 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
- { id: 12, class: vgpr_32 }
|
||||
- { id: 13, class: vgpr_32 }
|
||||
- { id: 14, class: vgpr_32 }
|
||||
- { id: 15, class: vgpr_32 }
|
||||
- { id: 16, class: vgpr_32 }
|
||||
frameInfo:
|
||||
isFrameAddressTaken: false
|
||||
isReturnAddressTaken: false
|
||||
hasStackMap: false
|
||||
hasPatchPoint: false
|
||||
stackSize: 0
|
||||
offsetAdjustment: 0
|
||||
maxAlignment: 0
|
||||
adjustsStack: false
|
||||
hasCalls: false
|
||||
maxCallFrameSize: 0
|
||||
hasOpaqueSPAdjustment: false
|
||||
hasVAStart: false
|
||||
hasMustTailInVarArgFunc: false
|
||||
body: |
|
||||
bb.0 (%ir-block.0):
|
||||
%4 = IMPLICIT_DEF
|
||||
%5 = COPY %4.sub1
|
||||
%6 = IMPLICIT_DEF
|
||||
%7 = COPY %6.sub0
|
||||
%8 = S_MOV_B32 61440
|
||||
%9 = S_MOV_B32 -1
|
||||
%10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
|
||||
%11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
|
||||
%12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
|
||||
%13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
|
||||
%14 = V_MOV_B32_e32 1065353216, implicit %exec
|
||||
%15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
|
||||
%16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
|
||||
BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
|
||||
S_ENDPGM
|
||||
|
||||
...
|
||||
---
|
||||
|
||||
# f32 1.0 should be folded for the single f32 use as an inline
|
||||
# constant, and not folded as a multi-use literal for the f16 cases
|
||||
|
||||
# CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use
|
||||
# CHECK: %14 = V_MOV_B32_e32 1065353216, implicit %exec
|
||||
# CHECK: %15 = V_ADD_F16_e32 %14, %11, implicit %exec
|
||||
# CHECK: %16 = V_ADD_F16_e32 %14, %12, implicit %exec
|
||||
# CHECK: %17 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec
|
||||
|
||||
name: add_f32_1.0_one_f32_use_multi_f16_use
|
||||
alignment: 0
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: sreg_64 }
|
||||
- { id: 1, class: sreg_32 }
|
||||
- { id: 2, class: sgpr_32 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: sreg_64 }
|
||||
- { id: 5, class: sreg_32 }
|
||||
- { id: 6, class: sreg_64 }
|
||||
- { id: 7, class: sreg_32 }
|
||||
- { id: 8, class: sreg_32 }
|
||||
- { id: 9, class: sreg_32 }
|
||||
- { id: 10, class: sreg_128 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
- { id: 12, class: vgpr_32 }
|
||||
- { id: 13, class: vgpr_32 }
|
||||
- { id: 14, class: vgpr_32 }
|
||||
- { id: 15, class: vgpr_32 }
|
||||
- { id: 16, class: vgpr_32 }
|
||||
- { id: 17, class: vgpr_32 }
|
||||
frameInfo:
|
||||
isFrameAddressTaken: false
|
||||
isReturnAddressTaken: false
|
||||
hasStackMap: false
|
||||
hasPatchPoint: false
|
||||
stackSize: 0
|
||||
offsetAdjustment: 0
|
||||
maxAlignment: 0
|
||||
adjustsStack: false
|
||||
hasCalls: false
|
||||
maxCallFrameSize: 0
|
||||
hasOpaqueSPAdjustment: false
|
||||
hasVAStart: false
|
||||
hasMustTailInVarArgFunc: false
|
||||
body: |
|
||||
bb.0 (%ir-block.0):
|
||||
%4 = IMPLICIT_DEF
|
||||
%5 = COPY %4.sub1
|
||||
%6 = IMPLICIT_DEF
|
||||
%7 = COPY %6.sub0
|
||||
%8 = S_MOV_B32 61440
|
||||
%9 = S_MOV_B32 -1
|
||||
%10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
|
||||
%11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
|
||||
%12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
|
||||
%13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
|
||||
%14 = V_MOV_B32_e32 1065353216, implicit %exec
|
||||
%15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
|
||||
%16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit %exec
|
||||
%17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
|
||||
BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
|
||||
S_ENDPGM
|
||||
|
||||
...
|
||||
---
|
||||
# CHECK-LABEL: name: add_i32_1_multi_f16_use
|
||||
# CHECK: %13 = V_MOV_B32_e32 1, implicit %exec
|
||||
# CHECK: %14 = V_ADD_F16_e32 1, killed %11, implicit %exec
|
||||
# CHECK: %15 = V_ADD_F16_e32 1, killed %12, implicit %exec
|
||||
|
||||
|
||||
name: add_i32_1_multi_f16_use
|
||||
alignment: 0
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: sreg_64 }
|
||||
- { id: 1, class: sreg_32 }
|
||||
- { id: 2, class: sgpr_32 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: sreg_64 }
|
||||
- { id: 5, class: sreg_32 }
|
||||
- { id: 6, class: sreg_64 }
|
||||
- { id: 7, class: sreg_32 }
|
||||
- { id: 8, class: sreg_32 }
|
||||
- { id: 9, class: sreg_32 }
|
||||
- { id: 10, class: sreg_128 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
- { id: 12, class: vgpr_32 }
|
||||
- { id: 13, class: vgpr_32 }
|
||||
- { id: 14, class: vgpr_32 }
|
||||
- { id: 15, class: vgpr_32 }
|
||||
frameInfo:
|
||||
isFrameAddressTaken: false
|
||||
isReturnAddressTaken: false
|
||||
hasStackMap: false
|
||||
hasPatchPoint: false
|
||||
stackSize: 0
|
||||
offsetAdjustment: 0
|
||||
maxAlignment: 0
|
||||
adjustsStack: false
|
||||
hasCalls: false
|
||||
maxCallFrameSize: 0
|
||||
hasOpaqueSPAdjustment: false
|
||||
hasVAStart: false
|
||||
hasMustTailInVarArgFunc: false
|
||||
body: |
|
||||
bb.0 (%ir-block.0):
|
||||
%4 = IMPLICIT_DEF
|
||||
%5 = COPY %4.sub1
|
||||
%6 = IMPLICIT_DEF
|
||||
%7 = COPY %6.sub0
|
||||
%8 = S_MOV_B32 61440
|
||||
%9 = S_MOV_B32 -1
|
||||
%10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
|
||||
%11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
|
||||
%12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
|
||||
%13 = V_MOV_B32_e32 1, implicit %exec
|
||||
%14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit %exec
|
||||
%15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit %exec
|
||||
BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
S_ENDPGM
|
||||
|
||||
...
|
||||
---
|
||||
|
||||
# CHECK-LABEL: name: add_i32_m2_one_f32_use_multi_f16_use
|
||||
# CHECK: %14 = V_MOV_B32_e32 -2, implicit %exec
|
||||
# CHECK: %15 = V_ADD_F16_e32 -2, %11, implicit %exec
|
||||
# CHECK: %16 = V_ADD_F16_e32 -2, %12, implicit %exec
|
||||
# CHECK: %17 = V_ADD_F32_e32 -2, killed %13, implicit %exec
|
||||
|
||||
name: add_i32_m2_one_f32_use_multi_f16_use
|
||||
alignment: 0
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: sreg_64 }
|
||||
- { id: 1, class: sreg_32 }
|
||||
- { id: 2, class: sgpr_32 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: sreg_64 }
|
||||
- { id: 5, class: sreg_32 }
|
||||
- { id: 6, class: sreg_64 }
|
||||
- { id: 7, class: sreg_32 }
|
||||
- { id: 8, class: sreg_32 }
|
||||
- { id: 9, class: sreg_32 }
|
||||
- { id: 10, class: sreg_128 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
- { id: 12, class: vgpr_32 }
|
||||
- { id: 13, class: vgpr_32 }
|
||||
- { id: 14, class: vgpr_32 }
|
||||
- { id: 15, class: vgpr_32 }
|
||||
- { id: 16, class: vgpr_32 }
|
||||
- { id: 17, class: vgpr_32 }
|
||||
frameInfo:
|
||||
isFrameAddressTaken: false
|
||||
isReturnAddressTaken: false
|
||||
hasStackMap: false
|
||||
hasPatchPoint: false
|
||||
stackSize: 0
|
||||
offsetAdjustment: 0
|
||||
maxAlignment: 0
|
||||
adjustsStack: false
|
||||
hasCalls: false
|
||||
maxCallFrameSize: 0
|
||||
hasOpaqueSPAdjustment: false
|
||||
hasVAStart: false
|
||||
hasMustTailInVarArgFunc: false
|
||||
body: |
|
||||
bb.0 (%ir-block.0):
|
||||
%4 = IMPLICIT_DEF
|
||||
%5 = COPY %4.sub1
|
||||
%6 = IMPLICIT_DEF
|
||||
%7 = COPY %6.sub0
|
||||
%8 = S_MOV_B32 61440
|
||||
%9 = S_MOV_B32 -1
|
||||
%10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
|
||||
%11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
|
||||
%12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
|
||||
%13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
|
||||
%14 = V_MOV_B32_e32 -2, implicit %exec
|
||||
%15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec
|
||||
%16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit %exec
|
||||
%17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec
|
||||
BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
|
||||
S_ENDPGM
|
||||
|
||||
...
|
||||
---
|
||||
|
||||
# f16 1.0 (0x3c00) is not an inline constant for f32 operands, so it
|
||||
# should not be folded as a multi-use literal into the f32 uses
|
||||
|
||||
# CHECK-LABEL: name: add_f16_1.0_multi_f32_use
|
||||
# CHECK: %13 = V_MOV_B32_e32 15360, implicit %exec
|
||||
# CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec
|
||||
# CHECK: %15 = V_ADD_F32_e32 %13, %12, implicit %exec
|
||||
|
||||
name: add_f16_1.0_multi_f32_use
|
||||
alignment: 0
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: sreg_64 }
|
||||
- { id: 1, class: sreg_32 }
|
||||
- { id: 2, class: sgpr_32 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: sreg_64 }
|
||||
- { id: 5, class: sreg_32 }
|
||||
- { id: 6, class: sreg_64 }
|
||||
- { id: 7, class: sreg_32 }
|
||||
- { id: 8, class: sreg_32 }
|
||||
- { id: 9, class: sreg_32 }
|
||||
- { id: 10, class: sreg_128 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
- { id: 12, class: vgpr_32 }
|
||||
- { id: 13, class: vgpr_32 }
|
||||
- { id: 14, class: vgpr_32 }
|
||||
- { id: 15, class: vgpr_32 }
|
||||
frameInfo:
|
||||
isFrameAddressTaken: false
|
||||
isReturnAddressTaken: false
|
||||
hasStackMap: false
|
||||
hasPatchPoint: false
|
||||
stackSize: 0
|
||||
offsetAdjustment: 0
|
||||
maxAlignment: 0
|
||||
adjustsStack: false
|
||||
hasCalls: false
|
||||
maxCallFrameSize: 0
|
||||
hasOpaqueSPAdjustment: false
|
||||
hasVAStart: false
|
||||
hasMustTailInVarArgFunc: false
|
||||
body: |
|
||||
bb.0 (%ir-block.0):
|
||||
%4 = IMPLICIT_DEF
|
||||
%5 = COPY %4.sub1
|
||||
%6 = IMPLICIT_DEF
|
||||
%7 = COPY %6.sub0
|
||||
%8 = S_MOV_B32 61440
|
||||
%9 = S_MOV_B32 -1
|
||||
%10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
|
||||
%11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
|
||||
%12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
|
||||
%13 = V_MOV_B32_e32 15360, implicit %exec
|
||||
%14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit %exec
|
||||
%15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit %exec
|
||||
BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
|
||||
BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
|
||||
S_ENDPGM
|
||||
|
||||
...
|
||||
---
|
||||
|
||||
# The low 16 bits are an inline immediate, but the high bits are junk
|
||||
# FIXME: Should be able to fold this
|
||||
|
||||
# CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use
|
||||
# CHECK: %13 = V_MOV_B32_e32 80886784, implicit %exec
|
||||
# CHECK: %14 = V_ADD_F16_e32 %13, %11, implicit %exec
|
||||
# CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec
|
||||
|
||||
name: add_f16_1.0_other_high_bits_multi_f16_use
|
||||
alignment: 0
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: sreg_64 }
|
||||
- { id: 1, class: sreg_32 }
|
||||
- { id: 2, class: sgpr_32 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: sreg_64 }
|
||||
- { id: 5, class: sreg_32 }
|
||||
- { id: 6, class: sreg_64 }
|
||||
- { id: 7, class: sreg_32 }
|
||||
- { id: 8, class: sreg_32 }
|
||||
- { id: 9, class: sreg_32 }
|
||||
- { id: 10, class: sreg_128 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
- { id: 12, class: vgpr_32 }
|
||||
- { id: 13, class: vgpr_32 }
|
||||
- { id: 14, class: vgpr_32 }
|
||||
- { id: 15, class: vgpr_32 }
|
||||
frameInfo:
|
||||
isFrameAddressTaken: false
|
||||
isReturnAddressTaken: false
|
||||
hasStackMap: false
|
||||
hasPatchPoint: false
|
||||
stackSize: 0
|
||||
offsetAdjustment: 0
|
||||
maxAlignment: 0
|
||||
adjustsStack: false
|
||||
hasCalls: false
|
||||
maxCallFrameSize: 0
|
||||
hasOpaqueSPAdjustment: false
|
||||
hasVAStart: false
|
||||
hasMustTailInVarArgFunc: false
|
||||
body: |
|
||||
bb.0 (%ir-block.0):
|
||||
%4 = IMPLICIT_DEF
|
||||
%5 = COPY %4.sub1
|
||||
%6 = IMPLICIT_DEF
|
||||
%7 = COPY %6.sub0
|
||||
%8 = S_MOV_B32 61440
|
||||
%9 = S_MOV_B32 -1
|
||||
%10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
|
||||
%11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
|
||||
%12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
|
||||
%13 = V_MOV_B32_e32 80886784, implicit %exec
|
||||
%14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit %exec
|
||||
%15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit %exec
|
||||
BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
S_ENDPGM
|
||||
|
||||
...
|
||||
---
|
||||
|
||||
# FIXME: Should fold inline immediate into f16 and literal use into
|
||||
# f32 instruction.
|
||||
|
||||
# CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32
|
||||
# CHECK: %13 = V_MOV_B32_e32 305413120, implicit %exec
|
||||
# CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec
|
||||
# CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec
|
||||
name: add_f16_1.0_other_high_bits_use_f16_f32
|
||||
alignment: 0
|
||||
exposesReturnsTwice: false
|
||||
legalized: false
|
||||
regBankSelected: false
|
||||
selected: false
|
||||
tracksRegLiveness: true
|
||||
registers:
|
||||
- { id: 0, class: sreg_64 }
|
||||
- { id: 1, class: sreg_32 }
|
||||
- { id: 2, class: sgpr_32 }
|
||||
- { id: 3, class: vgpr_32 }
|
||||
- { id: 4, class: sreg_64 }
|
||||
- { id: 5, class: sreg_32 }
|
||||
- { id: 6, class: sreg_64 }
|
||||
- { id: 7, class: sreg_32 }
|
||||
- { id: 8, class: sreg_32 }
|
||||
- { id: 9, class: sreg_32 }
|
||||
- { id: 10, class: sreg_128 }
|
||||
- { id: 11, class: vgpr_32 }
|
||||
- { id: 12, class: vgpr_32 }
|
||||
- { id: 13, class: vgpr_32 }
|
||||
- { id: 14, class: vgpr_32 }
|
||||
- { id: 15, class: vgpr_32 }
|
||||
frameInfo:
|
||||
isFrameAddressTaken: false
|
||||
isReturnAddressTaken: false
|
||||
hasStackMap: false
|
||||
hasPatchPoint: false
|
||||
stackSize: 0
|
||||
offsetAdjustment: 0
|
||||
maxAlignment: 0
|
||||
adjustsStack: false
|
||||
hasCalls: false
|
||||
maxCallFrameSize: 0
|
||||
hasOpaqueSPAdjustment: false
|
||||
hasVAStart: false
|
||||
hasMustTailInVarArgFunc: false
|
||||
body: |
|
||||
bb.0 (%ir-block.0):
|
||||
%4 = IMPLICIT_DEF
|
||||
%5 = COPY %4.sub1
|
||||
%6 = IMPLICIT_DEF
|
||||
%7 = COPY %6.sub0
|
||||
%8 = S_MOV_B32 61440
|
||||
%9 = S_MOV_B32 -1
|
||||
%10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4
|
||||
%11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`)
|
||||
%12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`)
|
||||
%13 = V_MOV_B32_e32 305413120, implicit %exec
|
||||
%14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit %exec
|
||||
%15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit %exec
|
||||
BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`)
|
||||
BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`)
|
||||
S_ENDPGM
|
||||
|
||||
...
|
|
@ -0,0 +1,21 @@
|
|||
// XFAIL: *
|
||||
// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=NOVI %s
|
||||
|
||||
v_add_f16 v1, 0xfffff, v2
|
||||
// NOVI: 19: error: invalid operand for instruction
|
||||
|
||||
v_add_f16 v1, 0x10000, v2
|
||||
// NOVI: 19: error: invalid operand for instruction
|
||||
|
||||
v_add_f16 v1, v2, -0.0
|
||||
v_add_f16 v1, v2, 1
|
||||
|
||||
|
||||
|
||||
// FIXME: Should give a truncation error
|
||||
v_add_f16 v1, -32769, v2
|
||||
v_add_f16 v1, 65536, v2
|
||||
|
||||
v_add_f32 v1, 4294967296, v2
|
||||
v_add_f32 v1, 0x0000000100000000, v2
|
||||
v_and_b32 v1, 0x0000000100000000, v2
|
|
@ -0,0 +1,148 @@
|
|||
// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI %s
|
||||
|
||||
v_add_f16 v1, 0, v2
|
||||
// VI: v_add_f16_e32 v1, 0, v2 ; encoding: [0x80,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0.0, v2
|
||||
// VI: v_add_f16_e32 v1, 0, v2 ; encoding: [0x80,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, v2, 0
|
||||
// VI: v_add_f16_e64 v1, v2, 0 ; encoding: [0x01,0x00,0x1f,0xd1,0x02,0x01,0x01,0x00]
|
||||
|
||||
v_add_f16 v1, v2, 0.0
|
||||
// VI: v_add_f16_e64 v1, v2, 0 ; encoding: [0x01,0x00,0x1f,0xd1,0x02,0x01,0x01,0x00]
|
||||
|
||||
v_add_f16 v1, -0.0, v2
|
||||
// VI: v_add_f16_e32 v1, 0x8000, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x00,0x80,0x00,0x00]
|
||||
|
||||
v_add_f16 v1, 1.0, v2
|
||||
// VI: v_add_f16_e32 v1, 1.0, v2 ; encoding: [0xf2,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, -1.0, v2
|
||||
// VI: v_add_f16_e32 v1, -1.0, v2 ; encoding: [0xf3,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, -0.5, v2
|
||||
// VI: v_add_f16_e32 v1, -0.5, v2 ; encoding: [0xf1,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0.5, v2
|
||||
// VI: v_add_f16_e32 v1, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 2.0, v2
|
||||
// VI: v_add_f16_e32 v1, 2.0, v2 ; encoding: [0xf4,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, -2.0, v2
|
||||
// VI: v_add_f16_e32 v1, -2.0, v2 ; encoding: [0xf5,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 4.0, v2
|
||||
// VI: v_add_f16_e32 v1, 4.0, v2 ; encoding: [0xf6,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, -4.0, v2
|
||||
// VI: v_add_f16_e32 v1, -4.0, v2 ; encoding: [0xf7,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0.15915494, v2
|
||||
// VI: v_add_f16_e32 v1, 0.15915494, v2 ; encoding: [0xf8,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, -0.15915494, v2
|
||||
// VI: v_add_f16_e32 v1, 0xb118, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x18,0xb1,0x00,0x00]
|
||||
|
||||
v_add_f16 v1, -1, v2
|
||||
// VI: v_add_f16_e32 v1, -1, v2 ; encoding: [0xc1,0x04,0x02,0x3e]
|
||||
|
||||
|
||||
v_add_f16 v1, -2, v2
|
||||
// VI: v_add_f16_e32 v1, -2, v2 ; encoding: [0xc2,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, -3, v2
|
||||
// VI: v_add_f16_e32 v1, -3, v2 ; encoding: [0xc3,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, -16, v2
|
||||
// VI: v_add_f16_e32 v1, -16, v2 ; encoding: [0xd0,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 1, v2
|
||||
// VI: v_add_f16_e32 v1, 1, v2 ; encoding: [0x81,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 2, v2
|
||||
// VI: v_add_f16_e32 v1, 2, v2 ; encoding: [0x82,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 3, v2
|
||||
// VI: v_add_f16_e32 v1, 3, v2 ; encoding: [0x83,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 4, v2
|
||||
// VI: v_add_f16_e32 v1, 4, v2 ; encoding: [0x84,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 15, v2
|
||||
// VI: v_add_f16_e32 v1, 15, v2 ; encoding: [0x8f,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 16, v2
|
||||
// VI: v_add_f16_e32 v1, 16, v2 ; encoding: [0x90,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 63, v2
|
||||
// VI: v_add_f16_e32 v1, 63, v2 ; encoding: [0xbf,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 64, v2
|
||||
// VI: v_add_f16_e32 v1, 64, v2 ; encoding: [0xc0,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0x0001, v2
|
||||
// VI: v_add_f16_e32 v1, 1, v2 ; encoding: [0x81,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0xffff, v2
|
||||
// VI: v_add_f16_e32 v1, -1, v2 ; encoding: [0xc1,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, -17, v2
|
||||
// VI: v_add_f16_e32 v1, 0xffef, v2 ; encoding: [0xff,0x04,0x02,0x3e,0xef,0xff,0x00,0x00]
|
||||
|
||||
v_add_f16 v1, 65, v2
|
||||
// VI: v_add_f16_e32 v1, 0x41, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x41,0x00,0x00,0x00]
|
||||
|
||||
v_add_f16 v1, 0x3c00, v2
|
||||
// VI: v_add_f16_e32 v1, 1.0, v2 ; encoding: [0xf2,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0xbc00, v2
|
||||
// VI: v_add_f16_e32 v1, -1.0, v2 ; encoding: [0xf3,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0x3800, v2
|
||||
// VI: v_add_f16_e32 v1, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0xb800, v2
|
||||
// VI: v_add_f16_e32 v1, -0.5, v2 ; encoding: [0xf1,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0x4000, v2
|
||||
// VI: v_add_f16_e32 v1, 2.0, v2 ; encoding: [0xf4,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0xc000, v2
|
||||
// VI: v_add_f16_e32 v1, -2.0, v2 ; encoding: [0xf5,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0x4400, v2
|
||||
// VI: v_add_f16_e32 v1, 4.0, v2 ; encoding: [0xf6,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0xc400, v2
|
||||
// VI: v_add_f16_e32 v1, -4.0, v2 ; encoding: [0xf7,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, 0x3118, v2
|
||||
// VI: v_add_f16_e32 v1, 0.15915494, v2 ; encoding: [0xf8,0x04,0x02,0x3e]
|
||||
|
||||
v_add_f16 v1, -32768, v2
|
||||
// VI: v_add_f16_e32 v1, 0x8000, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x00,0x80,0x00,0x00]
|
||||
|
||||
v_add_f16 v1, 32767, v2
|
||||
// VI: v_add_f16_e32 v1, 0x7fff, v2 ; encoding: [0xff,0x04,0x02,0x3e,0xff,0x7f,0x00,0x00]
|
||||
|
||||
v_add_f16 v1, 65535, v2
|
||||
// VI: v_add_f16_e32 v1, -1, v2 ; encoding: [0xc1,0x04,0x02,0x3e]
|
||||
|
||||
|
||||
// K-constant
|
||||
v_madmk_f16 v1, v2, 0x4280, v3
|
||||
// VI: v_madmk_f16_e32 v1, v2, 0x4280, v3 ; encoding: [0x02,0x07,0x02,0x48,0x80,0x42,0x00,0x00]
|
||||
|
||||
v_madmk_f16 v1, v2, 1.0, v3
|
||||
// VI: v_madmk_f16_e32 v1, v2, 0x3c00, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x3c,0x00,0x00]
|
||||
|
||||
v_madmk_f16 v1, v2, 1, v3
|
||||
// VI: v_madmk_f16_e32 v1, v2, 0x1, v3 ; encoding: [0x02,0x07,0x02,0x48,0x01,0x00,0x00,0x00]
|
||||
|
||||
v_madmk_f16 v1, v2, 64.0, v3
|
||||
// VI: v_madmk_f16_e32 v1, v2, 0x5400, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x54,0x00,0x00]
|
||||
|
||||
|
||||
v_add_f16_e32 v1, 64.0, v2
|
|
@ -422,12 +422,12 @@ v_mac_f16_e32 v1, v2, v3
|
|||
|
||||
// NOSICI: error: instruction not supported on this GPU
|
||||
// NOSICI: v_madmk_f16 v1, v2, 64.0, v3
|
||||
// VI: v_madmk_f16_e32 v1, v2, 0x42800000, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x00,0x80,0x42]
|
||||
// VI: v_madmk_f16_e32 v1, v2, 0x5400, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x54,0x00,0x00]
|
||||
v_madmk_f16 v1, v2, 64.0, v3
|
||||
|
||||
// NOSICI: error: instruction not supported on this GPU
|
||||
// NOSICI: v_madak_f16 v1, v2, v3, 64.0
|
||||
// VI: v_madak_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x00,0x80,0x42]
|
||||
// VI: v_madak_f16_e32 v1, v2, v3, 0x5400 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x54,0x00,0x00]
|
||||
v_madak_f16 v1, v2, v3, 64.0
|
||||
|
||||
// NOSICI: error: instruction not supported on this GPU
|
||||
|
|
|
@ -0,0 +1,54 @@
|
|||
# RUN: llvm-mc -arch=amdgcn -mcpu=tonga -disassemble -show-encoding %s | FileCheck -check-prefix=VI %s
|
||||
|
||||
# VI: v_add_f16_e32 v1, 0.5, v3 ; encoding: [0xf0,0x06,0x02,0x3e]
|
||||
0xf0 0x06 0x02 0x3e
|
||||
|
||||
# VI: v_add_f16_e32 v1, -0.5, v3 ; encoding: [0xf1,0x06,0x02,0x3e]
|
||||
0xf1 0x06 0x02 0x3e
|
||||
|
||||
# VI: v_add_f16_e32 v1, 1.0, v3 ; encoding: [0xf2,0x06,0x02,0x3e]
|
||||
0xf2 0x06 0x02 0x3e
|
||||
|
||||
# VI: v_add_f16_e32 v1, -1.0, v3 ; encoding: [0xf3,0x06,0x02,0x3e]
|
||||
0xf3 0x06 0x02 0x3e
|
||||
|
||||
# VI: v_add_f16_e32 v1, 2.0, v3 ; encoding: [0xf4,0x06,0x02,0x3e]
|
||||
0xf4 0x06 0x02 0x3e
|
||||
|
||||
# VI: v_add_f16_e32 v1, -2.0, v3 ; encoding: [0xf5,0x06,0x02,0x3e]
|
||||
0xf5 0x06 0x02 0x3e
|
||||
|
||||
# VI: v_add_f16_e32 v1, 4.0, v3 ; encoding: [0xf6,0x06,0x02,0x3e]
|
||||
0xf6 0x06 0x02 0x3e
|
||||
|
||||
# VI: v_add_f16_e32 v1, -4.0, v3 ; encoding: [0xf7,0x06,0x02,0x3e]
|
||||
0xf7 0x06 0x02 0x3e
|
||||
|
||||
# VI: v_add_f16_e32 v1, 0.15915494, v3 ; encoding: [0xf8,0x06,0x02,0x3e]
|
||||
0xf8 0x06 0x02 0x3e
|
||||
|
||||
# VI: v_add_f16_e32 v1, 0x41, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x00]
|
||||
0xff 0x06 0x02 0x3e 0x41 0x00 0x00 0x00
|
||||
|
||||
# VI: v_add_f16_e32 v1, 0x100, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x00,0x01,0x00,0x00]
|
||||
0xff 0x06 0x02 0x3e 0x00 0x01 0x00 0x00
|
||||
|
||||
# non-zero unused bits in constant
|
||||
# VI: v_add_f16_e32 v1, 0x10041, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x01,0x00]
|
||||
0xff 0x06 0x02 0x3e 0x41 0x00 0x01 0x00
|
||||
|
||||
# VI: v_add_f16_e32 v1, 0x1000041, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x01]
|
||||
0xff 0x06 0x02 0x3e 0x41 0x00 0x00 0x01
|
||||
|
||||
# FIXME: This should be able to round trip with literal after instruction
|
||||
# VI: v_add_f16_e32 v1, 0, v3 ; encoding: [0x80,0x06,0x02,0x3e]
|
||||
0xff 0x06 0x02 0x3e 0x00 0x00 0x00 0x00
|
||||
|
||||
# VI: v_madmk_f16_e32 v1, v2, 0x41, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x00,0x00]
|
||||
0x02 0x07 0x02 0x48 0x41 0x00 0x00 0x00
|
||||
|
||||
# VI: v_madmk_f16_e32 v1, v2, 0x10041, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x01,0x00]
|
||||
0x02 0x07 0x02 0x48 0x41 0x00 0x01 0x00
|
||||
|
||||
# VI: v_madmk_f16_e32 v1, v2, 0x1000041, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x00,0x01]
|
||||
0x02 0x07 0x02 0x48 0x41 0x00 0x00 0x01
|
|
@ -246,5 +246,5 @@
|
|||
# CHECK: v_cvt_f16_i16_e32 v123, 0x21c2 ; encoding: [0xff,0x74,0xf6,0x7e,0xc2,0x21,0x00,0x00]
|
||||
0xff 0x74 0xf6 0x7e 0xc2 0x21 0x00 0x00
|
||||
|
||||
# CHECK: v_cvt_u16_f16_e32 v123, 0x3f200000 ; encoding: [0xff,0x76,0xf6,0x7e,0x00,0x00,0x20,0x3f]
|
||||
0xff 0x76 0xf6 0x7e 0x00 0x00 0x20 0x3f
|
||||
# CHECK: v_cvt_u16_f16_e32 v123, 0x3f20 ; encoding: [0xff,0x76,0xf6,0x7e,0x20,0x3f,0x00,0x00]
|
||||
0xff 0x76 0xf6 0x7e 0x20 0x3f 0x00 0x00
|
||||
|
|
Loading…
Reference in New Issue