From 4bd72361935300f1699d3e49ba3304f0c7631f1a Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Sat, 10 Dec 2016 00:39:12 +0000 Subject: [PATCH] AMDGPU: Fix handling of 16-bit immediates Since 32-bit instructions with 32-bit input immediate behavior are used to materialize 16-bit constants in 32-bit registers for 16-bit instructions, determining the legality based on the size is incorrect. Change operands to have the size specified in the type. Also adds a workaround for a disassembler bug that produces an immediate MCOperand for an operand that is supposed to be OPERAND_REGISTER. The assembler appears to accept out of bounds immediates and truncates them, but this seems to be an issue for 32-bit already. llvm-svn: 289306 --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 205 +++-- .../Disassembler/AMDGPUDisassembler.cpp | 144 +++- .../AMDGPU/Disassembler/AMDGPUDisassembler.h | 4 +- .../AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp | 83 +- .../AMDGPU/InstPrinter/AMDGPUInstPrinter.h | 2 + .../AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp | 63 +- llvm/lib/Target/AMDGPU/SIDefines.h | 29 +- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 14 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 118 +-- llvm/lib/Target/AMDGPU/SIInstrInfo.h | 89 ++- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 52 +- llvm/lib/Target/AMDGPU/SIInstructions.td | 5 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 13 - llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 12 +- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 22 +- .../Target/AMDGPU/SIShrinkInstructions.cpp | 19 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 52 +- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 35 + llvm/lib/Target/AMDGPU/VOP2Instructions.td | 6 +- llvm/test/CodeGen/AMDGPU/br_cc.f16.ll | 8 +- llvm/test/CodeGen/AMDGPU/commute-compares.ll | 7 +- llvm/test/CodeGen/AMDGPU/fadd.f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/fmul.f16.ll | 6 +- llvm/test/CodeGen/AMDGPU/fsub.f16.ll | 12 +- llvm/test/CodeGen/AMDGPU/imm16.ll | 316 ++++++++ .../CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll | 2 +- llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll | 6 +- llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 6 +- llvm/test/CodeGen/AMDGPU/select.f16.ll | 6 +- .../CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir | 709 ++++++++++++++++++ llvm/test/MC/AMDGPU/literal16-err.s | 21 + llvm/test/MC/AMDGPU/literal16.s | 148 ++++ llvm/test/MC/AMDGPU/vop2.s | 4 +- .../MC/Disassembler/AMDGPU/literal16_vi.txt | 54 ++ llvm/test/MC/Disassembler/AMDGPU/vop1.txt | 4 +- 35 files changed, 2029 insertions(+), 259 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/imm16.ll create mode 100644 llvm/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir create mode 100644 llvm/test/MC/AMDGPU/literal16-err.s create mode 100644 llvm/test/MC/AMDGPU/literal16.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/literal16_vi.txt diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 453d0d91d3e3..80c815e830b9 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -215,6 +215,10 @@ public: return isRegKind() || isInlinableImm(type); } + bool isRegOrImmWithInt16InputMods() const { + return isRegOrImmWithInputMods(MVT::i16); + } + bool isRegOrImmWithInt32InputMods() const { return isRegOrImmWithInputMods(MVT::i32); } @@ -223,6 +227,10 @@ public: return isRegOrImmWithInputMods(MVT::i64); } + bool isRegOrImmWithFP16InputMods() const { + return isRegOrImmWithInputMods(MVT::f16); + } + bool isRegOrImmWithFP32InputMods() const { 
return isRegOrImmWithInputMods(MVT::f32); } @@ -282,6 +290,10 @@ public: bool isRegClass(unsigned RCID) const; + bool isSCSrcB16() const { + return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i16); + } + bool isSCSrcB32() const { return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::i32); } @@ -290,6 +302,10 @@ public: return isRegClass(AMDGPU::SReg_64RegClassID) || isInlinableImm(MVT::i64); } + bool isSCSrcF16() const { + return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f16); + } + bool isSCSrcF32() const { return isRegClass(AMDGPU::SReg_32RegClassID) || isInlinableImm(MVT::f32); } @@ -302,6 +318,10 @@ public: return isSCSrcB32() || isLiteralImm(MVT::i32) || isExpr(); } + bool isSSrcB16() const { + return isSCSrcB16() || isLiteralImm(MVT::i16); + } + bool isSSrcB64() const { // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits. // See isVSrc64(). @@ -316,6 +336,10 @@ public: return isSCSrcB64() || isLiteralImm(MVT::f64); } + bool isSSrcF16() const { + return isSCSrcB16() || isLiteralImm(MVT::f16); + } + bool isVCSrcB32() const { return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i32); } @@ -324,6 +348,10 @@ public: return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::i64); } + bool isVCSrcB16() const { + return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::i16); + } + bool isVCSrcF32() const { return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f32); } @@ -332,6 +360,10 @@ public: return isRegClass(AMDGPU::VS_64RegClassID) || isInlinableImm(MVT::f64); } + bool isVCSrcF16() const { + return isRegClass(AMDGPU::VS_32RegClassID) || isInlinableImm(MVT::f16); + } + bool isVSrcB32() const { return isVCSrcF32() || isLiteralImm(MVT::i32); } @@ -340,6 +372,10 @@ public: return isVCSrcF64() || isLiteralImm(MVT::i64); } + bool isVSrcB16() const { + return isVCSrcF16() || isLiteralImm(MVT::i16); + } + bool isVSrcF32() const { return isVCSrcF32() || isLiteralImm(MVT::f32); } @@ -348,10 +384,18 @@ public: return isVCSrcF64() || isLiteralImm(MVT::f64); } + bool isVSrcF16() const { + return isVCSrcF16() || isLiteralImm(MVT::f16); + } + bool isKImmFP32() const { return isLiteralImm(MVT::f32); } + bool isKImmFP16() const { + return isLiteralImm(MVT::f16); + } + bool isMem() const override { return false; } @@ -439,7 +483,16 @@ public: void addLiteralImmOperand(MCInst &Inst, int64_t Val) const; - void addKImmFP32Operands(MCInst &Inst, unsigned N) const; + template + void addKImmFPOperands(MCInst &Inst, unsigned N) const; + + void addKImmFP16Operands(MCInst &Inst, unsigned N) const { + addKImmFPOperands<16>(Inst, N); + } + + void addKImmFP32Operands(MCInst &Inst, unsigned N) const { + addKImmFPOperands<32>(Inst, N); + } void addRegOperands(MCInst &Inst, unsigned N) const; @@ -826,19 +879,23 @@ struct OptionalOperand { } // end anonymous namespace // May be called with integer type with equivalent bitwidth. 
-static const fltSemantics *getFltSemantics(MVT VT) { - switch (VT.getSizeInBits()) { - case 32: +static const fltSemantics *getFltSemantics(unsigned Size) { + switch (Size) { + case 4: return &APFloat::IEEEsingle; - case 64: + case 8: return &APFloat::IEEEdouble; - case 16: + case 2: return &APFloat::IEEEhalf; default: llvm_unreachable("unsupported fp type"); } } +static const fltSemantics *getFltSemantics(MVT VT) { + return getFltSemantics(VT.getSizeInBits() / 8); +} + //===----------------------------------------------------------------------===// // Operand //===----------------------------------------------------------------------===// @@ -895,6 +952,12 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { AsmParser->hasInv2PiInlineImm()); } + if (type.getScalarSizeInBits() == 16) { + return AMDGPU::isInlinableLiteral16( + static_cast(Literal.getLoBits(16).getSExtValue()), + AsmParser->hasInv2PiInlineImm()); + } + return AMDGPU::isInlinableLiteral32( static_cast(Literal.getLoBits(32).getZExtValue()), AsmParser->hasInv2PiInlineImm()); @@ -909,9 +972,13 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const { if (!Imm.IsFPImm) { // We got int literal token. + unsigned Size = type.getSizeInBits(); + if (Size == 64) + Size = 32; + // FIXME: 64-bit operands can zero extend, sign extend, or pad zeroes for FP // types. - return isUInt<32>(Imm.Val) || isInt<32>(Imm.Val); + return isUIntN(Size, Imm.Val) || isIntN(Size, Imm.Val); } // We got fp literal token @@ -947,7 +1014,8 @@ void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers } } - if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()), Inst.getNumOperands())) { + if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()), + Inst.getNumOperands())) { addLiteralImmOperand(Inst, Val); } else { Inst.addOperand(MCOperand::createImm(Val)); @@ -960,69 +1028,112 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const { // Check that this operand accepts literals assert(AMDGPU::isSISrcOperand(InstDesc, OpNum)); - APInt Literal(64, Val); - auto OpSize = AMDGPU::getRegOperandSize(AsmParser->getMRI(), InstDesc, OpNum); // expected operand size + auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size if (Imm.IsFPImm) { // We got fp literal token - if (OpSize == 8) { // Expected 64-bit operand - // Check if literal is inlinable + APInt Literal(64, Val); + + switch (OpSize) { + case 8: { if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(), AsmParser->hasInv2PiInlineImm())) { Inst.addOperand(MCOperand::createImm(Literal.getZExtValue())); - } else if (AMDGPU::isSISrcFPOperand(InstDesc, OpNum)) { // Expected 64-bit fp operand + return; + } + + // Non-inlineable + if (AMDGPU::isSISrcFPOperand(InstDesc, OpNum)) { // Expected 64-bit fp operand // For fp operands we check if low 32 bits are zeros if (Literal.getLoBits(32) != 0) { const_cast(AsmParser)->Warning(Inst.getLoc(), - "Can't encode literal as exact 64-bit" - " floating-point operand. Low 32-bits will be" - " set to zero"); + "Can't encode literal as exact 64-bit floating-point operand. " + "Low 32-bits will be set to zero"); } + Inst.addOperand(MCOperand::createImm(Literal.lshr(32).getZExtValue())); - } else { - // We don't allow fp literals in 64-bit integer instructions. It is - // unclear how we should encode them. 
This case should be checked earlier - // in predicate methods (isLiteralImm()) - llvm_unreachable("fp literal in 64-bit integer instruction."); + return; } - } else { // Expected 32-bit operand + + // We don't allow fp literals in 64-bit integer instructions. It is + // unclear how we should encode them. This case should be checked earlier + // in predicate methods (isLiteralImm()) + llvm_unreachable("fp literal in 64-bit integer instruction."); + } + case 4: + case 2: { bool lost; APFloat FPLiteral(APFloat::IEEEdouble, Literal); // Convert literal to single precision - FPLiteral.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &lost); + FPLiteral.convert(*getFltSemantics(OpSize), + APFloat::rmNearestTiesToEven, &lost); // We allow precision lost but not overflow or underflow. This should be // checked earlier in isLiteralImm() Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue())); + return; } - } else { // We got int literal token - if (OpSize == 8) { // Expected 64-bit operand - auto LiteralVal = Literal.getZExtValue(); - if (AMDGPU::isInlinableLiteral64(LiteralVal, - AsmParser->hasInv2PiInlineImm())) { - Inst.addOperand(MCOperand::createImm(LiteralVal)); - return; - } - } else { // Expected 32-bit operand - auto LiteralVal = static_cast(Literal.getLoBits(32).getZExtValue()); - if (AMDGPU::isInlinableLiteral32(LiteralVal, - AsmParser->hasInv2PiInlineImm())) { - Inst.addOperand(MCOperand::createImm(LiteralVal)); - return; - } + default: + llvm_unreachable("invalid operand size"); } - Inst.addOperand(MCOperand::createImm(Literal.getLoBits(32).getZExtValue())); + + return; + } + + // We got int literal token. + // Only sign extend inline immediates. + // FIXME: No errors on truncation + switch (OpSize) { + case 4: { + if (isInt<32>(Val) && + AMDGPU::isInlinableLiteral32(static_cast(Val), + AsmParser->hasInv2PiInlineImm())) { + Inst.addOperand(MCOperand::createImm(Val)); + return; + } + + Inst.addOperand(MCOperand::createImm(Val & 0xffffffff)); + return; + } + case 8: { + if (AMDGPU::isInlinableLiteral64(Val, + AsmParser->hasInv2PiInlineImm())) { + Inst.addOperand(MCOperand::createImm(Val)); + return; + } + + Inst.addOperand(MCOperand::createImm(Lo_32(Val))); + return; + } + case 2: { + if (isInt<16>(Val) && + AMDGPU::isInlinableLiteral16(static_cast(Val), + AsmParser->hasInv2PiInlineImm())) { + Inst.addOperand(MCOperand::createImm(Val)); + return; + } + + Inst.addOperand(MCOperand::createImm(Val & 0xffff)); + return; + } + default: + llvm_unreachable("invalid operand size"); } } -void AMDGPUOperand::addKImmFP32Operands(MCInst &Inst, unsigned N) const { +template +void AMDGPUOperand::addKImmFPOperands(MCInst &Inst, unsigned N) const { APInt Literal(64, Imm.Val); - if (Imm.IsFPImm) { // We got fp literal - bool lost; - APFloat FPLiteral(APFloat::IEEEdouble, Literal); - FPLiteral.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &lost); - Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue())); - } else { // We got int literal token - Inst.addOperand(MCOperand::createImm(Literal.getLoBits(32).getZExtValue())); + + if (!Imm.IsFPImm) { + // We got int literal token. 
+ Inst.addOperand(MCOperand::createImm(Literal.getLoBits(Bitwidth).getZExtValue())); + return; } + + bool Lost; + APFloat FPLiteral(APFloat::IEEEdouble, Literal); + FPLiteral.convert(*getFltSemantics(Bitwidth / 8), + APFloat::rmNearestTiesToEven, &Lost); + Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue())); } void AMDGPUOperand::addRegOperands(MCInst &Inst, unsigned N) const { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 1a8c04b150f5..2247cad7bb51 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -88,6 +88,15 @@ DECODE_OPERAND(SReg_128) DECODE_OPERAND(SReg_256) DECODE_OPERAND(SReg_512) + +static DecodeStatus decodeOperand_VSrc16(MCInst &Inst, + unsigned Imm, + uint64_t Addr, + const void *Decoder) { + auto DAsm = static_cast(Decoder); + return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm)); +} + #define GET_SUBTARGETINFO_ENUM #include "AMDGPUGenSubtargetInfo.inc" #undef GET_SUBTARGETINFO_ENUM @@ -250,6 +259,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const { + return decodeSrcOp(OPW16, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const { // Some instructions have operand restrictions beyond what the encoding // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra @@ -324,28 +337,96 @@ MCOperand AMDGPUDisassembler::decodeIntImmed(unsigned Imm) { // Cast prevents negative overflow. } -MCOperand AMDGPUDisassembler::decodeFPImmed(bool Is32, unsigned Imm) { +static int64_t getInlineImmVal32(unsigned Imm) { + switch (Imm) { + case 240: + return FloatToBits(0.5f); + case 241: + return FloatToBits(-0.5f); + case 242: + return FloatToBits(1.0f); + case 243: + return FloatToBits(-1.0f); + case 244: + return FloatToBits(2.0f); + case 245: + return FloatToBits(-2.0f); + case 246: + return FloatToBits(4.0f); + case 247: + return FloatToBits(-4.0f); + case 248: // 1 / (2 * PI) + return 0x3e22f983; + default: + llvm_unreachable("invalid fp inline imm"); + } +} + +static int64_t getInlineImmVal64(unsigned Imm) { + switch (Imm) { + case 240: + return DoubleToBits(0.5); + case 241: + return DoubleToBits(-0.5); + case 242: + return DoubleToBits(1.0); + case 243: + return DoubleToBits(-1.0); + case 244: + return DoubleToBits(2.0); + case 245: + return DoubleToBits(-2.0); + case 246: + return DoubleToBits(4.0); + case 247: + return DoubleToBits(-4.0); + case 248: // 1 / (2 * PI) + return 0x3fc45f306dc9c882; + default: + llvm_unreachable("invalid fp inline imm"); + } +} + +static int64_t getInlineImmVal16(unsigned Imm) { + switch (Imm) { + case 240: + return 0x3800; + case 241: + return 0xB800; + case 242: + return 0x3C00; + case 243: + return 0xBC00; + case 244: + return 0x4000; + case 245: + return 0xC000; + case 246: + return 0x4400; + case 247: + return 0xC400; + case 248: // 1 / (2 * PI) + return 0x3118; + default: + llvm_unreachable("invalid fp inline imm"); + } +} + +MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) { assert(Imm >= AMDGPU::EncValues::INLINE_FLOATING_C_MIN && Imm <= AMDGPU::EncValues::INLINE_FLOATING_C_MAX); + // ToDo: case 248: 1/(2*PI) - is allowed only on VI - // ToDo: AMDGPUInstPrinter does not support 1/(2*PI). It consider 1/(2*PI) as - // literal constant. 
- float V = 0.0f; - switch (Imm) { - case 240: V = 0.5f; break; - case 241: V = -0.5f; break; - case 242: V = 1.0f; break; - case 243: V = -1.0f; break; - case 244: V = 2.0f; break; - case 245: V = -2.0f; break; - case 246: V = 4.0f; break; - case 247: V = -4.0f; break; - case 248: return MCOperand::createImm(Is32 ? // 1/(2*PI) - 0x3e22f983 : - 0x3fc45f306dc9c882); - default: break; + switch (Width) { + case OPW32: + return MCOperand::createImm(getInlineImmVal32(Imm)); + case OPW64: + return MCOperand::createImm(getInlineImmVal64(Imm)); + case OPW16: + return MCOperand::createImm(getInlineImmVal16(Imm)); + default: + llvm_unreachable("implement me"); } - return MCOperand::createImm(Is32? FloatToBits(V) : DoubleToBits(V)); } unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { @@ -353,7 +434,9 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const { assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); switch (Width) { default: // fall - case OPW32: return VGPR_32RegClassID; + case OPW32: + case OPW16: + return VGPR_32RegClassID; case OPW64: return VReg_64RegClassID; case OPW128: return VReg_128RegClassID; } @@ -364,7 +447,9 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const { assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); switch (Width) { default: // fall - case OPW32: return SGPR_32RegClassID; + case OPW32: + case OPW16: + return SGPR_32RegClassID; case OPW64: return SGPR_64RegClassID; case OPW128: return SGPR_128RegClassID; } @@ -375,7 +460,9 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const { assert(OPW_FIRST_ <= Width && Width < OPW_LAST_); switch (Width) { default: // fall - case OPW32: return TTMP_32RegClassID; + case OPW32: + case OPW16: + return TTMP_32RegClassID; case OPW64: return TTMP_64RegClassID; case OPW128: return TTMP_128RegClassID; } @@ -396,19 +483,26 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c return createSRegOperand(getTtmpClassId(Width), Val - TTMP_MIN); } - assert(Width == OPW32 || Width == OPW64); - const bool Is32 = (Width == OPW32); + assert(Width == OPW16 || Width == OPW32 || Width == OPW64); if (INLINE_INTEGER_C_MIN <= Val && Val <= INLINE_INTEGER_C_MAX) return decodeIntImmed(Val); if (INLINE_FLOATING_C_MIN <= Val && Val <= INLINE_FLOATING_C_MAX) - return decodeFPImmed(Is32, Val); + return decodeFPImmed(Width, Val); if (Val == LITERAL_CONST) return decodeLiteralConstant(); - return Is32 ? 
decodeSpecialReg32(Val) : decodeSpecialReg64(Val); + switch (Width) { + case OPW32: + case OPW16: + return decodeSpecialReg32(Val); + case OPW64: + return decodeSpecialReg64(Val); + default: + llvm_unreachable("unexpected immediate type"); + } } MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const { diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index c8b2f1ff239a..ee5883a984e0 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -66,6 +66,7 @@ public: MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; + MCOperand decodeOperand_VSrc16(unsigned Val) const; MCOperand decodeOperand_VReg_64(unsigned Val) const; MCOperand decodeOperand_VReg_96(unsigned Val) const; @@ -83,6 +84,7 @@ public: OPW32, OPW64, OPW128, + OPW16, OPW_LAST_, OPW_FIRST_ = OPW32 }; @@ -92,7 +94,7 @@ public: unsigned getTtmpClassId(const OpWidthTy Width) const; static MCOperand decodeIntImmed(unsigned Imm); - static MCOperand decodeFPImmed(bool Is32, unsigned Imm); + static MCOperand decodeFPImmed(OpWidthTy Width, unsigned Imm); MCOperand decodeLiteralConstant() const; MCOperand decodeSrcOp(const OpWidthTy Width, unsigned Val) const; diff --git a/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp index f95d790e2bc8..b84aaaef0905 100644 --- a/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp @@ -47,7 +47,13 @@ void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo, void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff); + // It's possible to end up with a 32-bit literal used with a 16-bit operand + // with ignored high bits. Print as 32-bit anyway in that case. 
+ int64_t Imm = MI->getOperand(OpNo).getImm(); + if (isInt<16>(Imm) || isUInt<16>(Imm)) + O << formatHex(static_cast(Imm & 0xffff)); + else + printU32ImmOperand(MI, OpNo, STI, O); } void AMDGPUInstPrinter::printU4ImmDecOperand(const MCInst *MI, unsigned OpNo, @@ -336,6 +342,38 @@ void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo, printOperand(MI, OpNo, STI, O); } +void AMDGPUInstPrinter::printImmediate16(uint32_t Imm, + const MCSubtargetInfo &STI, + raw_ostream &O) { + int16_t SImm = static_cast(Imm); + if (SImm >= -16 && SImm <= 64) { + O << SImm; + return; + } + + if (Imm == 0x3C00) + O<< "1.0"; + else if (Imm == 0xBC00) + O<< "-1.0"; + else if (Imm == 0x3800) + O<< "0.5"; + else if (Imm == 0xB800) + O<< "-0.5"; + else if (Imm == 0x4000) + O<< "2.0"; + else if (Imm == 0xC000) + O<< "-2.0"; + else if (Imm == 0x4400) + O<< "4.0"; + else if (Imm == 0xC400) + O<< "-4.0"; + else if (Imm == 0x3118) { + assert(STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]); + O << "0.15915494"; + } else + O << formatHex(static_cast(Imm)); +} + void AMDGPUInstPrinter::printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O) { @@ -431,22 +469,39 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, } } else if (Op.isImm()) { const MCInstrDesc &Desc = MII.get(MI->getOpcode()); - int RCID = Desc.OpInfo[OpNo].RegClass; - if (RCID != -1) { - unsigned RCBits = AMDGPU::getRegBitWidth(MRI.getRegClass(RCID)); - if (RCBits == 32) - printImmediate32(Op.getImm(), STI, O); - else if (RCBits == 64) - printImmediate64(Op.getImm(), STI, O); - else - llvm_unreachable("Invalid register class size"); - } else if (Desc.OpInfo[OpNo].OperandType == MCOI::OPERAND_IMMEDIATE) { + switch (Desc.OpInfo[OpNo].OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case MCOI::OPERAND_IMMEDIATE: printImmediate32(Op.getImm(), STI, O); - } else { + break; + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + printImmediate64(Op.getImm(), STI, O); + break; + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + printImmediate16(Op.getImm(), STI, O); + break; + case MCOI::OPERAND_UNKNOWN: + case MCOI::OPERAND_PCREL: + O << formatDec(Op.getImm()); + break; + case MCOI::OPERAND_REGISTER: + // FIXME: This should be removed and handled somewhere else. Seems to come + // from a disassembler bug. + O << "/*invalid immediate*/"; + break; + default: // We hit this for the immediate instruction bits that don't yet have a // custom printer. - // TODO: Eventually this should be unnecessary. - O << formatDec(Op.getImm()); + llvm_unreachable("unexpected immediate operand type"); } } else if (Op.isFPImm()) { // We special case 0.0 because otherwise it will be printed as an integer. 
diff --git a/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h index 9d6a203426ad..f2ed0e09bbff 100644 --- a/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h @@ -88,6 +88,8 @@ private: void printRegOperand(unsigned RegNo, raw_ostream &O); void printVOPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI, + raw_ostream &O); void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI, raw_ostream &O); void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI, diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp index e60ead8acdc0..4a046acfabbc 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp @@ -39,7 +39,7 @@ class SIMCCodeEmitter : public AMDGPUMCCodeEmitter { const MCRegisterInfo &MRI; /// \brief Encode an fp or int literal - uint32_t getLitEncoding(const MCOperand &MO, unsigned OpSize, + uint32_t getLitEncoding(const MCOperand &MO, const MCOperandInfo &OpInfo, const MCSubtargetInfo &STI) const; public: @@ -87,6 +87,42 @@ static uint32_t getIntInlineImmEncoding(IntTy Imm) { return 0; } +static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) { + uint16_t IntImm = getIntInlineImmEncoding(static_cast(Val)); + if (IntImm != 0) + return IntImm; + + if (Val == 0x3800) // 0.5 + return 240; + + if (Val == 0xB800) // -0.5 + return 241; + + if (Val == 0x3C00) // 1.0 + return 242; + + if (Val == 0xBC00) // -1.0 + return 243; + + if (Val == 0x4000) // 2.0 + return 244; + + if (Val == 0xC000) // -2.0 + return 245; + + if (Val == 0x4400) // 4.0 + return 246; + + if (Val == 0xC400) // -4.0 + return 247; + + if (Val == 0x3118 && // 1.0 / (2.0 * pi) + STI.getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm]) + return 248; + + return 255; +} + static uint32_t getLit32Encoding(uint32_t Val, const MCSubtargetInfo &STI) { uint32_t IntImm = getIntInlineImmEncoding(static_cast(Val)); if (IntImm != 0) @@ -160,7 +196,7 @@ static uint32_t getLit64Encoding(uint64_t Val, const MCSubtargetInfo &STI) { } uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, - unsigned OpSize, + const MCOperandInfo &OpInfo, const MCSubtargetInfo &STI) const { int64_t Imm; @@ -180,12 +216,16 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO, Imm = MO.getImm(); } - if (OpSize == 4) + switch (AMDGPU::getOperandSize(OpInfo)) { + case 4: return getLit32Encoding(static_cast(Imm), STI); - - assert(OpSize == 8); - - return getLit64Encoding(static_cast(Imm), STI); + case 8: + return getLit64Encoding(static_cast(Imm), STI); + case 2: + return getLit16Encoding(static_cast(Imm), STI); + default: + llvm_unreachable("invalid operand size"); + } } void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, @@ -212,12 +252,9 @@ void SIMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS, if (!AMDGPU::isSISrcOperand(Desc, i)) continue; - int RCID = Desc.OpInfo[i].RegClass; - const MCRegisterClass &RC = MRI.getRegClass(RCID); - // Is this operand a literal immediate? const MCOperand &Op = MI.getOperand(i); - if (getLitEncoding(Op, AMDGPU::getRegBitWidth(RC) / 8, STI) != 255) + if (getLitEncoding(Op, Desc.OpInfo[i], STI) != 255) continue; // Yes! 
Encode it @@ -282,9 +319,7 @@ uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCInstrDesc &Desc = MCII.get(MI.getOpcode()); if (AMDGPU::isSISrcOperand(Desc, OpNo)) { - uint32_t Enc = getLitEncoding(MO, - AMDGPU::getRegOperandSize(&MRI, Desc, OpNo), - STI); + uint32_t Enc = getLitEncoding(MO, Desc.OpInfo[OpNo], STI); if (Enc != ~0U && (Enc != 255 || Desc.getSize() == 4)) return Enc; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 6bb31a9a781b..ff4e32147184 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -88,17 +88,36 @@ enum ClassFlags { namespace AMDGPU { enum OperandType { /// Operands with register or 32-bit immediate - OPERAND_REG_IMM32_INT = MCOI::OPERAND_FIRST_TARGET, - OPERAND_REG_IMM32_FP, + OPERAND_REG_IMM_INT32 = MCOI::OPERAND_FIRST_TARGET, + OPERAND_REG_IMM_INT64, + OPERAND_REG_IMM_INT16, + OPERAND_REG_IMM_FP32, + OPERAND_REG_IMM_FP64, + OPERAND_REG_IMM_FP16, + /// Operands with register or inline constant - OPERAND_REG_INLINE_C_INT, - OPERAND_REG_INLINE_C_FP, + OPERAND_REG_INLINE_C_INT16, + OPERAND_REG_INLINE_C_INT32, + OPERAND_REG_INLINE_C_INT64, + OPERAND_REG_INLINE_C_FP16, + OPERAND_REG_INLINE_C_FP32, + OPERAND_REG_INLINE_C_FP64, + + OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32, + OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16, + + OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16, + OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64, + + OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32, + OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST, // Operand for source modifiers for VOP instructions OPERAND_INPUT_MODS, /// Operand with 32-bit immediate that uses the constant bus. - OPERAND_KIMM32 + OPERAND_KIMM32, + OPERAND_KIMM16 }; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 3d59f8d82ae7..831ac5948a68 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -315,12 +315,14 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, return; } - APInt Imm(64, OpToFold.getImm()); const MCInstrDesc &FoldDesc = OpToFold.getParent()->getDesc(); const TargetRegisterClass *FoldRC = TRI.getRegClass(FoldDesc.OpInfo[0].RegClass); + APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType), + OpToFold.getImm()); + // Split 64-bit constants into 32-bits for folding. 
if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) { unsigned UseReg = UseOp.getReg(); @@ -329,6 +331,8 @@ static void foldOperand(MachineOperand &OpToFold, MachineInstr *UseMI, MRI.getRegClass(UseReg) : TRI.getPhysRegClass(UseReg); + assert(Imm.getBitWidth() == 64); + if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64) return; @@ -505,7 +509,6 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { if (!isSafeToFold(MI)) continue; - unsigned OpSize = TII->getOpSize(MI, 1); MachineOperand &OpToFold = MI.getOperand(1); bool FoldingImm = OpToFold.isImm() || OpToFold.isFI(); @@ -559,14 +562,15 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) { Use = MRI.use_begin(Dst.getReg()), E = MRI.use_end(); Use != E; ++Use) { MachineInstr *UseMI = Use->getParent(); + unsigned OpNo = Use.getOperandNo(); - if (TII->isInlineConstant(OpToFold, OpSize)) { - foldOperand(OpToFold, UseMI, Use.getOperandNo(), FoldList, + if (TII->isInlineConstant(*UseMI, OpNo, OpToFold)) { + foldOperand(OpToFold, UseMI, OpNo, FoldList, CopiesToReplace, TII, TRI, MRI); } else { if (++NumLiteralUses == 1) { NonInlineUse = &*Use; - NonInlineUseOpNo = Use.getOperandNo(); + NonInlineUseOpNo = OpNo; } } } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 9071ded65671..981b63c59a83 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -1415,10 +1415,12 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, // If this is a free constant, there's no reason to do this. // TODO: We could fold this here instead of letting SIFoldOperands do it // later. - if (isInlineConstant(ImmOp, 4)) + MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); + + // Any src operand can be used for the legality check. + if (isInlineConstant(UseMI, *Src0, ImmOp)) return false; - MachineOperand *Src0 = getNamedOperand(UseMI, AMDGPU::OpName::src0); MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); @@ -1620,8 +1622,10 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, case AMDGPU::V_MAC_F16_e32: IsF16 = true; case AMDGPU::V_MAC_F32_e32: { - const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0); - if (Src0->isImm() && !isInlineConstant(*Src0, 4)) + int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::src0); + const MachineOperand *Src0 = &MI.getOperand(Src0Idx); + if (Src0->isImm() && !isInlineConstant(MI, Src0Idx, *Src0)) return nullptr; break; } @@ -1682,46 +1686,55 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const { case 64: return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(), ST.hasInv2PiInlineImm()); + case 16: + return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(), + ST.hasInv2PiInlineImm()); default: llvm_unreachable("invalid bitwidth"); } } bool SIInstrInfo::isInlineConstant(const MachineOperand &MO, - unsigned OpSize) const { - if (MO.isImm()) { - // MachineOperand provides no way to tell the true operand size, since it - // only records a 64-bit value. We need to know the size to determine if a - // 32-bit floating point immediate bit pattern is legal for an integer - // immediate. It would be for any 32-bit integer operand, but would not be - // for a 64-bit one. 
- switch (OpSize) { - case 4: - return AMDGPU::isInlinableLiteral32(static_cast(MO.getImm()), - ST.hasInv2PiInlineImm()); - case 8: - return AMDGPU::isInlinableLiteral64(MO.getImm(), - ST.hasInv2PiInlineImm()); - default: - llvm_unreachable("invalid bitwidth"); - } + uint8_t OperandType) const { + if (!MO.isImm() || OperandType < MCOI::OPERAND_FIRST_TARGET) + return false; + + // MachineOperand provides no way to tell the true operand size, since it only + // records a 64-bit value. We need to know the size to determine if a 32-bit + // floating point immediate bit pattern is legal for an integer immediate. It + // would be for any 32-bit integer operand, but would not be for a 64-bit one. + + int64_t Imm = MO.getImm(); + switch (operandBitWidth(OperandType)) { + case 32: { + int32_t Trunc = static_cast(Imm); + return Trunc == Imm && + AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm()); } + case 64: { + return AMDGPU::isInlinableLiteral64(MO.getImm(), + ST.hasInv2PiInlineImm()); + } + case 16: { + if (isInt<16>(Imm) || isUInt<16>(Imm)) { + int16_t Trunc = static_cast(Imm); + return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm()); + } - return false; -} - -bool SIInstrInfo::isLiteralConstant(const MachineOperand &MO, - unsigned OpSize) const { - return MO.isImm() && !isInlineConstant(MO, OpSize); + return false; + } + default: + llvm_unreachable("invalid bitwidth"); + } } bool SIInstrInfo::isLiteralConstantLike(const MachineOperand &MO, - unsigned OpSize) const { + const MCOperandInfo &OpInfo) const { switch (MO.getType()) { case MachineOperand::MO_Register: return false; case MachineOperand::MO_Immediate: - return !isInlineConstant(MO, OpSize); + return !isInlineConstant(MO, OpInfo); case MachineOperand::MO_FrameIndex: case MachineOperand::MO_MachineBasicBlock: case MachineOperand::MO_ExternalSymbol: @@ -1760,11 +1773,10 @@ bool SIInstrInfo::isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, if (OpInfo.RegClass < 0) return false; - unsigned OpSize = RI.getRegClass(OpInfo.RegClass)->getSize(); - if (isLiteralConstant(MO, OpSize)) - return RI.opCanUseLiteralConstant(OpInfo.OperandType); + if (MO.isImm() && isInlineConstant(MO, OpInfo)) + return RI.opCanUseInlineConstant(OpInfo.OperandType); - return RI.opCanUseInlineConstant(OpInfo.OperandType); + return RI.opCanUseLiteralConstant(OpInfo.OperandType); } bool SIInstrInfo::hasVALU32BitEncoding(unsigned Opcode) const { @@ -1791,12 +1803,17 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI, bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, - unsigned OpSize) const { + const MCOperandInfo &OpInfo) const { // Literal constants use the constant bus. 
- if (isLiteralConstant(MO, OpSize)) - return true; + //if (isLiteralConstantLike(MO, OpInfo)) + // return true; + if (MO.isImm()) + return !isInlineConstant(MO, OpInfo); - if (!MO.isReg() || !MO.isUse()) + if (!MO.isReg()) + return true; // Misc other operands like FrameIndex + + if (!MO.isUse()) return false; if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) @@ -1925,17 +1942,22 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } break; - case AMDGPU::OPERAND_REG_IMM32_INT: - case AMDGPU::OPERAND_REG_IMM32_FP: + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: break; - case AMDGPU::OPERAND_REG_INLINE_C_INT: - case AMDGPU::OPERAND_REG_INLINE_C_FP: - if (isLiteralConstant(MI.getOperand(i), - RI.getRegClass(RegClass)->getSize())) { + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: { + const MachineOperand &MO = MI.getOperand(i); + if (!MO.isReg() && (!MO.isImm() || !isInlineConstant(MI, i))) { ErrInfo = "Illegal immediate value for operand."; return false; } break; + } case MCOI::OPERAND_IMMEDIATE: case AMDGPU::OPERAND_KIMM32: // Check if this operand is an immediate. @@ -1987,7 +2009,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI, if (OpIdx == -1) break; const MachineOperand &MO = MI.getOperand(OpIdx); - if (usesConstantBus(MRI, MO, getOpSize(Opcode, OpIdx))) { + if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { if (MO.isReg()) { if (MO.getReg() != SGPRUsed) ++ConstantBusCount; @@ -2330,7 +2352,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, if (!MO) MO = &MI.getOperand(OpIdx); - if (isVALU(MI) && usesConstantBus(MRI, *MO, DefinedRC->getSize())) { + if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { RegSubRegPair SGPRUsed; if (MO->isReg()) @@ -2342,7 +2364,7 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, const MachineOperand &Op = MI.getOperand(i); if (Op.isReg()) { if ((Op.getReg() != SGPRUsed.Reg || Op.getSubReg() != SGPRUsed.SubReg) && - usesConstantBus(MRI, Op, getOpSize(MI, i))) { + usesConstantBus(MRI, Op, InstDesc.OpInfo[i])) { return false; } } else if (InstDesc.OpInfo[i].OperandType == AMDGPU::OPERAND_KIMM32) { @@ -3539,14 +3561,14 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { if (Src0Idx == -1) return 4; // No operands. 
- if (isLiteralConstantLike(MI.getOperand(Src0Idx), getOpSize(MI, Src0Idx))) + if (isLiteralConstantLike(MI.getOperand(Src0Idx), Desc.OpInfo[Src0Idx])) return 8; int Src1Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1); if (Src1Idx == -1) return 4; - if (isLiteralConstantLike(MI.getOperand(Src1Idx), getOpSize(MI, Src1Idx))) + if (isLiteralConstantLike(MI.getOperand(Src1Idx), Desc.OpInfo[Src1Idx])) return 8; return 4; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h index 0f16fa0902f7..81d0ef42234b 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -462,15 +462,96 @@ public: return !RI.isSGPRReg(MRI, Dest); } + static int operandBitWidth(uint8_t OperandType) { + switch (OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + return 32; + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + return 64; + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + return 16; + default: + llvm_unreachable("unexpected operand type"); + } + } + bool isInlineConstant(const APInt &Imm) const; - bool isInlineConstant(const MachineOperand &MO, unsigned OpSize) const; - bool isLiteralConstant(const MachineOperand &MO, unsigned OpSize) const; + + bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const; + + bool isInlineConstant(const MachineOperand &MO, + const MCOperandInfo &OpInfo) const { + return isInlineConstant(MO, OpInfo.OperandType); + } + + /// \p returns true if \p UseMO is substituted with \p DefMO in \p MI it would + /// be an inline immediate. + bool isInlineConstant(const MachineInstr &MI, + const MachineOperand &UseMO, + const MachineOperand &DefMO) const { + assert(UseMO.getParent() == &MI); + int OpIdx = MI.getOperandNo(&UseMO); + if (!MI.getDesc().OpInfo || OpIdx > MI.getDesc().NumOperands) { + return false; + } + + return isInlineConstant(DefMO, MI.getDesc().OpInfo[OpIdx]); + } + + /// \p returns true if the operand \p OpIdx in \p MI is a valid inline + /// immediate. + bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx) const { + const MachineOperand &MO = MI.getOperand(OpIdx); + return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType); + } + + bool isInlineConstant(const MachineInstr &MI, unsigned OpIdx, + const MachineOperand &MO) const { + if (!MI.getDesc().OpInfo || OpIdx > MI.getDesc().NumOperands) + return false; + + if (MI.isCopy()) { + unsigned Size = getOpSize(MI, OpIdx); + assert(Size == 8 || Size == 4); + + uint8_t OpType = (Size == 8) ? 
+ AMDGPU::OPERAND_REG_IMM_INT64 : AMDGPU::OPERAND_REG_IMM_INT32; + return isInlineConstant(MO, OpType); + } + + return isInlineConstant(MO, MI.getDesc().OpInfo[OpIdx].OperandType); + } + + bool isInlineConstant(const MachineOperand &MO) const { + const MachineInstr *Parent = MO.getParent(); + return isInlineConstant(*Parent, Parent->getOperandNo(&MO)); + } + + bool isLiteralConstant(const MachineOperand &MO, + const MCOperandInfo &OpInfo) const { + return MO.isImm() && !isInlineConstant(MO, OpInfo.OperandType); + } + + bool isLiteralConstant(const MachineInstr &MI, int OpIdx) const { + const MachineOperand &MO = MI.getOperand(OpIdx); + return MO.isImm() && !isInlineConstant(MI, OpIdx); + } // Returns true if this operand could potentially require a 32-bit literal // operand, but not necessarily. A FrameIndex for example could resolve to an // inline immediate value that will not require an additional 4-bytes; this // assumes that it will. - bool isLiteralConstantLike(const MachineOperand &MO, unsigned OpSize) const; + bool isLiteralConstantLike(const MachineOperand &MO, + const MCOperandInfo &OpInfo) const; bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo, const MachineOperand &MO) const; @@ -482,7 +563,7 @@ public: /// \brief Returns true if this operand uses the constant bus. bool usesConstantBus(const MachineRegisterInfo &MRI, const MachineOperand &MO, - unsigned OpSize) const; + const MCOperandInfo &OpInfo) const; /// \brief Return true if this instruction has any modifiers. /// e.g. src[012]_mod, omod, clamp. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index aeef7acdfefc..9f7c921c5654 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -445,21 +445,29 @@ def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> { } // End OperandType = "OPERAND_IMMEDIATE" +class KImmMatchClass : AsmOperandClass { + let Name = "KImmFP"#size; + let PredicateMethod = "isKImmFP"#size; + let ParserMethod = "parseImm"; + let RenderMethod = "addKImmFP"#size#"Operands"; +} + +class kimmOperand : Operand { + let OperandNamespace = "AMDGPU"; + let OperandType = "OPERAND_KIMM"#vt.Size; + let PrintMethod = "printU"#vt.Size#"ImmOperand"; + let ParserMatchClass = !cast("KImmFP"#vt.Size#"MatchClass"); +} // 32-bit VALU immediate operand that uses the constant bus. -def KImmFP32MatchClass : AsmOperandClass { - let Name = "KImmFP32"; - let PredicateMethod = "isKImmFP32"; - let ParserMethod = "parseImm"; - let RenderMethod = "addKImmFP32Operands"; -} +def KImmFP32MatchClass : KImmMatchClass<32>; +def f32kimm : kimmOperand; + +// 32-bit VALU immediate operand with a 16-bit value that uses the +// constant bus. 
+def KImmFP16MatchClass : KImmMatchClass<16>; +def f16kimm : kimmOperand; -def f32kimm : Operand { - let OperandNamespace = "AMDGPU"; - let OperandType = "OPERAND_KIMM32"; - let PrintMethod = "printU32ImmOperand"; - let ParserMatchClass = KImmFP32MatchClass; -} def VOPDstS64 : VOPDstOperand ; @@ -468,6 +476,7 @@ class FPInputModsMatchClass : AsmOperandClass { let ParserMethod = "parseRegOrImmWithFPInputMods"; let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods"; } +def FP16InputModsMatchClass : FPInputModsMatchClass<16>; def FP32InputModsMatchClass : FPInputModsMatchClass<32>; def FP64InputModsMatchClass : FPInputModsMatchClass<64>; @@ -480,6 +489,8 @@ class InputMods : Operand { class FPInputMods : InputMods { let PrintMethod = "printOperandAndFPInputMods"; } + +def FP16InputMods : FPInputMods; def FP32InputMods : FPInputMods; def FP64InputMods : FPInputMods; @@ -629,8 +640,8 @@ class getVOPSrc0ForVT { !if(!eq(VT.Value, f64.Value), 1, 0))); RegisterOperand ret = !if(isFP, - !if(!eq(VT.Size, 64), VSrc_f64, VSrc_f32), - !if(!eq(VT.Size, 64), VSrc_b64, VSrc_b32)); + !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)), + !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32))); } // Returns the vreg register class to use for source operand given VT @@ -657,8 +668,9 @@ class getVOP3SrcForVT { !if(!eq(VT.Value, i1.Value), SCSrc_b64, !if(isFP, - VCSrc_f32, - VCSrc_b32) + !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32), + !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32) + ) ) ) ); @@ -691,7 +703,13 @@ class getSrcMod { 0))); Operand ret = !if(!eq(VT.Size, 64), !if(isFP, FP64InputMods, Int64InputMods), - !if(isFP, FP32InputMods, Int32InputMods)); + !if(isFP, + !if(!eq(VT.Value, f16.Value), + FP16InputMods, + FP32InputMods + ), + Int32InputMods) + ); } // Returns the input arguments for VOP[12C] instructions for the given SrcVT. diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 0d6166ac2aff..83c4fc45cef4 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -107,9 +107,8 @@ def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst), // 64-bit vector move instruction. This is mainly used by the SIFoldOperands // pass to enable folding of inline immediates. -def V_MOV_B64_PSEUDO : PseudoInstSI <(outs VReg_64:$vdst), (ins VSrc_b64:$src0)> { - let VALU = 1; -} +def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst), + (ins VSrc_b64:$src0)>; } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] let usesCustomInserter = 1, SALU = 1 in { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 41633a2b6a07..bda0a44c984e 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -1085,19 +1085,6 @@ bool SIRegisterInfo::shouldRewriteCopySrc( return getCommonSubClass(DefRC, SrcRC) != nullptr; } -bool SIRegisterInfo::opCanUseLiteralConstant(unsigned OpType) const { - return OpType == AMDGPU::OPERAND_REG_IMM32_INT || - OpType == AMDGPU::OPERAND_REG_IMM32_FP; -} - -bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const { - if (opCanUseLiteralConstant(OpType)) - return true; - - return OpType == AMDGPU::OPERAND_REG_INLINE_C_INT || - OpType == AMDGPU::OPERAND_REG_INLINE_C_FP; -} - // FIXME: Most of these are flexible with HSA and we don't need to reserve them // as input registers if unused. 
Whether the dispatch ptr is necessary should be // easy to detect from used intrinsics. Scratch setup is harder to know. diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index 672df79218ba..0bcae7d9840c 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -16,6 +16,7 @@ #define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H #include "AMDGPURegisterInfo.h" +#include "SIDefines.h" #include "llvm/CodeGen/MachineRegisterInfo.h" namespace llvm { @@ -138,12 +139,19 @@ public: /// \returns True if operands defined with this operand type can accept /// a literal constant (i.e. any 32-bit immediate). - bool opCanUseLiteralConstant(unsigned OpType) const; + bool opCanUseLiteralConstant(unsigned OpType) const { + // TODO: 64-bit operands have extending behavior from 32-bit literal. + return OpType >= AMDGPU::OPERAND_REG_IMM_FIRST && + OpType <= AMDGPU::OPERAND_REG_IMM_LAST; + } /// \returns True if operands defined with this operand type can accept /// an inline constant. i.e. An integer value in the range (-16, 64) or /// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f. - bool opCanUseInlineConstant(unsigned OpType) const; + bool opCanUseInlineConstant(unsigned OpType) const { + return OpType >= AMDGPU::OPERAND_SRC_FIRST && + OpType <= AMDGPU::OPERAND_SRC_LAST; + } enum PreloadedValue { // SGPRS: diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index 3bd3f882d04a..0dd9fa5bb34a 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -384,31 +384,43 @@ class RegImmMatcher : AsmOperandClass { multiclass SIRegOperand { let OperandNamespace = "AMDGPU" in { + def _b16 : RegisterOperand(rc#"_32")> { + let OperandType = opType#"_INT16"; + let ParserMatchClass = RegImmMatcher; + let DecoderMethod = "decodeOperand_VSrc16"; + } + + def _f16 : RegisterOperand(rc#"_32")> { + let OperandType = opType#"_FP16"; + let ParserMatchClass = RegImmMatcher; + let DecoderMethod = "decodeOperand_VSrc16"; + } def _b32 : RegisterOperand(rc#"_32")> { - let OperandType = opType#"_INT"; + let OperandType = opType#"_INT32"; let ParserMatchClass = RegImmMatcher; } def _f32 : RegisterOperand(rc#"_32")> { - let OperandType = opType#"_FP"; + let OperandType = opType#"_FP32"; let ParserMatchClass = RegImmMatcher; } def _b64 : RegisterOperand(rc#"_64")> { - let OperandType = opType#"_INT"; + let OperandType = opType#"_INT64"; let ParserMatchClass = RegImmMatcher; } def _f64 : RegisterOperand(rc#"_64")> { - let OperandType = opType#"_FP"; + let OperandType = opType#"_FP64"; let ParserMatchClass = RegImmMatcher; } } } +// FIXME: 64-bit sources can sometimes use 32-bit constants. multiclass RegImmOperand - : SIRegOperand; + : SIRegOperand; multiclass RegInlineOperand : SIRegOperand; diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 9ee2ededbb0e..b27d7c691032 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -134,15 +134,14 @@ static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); - MachineOperand &Src0 = MI.getOperand(Src0Idx); // Only one literal constant is allowed per instruction, so if src0 is a // literal constant then we can't do any folding. 
- if (Src0.isImm() && - TII->isLiteralConstant(Src0, TII->getOpSize(MI, Src0Idx))) + if (TII->isLiteralConstant(MI, Src0Idx)) return; // Try to fold Src0 + MachineOperand &Src0 = MI.getOperand(Src0Idx); if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) { unsigned Reg = Src0.getReg(); MachineInstr *Def = MRI.getUniqueVRegDef(Reg); @@ -184,11 +183,15 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI, } static bool isKImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { - return isInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4); + return isInt<16>(Src.getImm()) && + !TII->isInlineConstant(*Src.getParent(), + Src.getParent()->getOperandNo(&Src)); } static bool isKUImmOperand(const SIInstrInfo *TII, const MachineOperand &Src) { - return isUInt<16>(Src.getImm()) && !TII->isInlineConstant(Src, 4); + return isUInt<16>(Src.getImm()) && + !TII->isInlineConstant(*Src.getParent(), + Src.getParent()->getOperandNo(&Src)); } static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, @@ -196,12 +199,12 @@ static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, bool &IsUnsigned) { if (isInt<16>(Src.getImm())) { IsUnsigned = false; - return !TII->isInlineConstant(Src, 4); + return !TII->isInlineConstant(Src); } if (isUInt<16>(Src.getImm())) { IsUnsigned = true; - return !TII->isInlineConstant(Src, 4); + return !TII->isInlineConstant(Src); } return false; @@ -212,7 +215,7 @@ static bool isKImmOrKUImmOperand(const SIInstrInfo *TII, static bool isReverseInlineImm(const SIInstrInfo *TII, const MachineOperand &Src, int32_t &ReverseImm) { - if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src, 4)) + if (!isInt<32>(Src.getImm()) || TII->isInlineConstant(Src)) return false; ReverseImm = reverseBits(static_cast(Src.getImm())); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 29cac2fbf6d3..85cbadf0a570 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -329,25 +329,29 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) { bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) { unsigned OpType = Desc.OpInfo[OpNo].OperandType; - - return OpType == AMDGPU::OPERAND_REG_IMM32_INT || - OpType == AMDGPU::OPERAND_REG_IMM32_FP || - OpType == AMDGPU::OPERAND_REG_INLINE_C_INT || - OpType == AMDGPU::OPERAND_REG_INLINE_C_FP; + return OpType >= AMDGPU::OPERAND_SRC_FIRST && + OpType <= AMDGPU::OPERAND_SRC_LAST; } bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) { unsigned OpType = Desc.OpInfo[OpNo].OperandType; - - return OpType == AMDGPU::OPERAND_REG_IMM32_FP || - OpType == AMDGPU::OPERAND_REG_INLINE_C_FP; + switch (OpType) { + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + return true; + default: + return false; + } } bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) { unsigned OpType = Desc.OpInfo[OpNo].OperandType; - - return OpType == AMDGPU::OPERAND_REG_INLINE_C_INT || - OpType == AMDGPU::OPERAND_REG_INLINE_C_FP; + return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST && + OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST; } // Avoid using MCRegisterClass::getSize, since that function will go away @@ -413,6 +417,15 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) { if (Literal >= -16 && Literal <= 64) return true; + // The 
actual type of the operand does not seem to matter as long + // as the bits match one of the inline immediate values. For example: + // + // -nan has the hexadecimal encoding of 0xfffffffe which is -2 in decimal, + // so it is a legal inline immediate. + // + // 1065353216 has the hexadecimal encoding 0x3f800000 which is 1.0f in + // floating-point, so it is a legal inline immediate. + uint32_t Val = static_cast(Literal); return (Val == FloatToBits(0.0f)) || (Val == FloatToBits(1.0f)) || @@ -426,6 +439,23 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) { (Val == 0x3e22f983 && HasInv2Pi); } +bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) { + assert(HasInv2Pi); + + if (Literal >= -16 && Literal <= 64) + return true; + + uint16_t Val = static_cast(Literal); + return Val == 0x3C00 || // 1.0 + Val == 0xBC00 || // -1.0 + Val == 0x3800 || // 0.5 + Val == 0xB800 || // -0.5 + Val == 0x4000 || // 2.0 + Val == 0xC000 || // -2.0 + Val == 0x4400 || // 4.0 + Val == 0xC400 || // -4.0 + Val == 0x3118; // 1/2pi +} } // End namespace AMDGPU } // End namespace llvm diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 3101b96c8eb0..ea5fc366d205 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -13,6 +13,8 @@ #include "AMDKernelCodeT.h" #include "llvm/IR/CallingConv.h" +#include "SIDefines.h" + #define GET_INSTRINFO_OPERAND_ENUM #include "AMDGPUGenInstrInfo.inc" #undef GET_INSTRINFO_OPERAND_ENUM @@ -167,6 +169,37 @@ unsigned getRegBitWidth(const MCRegisterClass &RC); unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc, unsigned OpNo); +LLVM_READNONE +inline unsigned getOperandSize(const MCOperandInfo &OpInfo) { + switch (OpInfo.OperandType) { + case AMDGPU::OPERAND_REG_IMM_INT32: + case AMDGPU::OPERAND_REG_IMM_FP32: + case AMDGPU::OPERAND_REG_INLINE_C_INT32: + case AMDGPU::OPERAND_REG_INLINE_C_FP32: + return 4; + + case AMDGPU::OPERAND_REG_IMM_INT64: + case AMDGPU::OPERAND_REG_IMM_FP64: + case AMDGPU::OPERAND_REG_INLINE_C_INT64: + case AMDGPU::OPERAND_REG_INLINE_C_FP64: + return 8; + + case AMDGPU::OPERAND_REG_IMM_INT16: + case AMDGPU::OPERAND_REG_IMM_FP16: + case AMDGPU::OPERAND_REG_INLINE_C_INT16: + case AMDGPU::OPERAND_REG_INLINE_C_FP16: + return 2; + + default: + llvm_unreachable("unhandled operand type"); + } +} + +LLVM_READNONE +inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) { + return getOperandSize(Desc.OpInfo[OpNo]); +} + /// \brief Is this literal inlinable LLVM_READNONE bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi); @@ -174,6 +207,8 @@ bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi); LLVM_READNONE bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi); +LLVM_READNONE +bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi); } // end namespace AMDGPU } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 0e87f90b62ba..37e31f57b242 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -134,7 +134,8 @@ multiclass VOP2eInst : VOPProfile <[vt, vt, vt, vt]> { - field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, f32kimm:$imm); + field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); + field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm); field string Asm32 = "$vdst, $src0, $src1, $imm"; field bit HasExt = 0; } @@ 
-143,7 +144,8 @@ def VOP_MADAK_F16 : VOP_MADAK ; def VOP_MADAK_F32 : VOP_MADAK ; class VOP_MADMK : VOPProfile <[vt, vt, vt, vt]> { - field dag Ins32 = (ins VCSrc_f32:$src0, f32kimm:$imm, VGPR_32:$src1); + field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); + field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1); field string Asm32 = "$vdst, $src0, $imm, $src1"; field bit HasExt = 0; } diff --git a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll index b75847149190..340d30b898e0 100644 --- a/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/br_cc.f16.ll @@ -41,7 +41,7 @@ two: } ; GCN-LABEL: {{^}}br_cc_f16_imm_a -; GCN: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x380{{0|1}}{{$}} +; SI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}} ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] @@ -49,7 +49,7 @@ two: ; SI: v_cmp_ngt_f32_e32 vcc, v[[B_F32]], v[[A_F32]] ; SI: s_cbranch_vccz -; VI: v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; VI: v_cmp_nlt_f16_e32 vcc, 0.5, v[[B_F16]] ; VI: s_cbranch_vccnz ; VI: one{{$}} @@ -80,13 +80,13 @@ two: } ; GCN-LABEL: {{^}}br_cc_f16_imm_b -; GCN: v_mov_b32_e32 v[[B_F16:[0-9]+]], {{0x37ff|0x3800}}{{$}} +; SI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}} ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]] -; VI: v_cmp_ngt_f16_e32 vcc, v[[B_F16]], v[[A_F16]] +; VI: v_cmp_ngt_f16_e32 vcc, 0.5, v[[A_F16]] ; GCN: s_cbranch_vccnz ; GCN: one{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/commute-compares.ll b/llvm/test/CodeGen/AMDGPU/commute-compares.ll index 055fd8f1ccd3..a4c51b233f41 100644 --- a/llvm/test/CodeGen/AMDGPU/commute-compares.ll +++ b/llvm/test/CodeGen/AMDGPU/commute-compares.ll @@ -693,11 +693,16 @@ define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i ret void } + +; FIXME: Should be able to fold this frameindex ; Without commuting the frame index in the pre-regalloc run of ; SIShrinkInstructions, this was using the VOP3 compare. 
; GCN-LABEL: {{^}}commute_frameindex: -; GCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} +; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}} + +; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}} +; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}} define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 { entry: %stack0 = alloca i32 diff --git a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll index b2afc054ce19..fb2d418b4436 100644 --- a/llvm/test/CodeGen/AMDGPU/fadd.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fadd.f16.ll @@ -29,7 +29,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]], v[[B_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0x3c00, v[[B_F16]] +; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define void @fadd_f16_imm_a( @@ -48,7 +48,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0x4000, v[[A_F16]] +; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define void @fadd_f16_imm_b( @@ -104,8 +104,8 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0x3c00, v[[B_V2_F16]] -; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0x4000, v[[B_F16_1]] +; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] +; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] @@ -132,8 +132,8 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0x4000, v[[A_V2_F16]] -; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0x3c00, v[[A_F16_1]] +; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 2.0, v[[A_V2_F16]] +; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 1.0, v[[A_F16_1]] ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] diff --git a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll index da0e01d6a7f5..9ce4d7684fe5 100644 --- a/llvm/test/CodeGen/AMDGPU/fmul.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fmul.f16.ll @@ -48,7 +48,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_mul_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]] +; VI: v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define void @fmul_f16_imm_b( @@ -105,7 +105,7 @@ entry: ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] -; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]] +; VI: v_mul_f16_e32 
v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] @@ -132,7 +132,7 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]] +; VI: v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] ; VI: v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] diff --git a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll index a5c84b84bd20..fb15edbaaffc 100644 --- a/llvm/test/CodeGen/AMDGPU/fsub.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fsub.f16.ll @@ -29,7 +29,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]] ; SI: v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 0x3c00, v[[B_F16]] +; VI: v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define void @fsub_f16_imm_a( @@ -48,7 +48,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_add_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], 0xc000, v[[A_F16]] +; VI: v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define void @fsub_f16_imm_b( @@ -104,8 +104,8 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 0x3c00, v[[B_V2_F16]] -; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 0x4000, v[[B_F16_1]] +; VI: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]] +; VI: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]] ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] @@ -132,8 +132,8 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 0xc000, v[[A_V2_F16]] -; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 0xbc00, v[[A_F16_1]] +; VI: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]] +; VI: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]] ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] diff --git a/llvm/test/CodeGen/AMDGPU/imm16.ll b/llvm/test/CodeGen/AMDGPU/imm16.ll new file mode 100644 index 000000000000..ed970287abbf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/imm16.ll @@ -0,0 +1,316 @@ +; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN 
-check-prefix=SI %s + +; FIXME: Merge into imm.ll + +; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_i16: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) { + store volatile i16 -32768, i16 addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_0.0_f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) { + store half 0.0, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_imm_neg_0.0_f16: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) { + store half -0.0, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_0.5_f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3800{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) { + store half 0.5, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_0.5_f16: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb800{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) { + store half -0.5, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_1.0_f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) { + store half 1.0, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_1.0_f16: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) { + store half -1.0, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_2.0_f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) { + store half 2.0, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_2.0_f16: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc000{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) { + store half -2.0, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_4.0_f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4400{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) { + store half 4.0, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_4.0_f16: +; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc400{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) { + store half -4.0, half addrspace(1)* %out + ret void +} + + +; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3118{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) { + store half 0xH3118, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_f16: +; SI: v_mov_b32_e32 
[[REG:v[0-9]+]], 0xb118{{$}} +; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb118{{$}} +; GCN: buffer_store_short [[REG]] +define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) { + store half 0xHB118, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}store_literal_imm_f16: +; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c00 +; GCN: buffer_store_short [[REG]] +define void @store_literal_imm_f16(half addrspace(1)* %out) { + store half 4096.0, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_0.0_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 0.0 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_0.5_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 0.5 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, -0.5 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_1.0_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 1.0 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, -1.0 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_2.0_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 2.0 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, -2.0 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_4.0_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 4.0 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, -4.0 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}commute_add_inline_imm_0.5_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]] +; VI: buffer_store_short [[REG]] +define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) { + %x = load half, half addrspace(1)* %in + %y = fadd half %x, 0.5 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}commute_add_literal_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0x6400, [[VAL]] +; VI: buffer_store_short [[REG]] +define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) { + %x = load half, half addrspace(1)* %in + %y = fadd half %x, 1024.0 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_1_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 0xH0001 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_2_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 0xH0002 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_16_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 0xH0010 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_1_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 0xHFFFF + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_2_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 0xHFFFE + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_neg_16_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], -16, [[VAL]]{{$}} +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 0xHFFF0 + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_63_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]] +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 0xH003F + store half %y, half addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}add_inline_imm_64_f16: +; VI: buffer_load_ushort [[VAL:v[0-9]+]] +; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]] +; VI: buffer_store_short [[REG]] +define void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) { + %y = fadd half %x, 0xH0040 + store half %y, half addrspace(1)* %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll index 
8ab2efe651b8..a4b8d7fa58da 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll @@ -20,7 +20,7 @@ define void @ldexp_f16( ; GCN-LABEL: {{^}}ldexp_f16_imm_a ; GCN: buffer_load_dword v[[B_I32:[0-9]+]] -; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 0x4000, v[[B_I32]] +; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]] ; GCN: buffer_store_short v[[R_F16]] define void @ldexp_f16_imm_a( half addrspace(1)* %r, diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll index 0accbad99887..0f75f7a5a492 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll @@ -51,7 +51,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_max_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]] +; VI: v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define void @maxnum_f16_imm_b( @@ -108,7 +108,7 @@ entry: ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] -; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]] +; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] @@ -135,7 +135,7 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] -; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]] +; VI: v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] ; VI: v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll index 9f41df6fd259..6bf2e9ba2e32 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll @@ -51,7 +51,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]] ; SI: v_min_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4400, v[[A_F16]] +; VI: v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]] ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define void @minnum_f16_imm_b( @@ -108,7 +108,7 @@ entry: ; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]] ; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]] -; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4400, v[[B_F16_1]] +; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]] ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] ; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]] @@ -135,7 +135,7 @@ entry: ; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]] ; SI: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]] ; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], 
v[[R_F32_1]] -; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4400, v[[A_V2_F16]] +; VI: v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]] ; VI: v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]] ; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]] ; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]] diff --git a/llvm/test/CodeGen/AMDGPU/select.f16.ll b/llvm/test/CodeGen/AMDGPU/select.f16.ll index a0ad475c5699..eadec5c47ad6 100644 --- a/llvm/test/CodeGen/AMDGPU/select.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select.f16.ll @@ -45,8 +45,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]] ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x3800{{$}} -; VI: v_cmp_lt_f16_e32 vcc, v[[A_F16]], v[[B_F16]] +; VI: v_cmp_lt_f16_e32 vcc, 0.5, v[[B_F16]] ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm @@ -76,8 +75,7 @@ entry: ; SI: v_cvt_f32_f16_e32 v[[D_F32:[0-9]+]], v[[D_F16]] ; SI: v_cndmask_b32_e32 v[[R_F32:[0-9]+]], v[[D_F32]], v[[C_F32]] ; SI: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]] -; VI: v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}} -; VI: v_cmp_gt_f16_e32 vcc, v[[B_F16]], v[[A_F16]] +; VI: v_cmp_gt_f16_e32 vcc, 0.5, v[[A_F16]] ; VI: v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc ; GCN: buffer_store_short v[[R_F16]] ; GCN: s_endpgm diff --git a/llvm/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir b/llvm/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir new file mode 100644 index 000000000000..3277d37d7e4d --- /dev/null +++ b/llvm/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir @@ -0,0 +1,709 @@ +# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s +--- | + define void @add_f32_1.0_one_f16_use() #0 { + %f16.val0 = load volatile half, half addrspace(1)* undef + %f16.val1 = load volatile half, half addrspace(1)* undef + %f32.val = load volatile float, float addrspace(1)* undef + %f16.add0 = fadd half %f16.val0, 0xH3C00 + %f32.add = fadd float %f32.val, 1.000000e+00 + store volatile half %f16.add0, half addrspace(1)* undef + store volatile float %f32.add, float addrspace(1)* undef + ret void + } + + define void @add_f32_1.0_multi_f16_use() #0 { + %f16.val0 = load volatile half, half addrspace(1)* undef + %f16.val1 = load volatile half, half addrspace(1)* undef + %f32.val = load volatile float, float addrspace(1)* undef + %f16.add0 = fadd half %f16.val0, 0xH3C00 + %f32.add = fadd float %f32.val, 1.000000e+00 + store volatile half %f16.add0, half addrspace(1)* undef + store volatile float %f32.add, float addrspace(1)* undef + ret void + } + + define void @add_f32_1.0_one_f32_use_one_f16_use () #0 { + %f16.val0 = load volatile half, half addrspace(1)* undef + %f16.val1 = load volatile half, half addrspace(1)* undef + %f32.val = load volatile float, float addrspace(1)* undef + %f16.add0 = fadd half %f16.val0, 0xH3C00 + %f32.add = fadd float %f32.val, 1.000000e+00 + store volatile half %f16.add0, half addrspace(1)* undef + store volatile float %f32.add, float addrspace(1)* undef + ret void + } + + define void @add_f32_1.0_one_f32_use_multi_f16_use () #0 { + %f16.val0 = load volatile half, half addrspace(1)* undef + %f16.val1 = load volatile half, half addrspace(1)* undef + %f32.val = load volatile float, float addrspace(1)* undef + %f16.add0 = fadd half %f16.val0, 0xH3C00 + %f16.add1 
= fadd half %f16.val1, 0xH3C00 + %f32.add = fadd float %f32.val, 1.000000e+00 + store volatile half %f16.add0, half addrspace(1)* undef + store volatile half %f16.add1, half addrspace(1)* undef + store volatile float %f32.add, float addrspace(1)* undef + ret void + } + + define void @add_i32_1_multi_f16_use() #0 { + %f16.val0 = load volatile half, half addrspace(1)* undef + %f16.val1 = load volatile half, half addrspace(1)* undef + %f16.add0 = fadd half %f16.val0, 0xH0001 + %f16.add1 = fadd half %f16.val1, 0xH0001 + store volatile half %f16.add0, half addrspace(1)* undef + store volatile half %f16.add1,half addrspace(1)* undef + ret void + } + + define void @add_i32_m2_one_f32_use_multi_f16_use () #0 { + %f16.val0 = load volatile half, half addrspace(1)* undef + %f16.val1 = load volatile half, half addrspace(1)* undef + %f32.val = load volatile float, float addrspace(1)* undef + %f16.add0 = fadd half %f16.val0, 0xHFFFE + %f16.add1 = fadd half %f16.val1, 0xHFFFE + %f32.add = fadd float %f32.val, 0xffffffffc0000000 + store volatile half %f16.add0, half addrspace(1)* undef + store volatile half %f16.add1, half addrspace(1)* undef + store volatile float %f32.add, float addrspace(1)* undef + ret void + } + + define void @add_f16_1.0_multi_f32_use() #0 { + %f32.val0 = load volatile float, float addrspace(1)* undef + %f32.val1 = load volatile float, float addrspace(1)* undef + %f32.val = load volatile float, float addrspace(1)* undef + %f32.add0 = fadd float %f32.val0, 1.0 + %f32.add1 = fadd float %f32.val1, 1.0 + store volatile float %f32.add0, float addrspace(1)* undef + store volatile float %f32.add1, float addrspace(1)* undef + ret void + } + + define void @add_f16_1.0_other_high_bits_multi_f16_use() #0 { + %f16.val0 = load volatile half, half addrspace(1)* undef + %f16.val1 = load volatile half, half addrspace(1)* undef + %f32.val = load volatile half, half addrspace(1)* undef + %f16.add0 = fadd half %f16.val0, 0xH3C00 + %f32.add = fadd half %f32.val, 1.000000e+00 + store volatile half %f16.add0, half addrspace(1)* undef + store volatile half %f32.add, half addrspace(1)* undef + ret void + } + + define void @add_f16_1.0_other_high_bits_use_f16_f32() #0 { + %f16.val0 = load volatile half, half addrspace(1)* undef + %f16.val1 = load volatile half, half addrspace(1)* undef + %f32.val = load volatile half, half addrspace(1)* undef + %f16.add0 = fadd half %f16.val0, 0xH3C00 + %f32.add = fadd half %f32.val, 1.000000e+00 + store volatile half %f16.add0, half addrspace(1)* undef + store volatile half %f32.add, half addrspace(1)* undef + ret void + } + + attributes #0 = { nounwind } + +... +--- + +# f32 1.0 with a single use should be folded as the low 32-bits of a +# literal constant. 
+ +# CHECK-LABEL: name: add_f32_1.0_one_f16_use +# CHECK: %13 = V_ADD_F16_e32 1065353216, killed %11, implicit %exec + +name: add_f32_1.0_one_f16_use +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_32 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: sreg_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_128 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + %4 = IMPLICIT_DEF + %5 = COPY %4.sub1 + %6 = IMPLICIT_DEF + %7 = COPY %6.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = V_MOV_B32_e32 1065353216, implicit %exec + %13 = V_ADD_F16_e64 0, killed %11, 0, %12, 0, 0, implicit %exec + BUFFER_STORE_SHORT_OFFSET killed %13, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + S_ENDPGM + +... +--- +# Materialized f32 inline immediate should not be folded into the f16 +# operands + +# CHECK-LABEL: name: add_f32_1.0_multi_f16_use +# CHECK: %13 = V_MOV_B32_e32 1065353216, implicit %exec +# CHECK: %14 = V_ADD_F16_e32 %13, killed %11, implicit %exec +# CHECK: %15 = V_ADD_F16_e32 killed %13, killed %12, implicit %exec + + +name: add_f32_1.0_multi_f16_use +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_32 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: sreg_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_128 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + %4 = IMPLICIT_DEF + %5 = COPY %4.sub1 + %6 = IMPLICIT_DEF + %7 = COPY %6.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`) + %13 = V_MOV_B32_e32 1065353216, implicit %exec + %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit %exec + %15 = 
V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit %exec + BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + S_ENDPGM + +... +--- + +# f32 1.0 should be folded into the single f32 use as an inline +# immediate, and folded into the single f16 use as a literal constant + +# CHECK-LABEL: name: add_f32_1.0_one_f32_use_one_f16_use +# CHECK: %15 = V_ADD_F16_e32 1065353216, %11, implicit %exec +# CHECK: %16 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec + +name: add_f32_1.0_one_f32_use_one_f16_use +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_32 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: sreg_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_128 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vgpr_32 } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + %4 = IMPLICIT_DEF + %5 = COPY %4.sub1 + %6 = IMPLICIT_DEF + %7 = COPY %6.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`) + %14 = V_MOV_B32_e32 1065353216, implicit %exec + %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec + %16 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`) + S_ENDPGM + +... 
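A minimal arithmetic sketch of why folding the materialized 0x3f800000 straight into the lone V_ADD_F16 (as checked in add_f32_1.0_one_f16_use above) is behavior-preserving. It assumes only that a VI f16 VALU operation consumes the low 16 bits of its 32-bit source, whether that source is a VGPR or the trailing literal dword; this is illustration code, not anything taken from the patch itself:

  #include <cstdint>
  #include <cstdio>

  int main() {
    // Value placed in the VGPR by V_MOV_B32_e32 1065353216 (f32 1.0).
    uint32_t Materialized = 0x3f800000u;
    // Assumption: the f16 add reads bits [15:0] whether the source is the
    // VGPR or the 32-bit literal dword, so both paths see the same value.
    uint16_t ViaRegister = static_cast<uint16_t>(Materialized & 0xffffu);
    uint16_t ViaLiteral  = static_cast<uint16_t>(1065353216u & 0xffffu);
    std::printf("register path 0x%04x, literal path 0x%04x\n",
                ViaRegister, ViaLiteral);
    return ViaRegister == ViaLiteral ? 0 : 1;
  }
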
+--- + +# f32 1.0 should be folded for the single f32 use as an inline +# constant, and not folded as a multi-use literal for the f16 cases + +# CHECK-LABEL: name: add_f32_1.0_one_f32_use_multi_f16_use +# CHECK: %14 = V_MOV_B32_e32 1065353216, implicit %exec +# CHECK: %15 = V_ADD_F16_e32 %14, %11, implicit %exec +# CHECK: %16 = V_ADD_F16_e32 %14, %12, implicit %exec +# CHECK: %17 = V_ADD_F32_e32 1065353216, killed %13, implicit %exec + +name: add_f32_1.0_one_f32_use_multi_f16_use +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_32 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: sreg_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_128 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vgpr_32 } + - { id: 17, class: vgpr_32 } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + %4 = IMPLICIT_DEF + %5 = COPY %4.sub1 + %6 = IMPLICIT_DEF + %7 = COPY %6.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`) + %14 = V_MOV_B32_e32 1065353216, implicit %exec + %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec + %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit %exec + %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`) + S_ENDPGM + +... 
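The one_f32_use_multi_f16_use case above captures the cost model these folds follow: an inline operand is free, while every non-inline use would need its own 32-bit literal dword. A deliberately simplified sketch of that trade-off, with hypothetical helper names rather than the actual SIFoldOperands logic:

  #include <cstdio>

  // Hypothetical helper, not LLVM code: fold an immediate when it is inline
  // for the use's operand type, or when only one use would need a literal,
  // since duplicating literal dwords just grows the encoding.
  static bool worthFolding(bool InlineForThisUse, unsigned NumLiteralUses) {
    return InlineForThisUse || NumLiteralUses == 1;
  }

  int main() {
    bool FoldIntoF32 = worthFolding(true, 0);   // f32 1.0 into V_ADD_F32
    bool FoldIntoF16 = worthFolding(false, 2);  // f32 1.0 into two V_ADD_F16s
    std::printf("f32 use folded: %d, f16 uses folded: %d\n",
                FoldIntoF32, FoldIntoF16);
    return (FoldIntoF32 && !FoldIntoF16) ? 0 : 1;
  }
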
+--- +# CHECK-LABEL: name: add_i32_1_multi_f16_use +# CHECK: %13 = V_MOV_B32_e32 1, implicit %exec +# CHECK: %14 = V_ADD_F16_e32 1, killed %11, implicit %exec +# CHECK: %15 = V_ADD_F16_e32 1, killed %12, implicit %exec + + +name: add_i32_1_multi_f16_use +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_32 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: sreg_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_128 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + %4 = IMPLICIT_DEF + %5 = COPY %4.sub1 + %6 = IMPLICIT_DEF + %7 = COPY %6.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`) + %13 = V_MOV_B32_e32 1, implicit %exec + %14 = V_ADD_F16_e64 0, killed %11, 0, %13, 0, 0, implicit %exec + %15 = V_ADD_F16_e64 0, killed %12, 0, killed %13, 0, 0, implicit %exec + BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + S_ENDPGM + +... 
+--- + +# CHECK-LABEL: name: add_i32_m2_one_f32_use_multi_f16_use +# CHECK: %14 = V_MOV_B32_e32 -2, implicit %exec +# CHECK: %15 = V_ADD_F16_e32 -2, %11, implicit %exec +# CHECK: %16 = V_ADD_F16_e32 -2, %12, implicit %exec +# CHECK: %17 = V_ADD_F32_e32 -2, killed %13, implicit %exec + +name: add_i32_m2_one_f32_use_multi_f16_use +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_32 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: sreg_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_128 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } + - { id: 16, class: vgpr_32 } + - { id: 17, class: vgpr_32 } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + %4 = IMPLICIT_DEF + %5 = COPY %4.sub1 + %6 = IMPLICIT_DEF + %7 = COPY %6.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`) + %14 = V_MOV_B32_e32 -2, implicit %exec + %15 = V_ADD_F16_e64 0, %11, 0, %14, 0, 0, implicit %exec + %16 = V_ADD_F16_e64 0, %12, 0, %14, 0, 0, implicit %exec + %17 = V_ADD_F32_e64 0, killed %13, 0, killed %14, 0, 0, implicit %exec + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %16, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %17, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`) + S_ENDPGM + +... 
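The add_i32_m2 case works because -2 is inline for every operand width, the same observation made in the isInlinableLiteral32 comment earlier in this patch. A small self-contained check of that claim:

  #include <cstdint>
  #include <cstdio>

  int main() {
    // 0xfffffffe reinterpreted as a signed 32-bit value is -2, and 0xfffe as
    // a signed 16-bit value is also -2; both land in the [-16, 64] inline
    // range, so one V_MOV_B32_e32 -2 feeds the f16 and f32 adds alike.
    int32_t As32 = static_cast<int32_t>(0xfffffffeu);
    int16_t As16 = static_cast<int16_t>(0xfffeu);
    auto Inline = [](int64_t V) { return V >= -16 && V <= 64; };
    std::printf("%d -> %s, %d -> %s\n",
                As32, Inline(As32) ? "inline" : "literal",
                (int)As16, Inline(As16) ? "inline" : "literal");
    return 0;
  }
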
+--- + +# f32 1.0 should be folded for the single f32 use as an inline +# constant, and not folded as a multi-use literal for the f16 cases + +# CHECK-LABEL: name: add_f16_1.0_multi_f32_use +# CHECK: %13 = V_MOV_B32_e32 15360, implicit %exec +# CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec +# CHECK: %15 = V_ADD_F32_e32 %13, %12, implicit %exec + +name: add_f16_1.0_multi_f32_use +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_32 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: sreg_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_128 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + %4 = IMPLICIT_DEF + %5 = COPY %4.sub1 + %6 = IMPLICIT_DEF + %7 = COPY %6.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 + %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`) + %12 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`) + %13 = V_MOV_B32_e32 15360, implicit %exec + %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit %exec + %15 = V_ADD_F32_e64 0, %12, 0, %13, 0, 0, implicit %exec + BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_DWORD_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`) + S_ENDPGM + +... 
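In the add_f16_1.0_multi_f32_use case just above, the materialized value is f16 1.0 (0x3c00, i.e. 15360), which is not an inline immediate for a 32-bit operand and has two f32 uses, so it stays in the V_MOV_B32 rather than being folded. A standalone restatement of the 32-bit inline check, written here only to show that 15360 misses both the integer range and the f32 bit patterns (HasInv2Pi treated as true, as on VI):

  #include <cstdint>

  // Mirrors the isInlinableLiteral32 predicate in AMDGPUBaseInfo.cpp above.
  static bool inlinableLiteral32(int32_t Literal) {
    if (Literal >= -16 && Literal <= 64)
      return true;
    uint32_t Val = static_cast<uint32_t>(Literal);
    return Val == 0x00000000u ||  // 0.0
           Val == 0x3f800000u ||  // 1.0
           Val == 0xbf800000u ||  // -1.0
           Val == 0x3f000000u ||  // 0.5
           Val == 0xbf000000u ||  // -0.5
           Val == 0x40000000u ||  // 2.0
           Val == 0xc0000000u ||  // -2.0
           Val == 0x40800000u ||  // 4.0
           Val == 0xc0800000u ||  // -4.0
           Val == 0x3e22f983u;    // 1/(2*pi)
  }

  int main() { return inlinableLiteral32(15360) ? 1 : 0; }
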
+--- + +# The low 16-bits are an inline immediate, but the high bits are junk +# FIXME: Should be able to fold this + +# CHECK-LABEL: name: add_f16_1.0_other_high_bits_multi_f16_use +# CHECK: %13 = V_MOV_B32_e32 80886784, implicit %exec +# CHECK: %14 = V_ADD_F16_e32 %13, %11, implicit %exec +# CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec + +name: add_f16_1.0_other_high_bits_multi_f16_use +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_32 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: sreg_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_128 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + %4 = IMPLICIT_DEF + %5 = COPY %4.sub1 + %6 = IMPLICIT_DEF + %7 = COPY %6.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 + %11 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = V_MOV_B32_e32 80886784, implicit %exec + %14 = V_ADD_F16_e64 0, %11, 0, %13, 0, 0, implicit %exec + %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit %exec + BUFFER_STORE_SHORT_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + S_ENDPGM + +... +--- + +# FIXME: Should fold inline immediate into f16 and literal use into +# f32 instruction. 
+ +# CHECK-LABEL: name: add_f16_1.0_other_high_bits_use_f16_f32 +# CHECK: %13 = V_MOV_B32_e32 305413120, implicit %exec +# CHECK: %14 = V_ADD_F32_e32 %13, %11, implicit %exec +# CHECK: %15 = V_ADD_F16_e32 %13, %12, implicit %exec +name: add_f16_1.0_other_high_bits_use_f16_f32 +alignment: 0 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +registers: + - { id: 0, class: sreg_64 } + - { id: 1, class: sreg_32 } + - { id: 2, class: sgpr_32 } + - { id: 3, class: vgpr_32 } + - { id: 4, class: sreg_64 } + - { id: 5, class: sreg_32 } + - { id: 6, class: sreg_64 } + - { id: 7, class: sreg_32 } + - { id: 8, class: sreg_32 } + - { id: 9, class: sreg_32 } + - { id: 10, class: sreg_128 } + - { id: 11, class: vgpr_32 } + - { id: 12, class: vgpr_32 } + - { id: 13, class: vgpr_32 } + - { id: 14, class: vgpr_32 } + - { id: 15, class: vgpr_32 } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + %4 = IMPLICIT_DEF + %5 = COPY %4.sub1 + %6 = IMPLICIT_DEF + %7 = COPY %6.sub0 + %8 = S_MOV_B32 61440 + %9 = S_MOV_B32 -1 + %10 = REG_SEQUENCE killed %7, 1, killed %5, 2, killed %9, 3, killed %8, 4 + %11 = BUFFER_LOAD_DWORD_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4 from `float addrspace(1)* undef`) + %12 = BUFFER_LOAD_USHORT_OFFSET %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 2 from `half addrspace(1)* undef`) + %13 = V_MOV_B32_e32 305413120, implicit %exec + %14 = V_ADD_F32_e64 0, %11, 0, %13, 0, 0, implicit %exec + %15 = V_ADD_F16_e64 0, %12, 0, %13, 0, 0, implicit %exec + BUFFER_STORE_DWORD_OFFSET killed %14, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 4 into `float addrspace(1)* undef`) + BUFFER_STORE_SHORT_OFFSET killed %15, %10, 0, 0, 0, 0, 0, implicit %exec :: (volatile store 2 into `half addrspace(1)* undef`) + S_ENDPGM + +... 
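The MC tests that follow accept exactly the values the new isInlinableLiteral16 admits as single-dword inline operands and fall back to a trailing 32-bit literal for everything else. A standalone sketch of that predicate, using the same value list as the AMDGPUBaseInfo.cpp change above (HasInv2Pi assumed true, as the patch asserts):

  #include <cstdint>

  static bool inlinableLiteral16(int16_t Literal) {
    if (Literal >= -16 && Literal <= 64)
      return true;
    uint16_t Val = static_cast<uint16_t>(Literal);
    return Val == 0x3C00 || // 1.0
           Val == 0xBC00 || // -1.0
           Val == 0x3800 || // 0.5
           Val == 0xB800 || // -0.5
           Val == 0x4000 || // 2.0
           Val == 0xC000 || // -2.0
           Val == 0x4400 || // 4.0
           Val == 0xC400 || // -4.0
           Val == 0x3118;   // 1/(2*pi)
  }

  int main() {
    // 0x3c00 (1.0) assembles to the inline encoding; -17 (0xffef) does not
    // and takes a literal dword, matching the v_add_f16 cases in literal16.s.
    return (inlinableLiteral16(static_cast<int16_t>(0x3C00)) &&
            !inlinableLiteral16(-17)) ? 0 : 1;
  }
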
diff --git a/llvm/test/MC/AMDGPU/literal16-err.s b/llvm/test/MC/AMDGPU/literal16-err.s new file mode 100644 index 000000000000..b364e06bcb93 --- /dev/null +++ b/llvm/test/MC/AMDGPU/literal16-err.s @@ -0,0 +1,21 @@ +// XFAIL: * +// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=NOVI %s + +v_add_f16 v1, 0xfffff, v2 +// NOVI: 19: error: invalid operand for instruction + +v_add_f16 v1, 0x10000, v2 +// NOVI: 19: error: invalid operand for instruction + +v_add_f16 v1, v2, -0.0 +v_add_f16 v1, v2, 1 + + + +// FIXME: Should give truncate error +v_add_f16 v1, -32769, v2 +v_add_f16 v1, 65536, v2 + +v_add_f32 v1, 4294967296, v2 +v_add_f32 v1, 0x0000000100000000, v2 +v_and_b32 v1, 0x0000000100000000, v2 diff --git a/llvm/test/MC/AMDGPU/literal16.s b/llvm/test/MC/AMDGPU/literal16.s new file mode 100644 index 000000000000..e578ce82372f --- /dev/null +++ b/llvm/test/MC/AMDGPU/literal16.s @@ -0,0 +1,148 @@ +// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI %s + +v_add_f16 v1, 0, v2 +// VI: v_add_f16_e32 v1, 0, v2 ; encoding: [0x80,0x04,0x02,0x3e] + +v_add_f16 v1, 0.0, v2 +// VI: v_add_f16_e32 v1, 0, v2 ; encoding: [0x80,0x04,0x02,0x3e] + +v_add_f16 v1, v2, 0 +// VI: v_add_f16_e64 v1, v2, 0 ; encoding: [0x01,0x00,0x1f,0xd1,0x02,0x01,0x01,0x00] + +v_add_f16 v1, v2, 0.0 +// VI: v_add_f16_e64 v1, v2, 0 ; encoding: [0x01,0x00,0x1f,0xd1,0x02,0x01,0x01,0x00] + +v_add_f16 v1, -0.0, v2 +// VI: v_add_f16_e32 v1, 0x8000, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x00,0x80,0x00,0x00] + +v_add_f16 v1, 1.0, v2 +// VI: v_add_f16_e32 v1, 1.0, v2 ; encoding: [0xf2,0x04,0x02,0x3e] + +v_add_f16 v1, -1.0, v2 +// VI: v_add_f16_e32 v1, -1.0, v2 ; encoding: [0xf3,0x04,0x02,0x3e] + +v_add_f16 v1, -0.5, v2 +// VI: v_add_f16_e32 v1, -0.5, v2 ; encoding: [0xf1,0x04,0x02,0x3e] + +v_add_f16 v1, 0.5, v2 +// VI: v_add_f16_e32 v1, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x3e] + +v_add_f16 v1, 2.0, v2 +// VI: v_add_f16_e32 v1, 2.0, v2 ; encoding: [0xf4,0x04,0x02,0x3e] + +v_add_f16 v1, -2.0, v2 +// VI: v_add_f16_e32 v1, -2.0, v2 ; encoding: [0xf5,0x04,0x02,0x3e] + +v_add_f16 v1, 4.0, v2 +// VI: v_add_f16_e32 v1, 4.0, v2 ; encoding: [0xf6,0x04,0x02,0x3e] + +v_add_f16 v1, -4.0, v2 +// VI: v_add_f16_e32 v1, -4.0, v2 ; encoding: [0xf7,0x04,0x02,0x3e] + +v_add_f16 v1, 0.15915494, v2 +// VI: v_add_f16_e32 v1, 0.15915494, v2 ; encoding: [0xf8,0x04,0x02,0x3e] + +v_add_f16 v1, -0.15915494, v2 +// VI: v_add_f16_e32 v1, 0xb118, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x18,0xb1,0x00,0x00] + +v_add_f16 v1, -1, v2 +// VI: v_add_f16_e32 v1, -1, v2 ; encoding: [0xc1,0x04,0x02,0x3e] + + +v_add_f16 v1, -2, v2 +// VI: v_add_f16_e32 v1, -2, v2 ; encoding: [0xc2,0x04,0x02,0x3e] + +v_add_f16 v1, -3, v2 +// VI: v_add_f16_e32 v1, -3, v2 ; encoding: [0xc3,0x04,0x02,0x3e] + +v_add_f16 v1, -16, v2 +// VI: v_add_f16_e32 v1, -16, v2 ; encoding: [0xd0,0x04,0x02,0x3e] + +v_add_f16 v1, 1, v2 +// VI: v_add_f16_e32 v1, 1, v2 ; encoding: [0x81,0x04,0x02,0x3e] + +v_add_f16 v1, 2, v2 +// VI: v_add_f16_e32 v1, 2, v2 ; encoding: [0x82,0x04,0x02,0x3e] + +v_add_f16 v1, 3, v2 +// VI: v_add_f16_e32 v1, 3, v2 ; encoding: [0x83,0x04,0x02,0x3e] + +v_add_f16 v1, 4, v2 +// VI: v_add_f16_e32 v1, 4, v2 ; encoding: [0x84,0x04,0x02,0x3e] + +v_add_f16 v1, 15, v2 +// VI: v_add_f16_e32 v1, 15, v2 ; encoding: [0x8f,0x04,0x02,0x3e] + +v_add_f16 v1, 16, v2 +// VI: v_add_f16_e32 v1, 16, v2 ; encoding: [0x90,0x04,0x02,0x3e] + +v_add_f16 v1, 63, v2 +// VI: v_add_f16_e32 v1, 63, v2 ; encoding: 
[0xbf,0x04,0x02,0x3e] + +v_add_f16 v1, 64, v2 +// VI: v_add_f16_e32 v1, 64, v2 ; encoding: [0xc0,0x04,0x02,0x3e] + +v_add_f16 v1, 0x0001, v2 +// VI: v_add_f16_e32 v1, 1, v2 ; encoding: [0x81,0x04,0x02,0x3e] + +v_add_f16 v1, 0xffff, v2 +// VI: v_add_f16_e32 v1, -1, v2 ; encoding: [0xc1,0x04,0x02,0x3e] + +v_add_f16 v1, -17, v2 +// VI: v_add_f16_e32 v1, 0xffef, v2 ; encoding: [0xff,0x04,0x02,0x3e,0xef,0xff,0x00,0x00] + +v_add_f16 v1, 65, v2 +// VI: v_add_f16_e32 v1, 0x41, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x41,0x00,0x00,0x00] + +v_add_f16 v1, 0x3c00, v2 +// VI: v_add_f16_e32 v1, 1.0, v2 ; encoding: [0xf2,0x04,0x02,0x3e] + +v_add_f16 v1, 0xbc00, v2 +// VI: v_add_f16_e32 v1, -1.0, v2 ; encoding: [0xf3,0x04,0x02,0x3e] + +v_add_f16 v1, 0x3800, v2 +// VI: v_add_f16_e32 v1, 0.5, v2 ; encoding: [0xf0,0x04,0x02,0x3e] + +v_add_f16 v1, 0xb800, v2 +// VI: v_add_f16_e32 v1, -0.5, v2 ; encoding: [0xf1,0x04,0x02,0x3e] + +v_add_f16 v1, 0x4000, v2 +// VI: v_add_f16_e32 v1, 2.0, v2 ; encoding: [0xf4,0x04,0x02,0x3e] + +v_add_f16 v1, 0xc000, v2 +// VI: v_add_f16_e32 v1, -2.0, v2 ; encoding: [0xf5,0x04,0x02,0x3e] + +v_add_f16 v1, 0x4400, v2 +// VI: v_add_f16_e32 v1, 4.0, v2 ; encoding: [0xf6,0x04,0x02,0x3e] + +v_add_f16 v1, 0xc400, v2 +// VI: v_add_f16_e32 v1, -4.0, v2 ; encoding: [0xf7,0x04,0x02,0x3e] + +v_add_f16 v1, 0x3118, v2 +// VI: v_add_f16_e32 v1, 0.15915494, v2 ; encoding: [0xf8,0x04,0x02,0x3e] + +v_add_f16 v1, -32768, v2 +// VI: v_add_f16_e32 v1, 0x8000, v2 ; encoding: [0xff,0x04,0x02,0x3e,0x00,0x80,0x00,0x00] + +v_add_f16 v1, 32767, v2 +// VI: v_add_f16_e32 v1, 0x7fff, v2 ; encoding: [0xff,0x04,0x02,0x3e,0xff,0x7f,0x00,0x00] + +v_add_f16 v1, 65535, v2 +// VI: v_add_f16_e32 v1, -1, v2 ; encoding: [0xc1,0x04,0x02,0x3e] + + +// K-constant +v_madmk_f16 v1, v2, 0x4280, v3 +// VI: v_madmk_f16_e32 v1, v2, 0x4280, v3 ; encoding: [0x02,0x07,0x02,0x48,0x80,0x42,0x00,0x00] + +v_madmk_f16 v1, v2, 1.0, v3 +// VI: v_madmk_f16_e32 v1, v2, 0x3c00, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x3c,0x00,0x00] + +v_madmk_f16 v1, v2, 1, v3 +// VI: v_madmk_f16_e32 v1, v2, 0x1, v3 ; encoding: [0x02,0x07,0x02,0x48,0x01,0x00,0x00,0x00] + +v_madmk_f16 v1, v2, 64.0, v3 +// VI: v_madmk_f16_e32 v1, v2, 0x5400, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x54,0x00,0x00] + + +v_add_f16_e32 v1, 64.0, v2 diff --git a/llvm/test/MC/AMDGPU/vop2.s b/llvm/test/MC/AMDGPU/vop2.s index 3fa11dd8417a..5941ffb03a51 100644 --- a/llvm/test/MC/AMDGPU/vop2.s +++ b/llvm/test/MC/AMDGPU/vop2.s @@ -422,12 +422,12 @@ v_mac_f16_e32 v1, v2, v3 // NOSICI: error: instruction not supported on this GPU // NOSICI: v_madmk_f16 v1, v2, 64.0, v3 -// VI: v_madmk_f16_e32 v1, v2, 0x42800000, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x00,0x80,0x42] +// VI: v_madmk_f16_e32 v1, v2, 0x5400, v3 ; encoding: [0x02,0x07,0x02,0x48,0x00,0x54,0x00,0x00] v_madmk_f16 v1, v2, 64.0, v3 // NOSICI: error: instruction not supported on this GPU // NOSICI: v_madak_f16 v1, v2, v3, 64.0 -// VI: v_madak_f16_e32 v1, v2, v3, 0x42800000 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x00,0x80,0x42] +// VI: v_madak_f16_e32 v1, v2, v3, 0x5400 ; encoding: [0x02,0x07,0x02,0x4a,0x00,0x54,0x00,0x00] v_madak_f16 v1, v2, v3, 64.0 // NOSICI: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/literal16_vi.txt b/llvm/test/MC/Disassembler/AMDGPU/literal16_vi.txt new file mode 100644 index 000000000000..362e87703694 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/literal16_vi.txt @@ -0,0 +1,54 @@ +# RUN: llvm-mc -arch=amdgcn -mcpu=tonga -disassemble -show-encoding %s | 
FileCheck -check-prefix=VI %s + +# VI: v_add_f16_e32 v1, 0.5, v3 ; encoding: [0xf0,0x06,0x02,0x3e] +0xf0 0x06 0x02 0x3e + +# VI: v_add_f16_e32 v1, -0.5, v3 ; encoding: [0xf1,0x06,0x02,0x3e] +0xf1 0x06 0x02 0x3e + +# VI: v_add_f16_e32 v1, 1.0, v3 ; encoding: [0xf2,0x06,0x02,0x3e] +0xf2 0x06 0x02 0x3e + +# VI: v_add_f16_e32 v1, -1.0, v3 ; encoding: [0xf3,0x06,0x02,0x3e] +0xf3 0x06 0x02 0x3e + +# VI: v_add_f16_e32 v1, 2.0, v3 ; encoding: [0xf4,0x06,0x02,0x3e] +0xf4 0x06 0x02 0x3e + +# VI: v_add_f16_e32 v1, -2.0, v3 ; encoding: [0xf5,0x06,0x02,0x3e] +0xf5 0x06 0x02 0x3e + +# VI: v_add_f16_e32 v1, 4.0, v3 ; encoding: [0xf6,0x06,0x02,0x3e] +0xf6 0x06 0x02 0x3e + +# VI: v_add_f16_e32 v1, -4.0, v3 ; encoding: [0xf7,0x06,0x02,0x3e] +0xf7 0x06 0x02 0x3e + +# VI: v_add_f16_e32 v1, 0.15915494, v3 ; encoding: [0xf8,0x06,0x02,0x3e] +0xf8 0x06 0x02 0x3e + +# VI: v_add_f16_e32 v1, 0x41, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x00] +0xff 0x06 0x02 0x3e 0x41 0x00 0x00 0x00 + +# VI: v_add_f16_e32 v1, 0x100, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x00,0x01,0x00,0x00] +0xff 0x06 0x02 0x3e 0x00 0x01 0x00 0x00 + +# non-zero unused bits in constant +# VI: v_add_f16_e32 v1, 0x10041, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x01,0x00] +0xff 0x06 0x02 0x3e 0x41 0x00 0x01 0x00 + +# VI: v_add_f16_e32 v1, 0x1000041, v3 ; encoding: [0xff,0x06,0x02,0x3e,0x41,0x00,0x00,0x01] +0xff 0x06 0x02 0x3e 0x41 0x00 0x00 0x01 + +# FIXME: This should be able to round trip with literal after instruction +# VI: v_add_f16_e32 v1, 0, v3 ; encoding: [0x80,0x06,0x02,0x3e] +0xff 0x06 0x02 0x3e 0x00 0x00 0x00 0x00 + +# VI: v_madmk_f16_e32 v1, v2, 0x41, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x00,0x00] +0x02 0x07 0x02 0x48 0x41 0x00 0x00 0x00 + +# VI: v_madmk_f16_e32 v1, v2, 0x10041, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x01,0x00] +0x02 0x07 0x02 0x48 0x41 0x00 0x01 0x00 + +# VI: v_madmk_f16_e32 v1, v2, 0x1000041, v3 ; encoding: [0x02,0x07,0x02,0x48,0x41,0x00,0x00,0x01] +0x02 0x07 0x02 0x48 0x41 0x00 0x00 0x01 diff --git a/llvm/test/MC/Disassembler/AMDGPU/vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/vop1.txt index d3af4a57a086..025408487960 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/vop1.txt @@ -246,5 +246,5 @@ # CHECK: v_cvt_f16_i16_e32 v123, 0x21c2 ; encoding: [0xff,0x74,0xf6,0x7e,0xc2,0x21,0x00,0x00] 0xff 0x74 0xf6 0x7e 0xc2 0x21 0x00 0x00 -# CHECK: v_cvt_u16_f16_e32 v123, 0x3f200000 ; encoding: [0xff,0x76,0xf6,0x7e,0x00,0x00,0x20,0x3f] -0xff 0x76 0xf6 0x7e 0x00 0x00 0x20 0x3f \ No newline at end of file +# CHECK: v_cvt_u16_f16_e32 v123, 0x3f20 ; encoding: [0xff,0x76,0xf6,0x7e,0x20,0x3f,0x00,0x00] +0xff 0x76 0xf6 0x7e 0x20 0x3f 0x00 0x00
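
A closing illustration of the disassembler side, using the bytes from the updated vop1.txt check: when the source field selects a literal (0xff), the following little-endian dword is read back, and for the 16-bit operands introduced here only its low half carries the immediate. This is a hypothetical decode helper for illustration, not the LLVM disassembler:

  #include <cstdint>
  #include <cstdio>

  // Reassemble the little-endian literal dword that trails the instruction.
  static uint32_t literalDword(const uint8_t B[4]) {
    return static_cast<uint32_t>(B[0]) |
           (static_cast<uint32_t>(B[1]) << 8) |
           (static_cast<uint32_t>(B[2]) << 16) |
           (static_cast<uint32_t>(B[3]) << 24);
  }

  int main() {
    const uint8_t Lit[4] = {0x20, 0x3f, 0x00, 0x00}; // v_cvt_u16_f16 case above
    uint32_t Dword = literalDword(Lit);
    std::printf("literal dword 0x%08x, f16 immediate 0x%04x\n",
                Dword, static_cast<unsigned>(Dword & 0xffffu));
    return 0;
  }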