forked from OSchip/llvm-project
R600/SI: Add intrinsics for various math instructions.
These will be used for custom lowering and for library implementations of various math functions, so it's useful to expose these as builtins. llvm-svn: 211247
This commit is contained in:
parent
d3d6de2703
commit
a0050b0961
|
@ -33,4 +33,40 @@ defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz <
|
|||
"__builtin_r600_read_tgid">;
|
||||
defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz <
|
||||
"__builtin_r600_read_tidig">;
|
||||
|
||||
} // End TargetPrefix = "r600"
|
||||
|
||||
let TargetPrefix = "AMDGPU" in {
|
||||
def int_AMDGPU_div_scale :
|
||||
Intrinsic<[llvm_anyfloat_ty, llvm_i1_ty],
|
||||
[LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>,
|
||||
GCCBuiltin<"__builtin_amdgpu_div_scale">;
|
||||
|
||||
def int_AMDGPU_div_fmas :
|
||||
Intrinsic<[llvm_anyfloat_ty],
|
||||
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
|
||||
[IntrNoMem]>,
|
||||
GCCBuiltin<"__builtin_amdgpu_div_fmas">;
|
||||
|
||||
def int_AMDGPU_div_fixup :
|
||||
Intrinsic<[llvm_anyfloat_ty],
|
||||
[LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>,
|
||||
GCCBuiltin<"__builtin_amdgpu_div_fixup">;
|
||||
|
||||
def int_AMDGPU_trig_preop :
|
||||
Intrinsic<[llvm_anyfloat_ty],
|
||||
[LLVMMatchType<0>, llvm_i32_ty], [IntrNoMem]>,
|
||||
GCCBuiltin<"__builtin_amdgpu_trig_preop">;
|
||||
|
||||
def int_AMDGPU_rcp :
|
||||
Intrinsic<[llvm_anyfloat_ty],
|
||||
[LLVMMatchType<0>], [IntrNoMem]>,
|
||||
GCCBuiltin<"__builtin_amdgpu_rcp">;
|
||||
|
||||
def int_AMDGPU_rsq :
|
||||
Intrinsic<[llvm_anyfloat_ty],
|
||||
[LLVMMatchType<0>], [IntrNoMem]>,
|
||||
GCCBuiltin<"__builtin_amdgpu_rsq">;
|
||||
|
||||
|
||||
} // End TargetPrefix = "AMDGPU"
|
||||
|
|
|
@ -842,6 +842,28 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
|
|||
return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
|
||||
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
|
||||
|
||||
case Intrinsic::AMDGPU_div_scale:
|
||||
return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, VT,
|
||||
Op.getOperand(1), Op.getOperand(2));
|
||||
|
||||
case Intrinsic::AMDGPU_div_fmas:
|
||||
return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT,
|
||||
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
|
||||
|
||||
case Intrinsic::AMDGPU_div_fixup:
|
||||
return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
|
||||
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
|
||||
|
||||
case Intrinsic::AMDGPU_trig_preop:
|
||||
return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
|
||||
Op.getOperand(1), Op.getOperand(2));
|
||||
|
||||
case Intrinsic::AMDGPU_rcp:
|
||||
return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
|
||||
|
||||
case Intrinsic::AMDGPU_rsq:
|
||||
return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
|
||||
|
||||
case AMDGPUIntrinsic::AMDGPU_imax:
|
||||
return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1),
|
||||
Op.getOperand(2));
|
||||
|
@ -2042,6 +2064,14 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||
NODE_NAME_CASE(FMIN)
|
||||
NODE_NAME_CASE(SMIN)
|
||||
NODE_NAME_CASE(UMIN)
|
||||
NODE_NAME_CASE(URECIP)
|
||||
NODE_NAME_CASE(DIV_SCALE)
|
||||
NODE_NAME_CASE(DIV_FMAS)
|
||||
NODE_NAME_CASE(DIV_FIXUP)
|
||||
NODE_NAME_CASE(TRIG_PREOP)
|
||||
NODE_NAME_CASE(RCP)
|
||||
NODE_NAME_CASE(RSQ)
|
||||
NODE_NAME_CASE(DOT4)
|
||||
NODE_NAME_CASE(BFE_U32)
|
||||
NODE_NAME_CASE(BFE_I32)
|
||||
NODE_NAME_CASE(BFI)
|
||||
|
@ -2051,8 +2081,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||
NODE_NAME_CASE(MUL_I24)
|
||||
NODE_NAME_CASE(MAD_U24)
|
||||
NODE_NAME_CASE(MAD_I24)
|
||||
NODE_NAME_CASE(URECIP)
|
||||
NODE_NAME_CASE(DOT4)
|
||||
NODE_NAME_CASE(EXPORT)
|
||||
NODE_NAME_CASE(CONST_ADDRESS)
|
||||
NODE_NAME_CASE(REGISTER_LOAD)
|
||||
|
|
|
@ -175,6 +175,9 @@ enum {
|
|||
DWORDADDR,
|
||||
FRACT,
|
||||
CLAMP,
|
||||
|
||||
// SIN_HW, COS_HW - f32 for SI, 1 ULP max error, valid from -100 pi to 100 pi.
|
||||
// Denormals handled on some parts.
|
||||
COS_HW,
|
||||
SIN_HW,
|
||||
FMAX,
|
||||
|
@ -184,6 +187,15 @@ enum {
|
|||
SMIN,
|
||||
UMIN,
|
||||
URECIP,
|
||||
DIV_SCALE,
|
||||
DIV_FMAS,
|
||||
DIV_FIXUP,
|
||||
TRIG_PREOP, // 1 ULP max error for f64
|
||||
|
||||
// RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
|
||||
// For f64, max error 2^29 ULP, handles denormals.
|
||||
RCP,
|
||||
RSQ,
|
||||
DOT4,
|
||||
BFE_U32, // Extract range of bits with zero extension to 32-bits.
|
||||
BFE_I32, // Extract range of bits with sign extension to 32-bits.
|
||||
|
|
|
@ -19,6 +19,14 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
|
|||
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
|
||||
]>;
|
||||
|
||||
def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
|
||||
[SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
|
||||
>;
|
||||
|
||||
def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
|
||||
[SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
|
||||
>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// AMDGPU DAG Nodes
|
||||
//
|
||||
|
@ -29,6 +37,12 @@ def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>;
|
|||
// out = a - floor(a)
|
||||
def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>;
|
||||
|
||||
// out = 1.0 / a
|
||||
def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
|
||||
|
||||
// out = 1.0 / sqrt(a)
|
||||
def AMDGPUrsq : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
|
||||
|
||||
// out = max(a, b) a and b are floats
|
||||
def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp,
|
||||
[SDNPCommutative, SDNPAssociative]
|
||||
|
@ -78,6 +92,21 @@ def AMDGPUcvt_f32_ubyte3 : SDNode<"AMDGPUISD::CVT_F32_UBYTE3",
|
|||
// e is rounding error
|
||||
def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>;
|
||||
|
||||
// Special case divide preop and flags.
|
||||
def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
|
||||
|
||||
// Special case divide FMA with scale and flags (src0 = Quotient,
|
||||
// src1 = Denominator, src2 = Numerator).
|
||||
def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", SDTFPTernaryOp>;
|
||||
|
||||
// Single or double precision division fixup.
|
||||
// Special case divide fixup and flags (src0 = Quotient, src1 =
|
||||
// Denominator, src2 = Numerator).
|
||||
def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
|
||||
|
||||
// Look Up 2.0 / pi src0 with segment select src1[4:0]
|
||||
def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
|
||||
|
||||
def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
|
||||
SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
|
||||
[SDNPHasChain, SDNPMayLoad]>;
|
||||
|
|
|
@ -519,6 +519,16 @@ multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> {
|
|||
>;
|
||||
}
|
||||
|
||||
class RcpPat<Instruction RcpInst, ValueType vt> : Pat <
|
||||
(fdiv FP_ONE, vt:$src),
|
||||
(RcpInst $src)
|
||||
>;
|
||||
|
||||
class RsqPat<Instruction RsqInst, ValueType vt> : Pat <
|
||||
(AMDGPUrcp (fsqrt vt:$src)),
|
||||
(RsqInst $src)
|
||||
>;
|
||||
|
||||
include "R600Instructions.td"
|
||||
include "R700Instructions.td"
|
||||
include "EvergreenInstructions.td"
|
||||
|
|
|
@ -30,8 +30,6 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
|
|||
def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
||||
def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
|
||||
|
|
|
@ -1083,7 +1083,7 @@ class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper <
|
|||
}
|
||||
|
||||
class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper <
|
||||
inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq
|
||||
inst, "RECIPSQRT_CLAMPED", AMDGPUrsq
|
||||
> {
|
||||
let Itinerary = TransALU;
|
||||
}
|
||||
|
|
|
@ -341,6 +341,8 @@ Counters SIInsertWaits::handleOperands(MachineInstr &MI) {
|
|||
return Result;
|
||||
}
|
||||
|
||||
// FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
|
||||
// around other non-memory instructions.
|
||||
bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
|
||||
bool Changes = false;
|
||||
|
||||
|
|
|
@ -1116,22 +1116,23 @@ defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>;
|
|||
defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32",
|
||||
[(set f32:$dst, (flog2 f32:$src0))]
|
||||
>;
|
||||
|
||||
defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>;
|
||||
defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>;
|
||||
defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32",
|
||||
[(set f32:$dst, (fdiv FP_ONE, f32:$src0))]
|
||||
[(set f32:$dst, (AMDGPUrcp f32:$src0))]
|
||||
>;
|
||||
defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>;
|
||||
defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>;
|
||||
defm V_RSQ_LEGACY_F32 : VOP1_32 <
|
||||
0x0000002d, "V_RSQ_LEGACY_F32",
|
||||
[(set f32:$dst, (int_AMDGPU_rsq f32:$src0))]
|
||||
[(set f32:$dst, (AMDGPUrsq f32:$src0))]
|
||||
>;
|
||||
defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32",
|
||||
[(set f32:$dst, (fdiv FP_ONE, (fsqrt f32:$src0)))]
|
||||
>;
|
||||
defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64",
|
||||
[(set f64:$dst, (fdiv FP_ONE, f64:$src0))]
|
||||
[(set f64:$dst, (AMDGPUrcp f64:$src0))]
|
||||
>;
|
||||
defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>;
|
||||
defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64",
|
||||
|
@ -1417,8 +1418,12 @@ defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
|
|||
//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
|
||||
defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
|
||||
////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
|
||||
defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
|
||||
def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
|
||||
defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32",
|
||||
[(set f32:$dst, (AMDGPUdiv_fixup f32:$src0, f32:$src1, f32:$src2))]
|
||||
>;
|
||||
def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64",
|
||||
[(set f64:$dst, (AMDGPUdiv_fixup f64:$src0, f64:$src1, f64:$src2))]
|
||||
>;
|
||||
|
||||
def V_LSHL_B64 : VOP3_64_32 <0x00000161, "V_LSHL_B64",
|
||||
[(set i64:$dst, (shl i64:$src0, i32:$src1))]
|
||||
|
@ -1452,12 +1457,19 @@ defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
|
|||
|
||||
defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
|
||||
def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
|
||||
defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
|
||||
def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
|
||||
|
||||
defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32",
|
||||
[(set f32:$dst, (AMDGPUdiv_fmas f32:$src0, f32:$src1, f32:$src2))]
|
||||
>;
|
||||
def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64",
|
||||
[(set f64:$dst, (AMDGPUdiv_fmas f64:$src0, f64:$src1, f64:$src2))]
|
||||
>;
|
||||
//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
|
||||
//def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
|
||||
//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
|
||||
def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
|
||||
def V_TRIG_PREOP_F64 : VOP3_64_32 <0x00000174, "V_TRIG_PREOP_F64",
|
||||
[(set f64:$dst, (AMDGPUtrig_preop f64:$src0, i32:$src1))]
|
||||
>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Pseudo Instructions
|
||||
|
@ -1748,6 +1760,15 @@ def : Pat <
|
|||
(S_BARRIER)
|
||||
>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// VOP1 Patterns
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def : RcpPat<V_RCP_F32_e32, f32>;
|
||||
def : RcpPat<V_RCP_F64_e32, f64>;
|
||||
def : RsqPat<V_RSQ_F32_e32, f32>;
|
||||
def : RsqPat<V_RSQ_F64_e32, f64>;
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// VOP2 Patterns
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -922,6 +922,20 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
|
|||
break;
|
||||
}
|
||||
|
||||
case Intrinsic::AMDGPU_rcp: {
|
||||
if (const ConstantFP *C = dyn_cast<ConstantFP>(II->getArgOperand(0))) {
|
||||
const APFloat &ArgVal = C->getValueAPF();
|
||||
APFloat Val(ArgVal.getSemantics(), 1.0);
|
||||
APFloat::opStatus Status = Val.divide(ArgVal,
|
||||
APFloat::rmNearestTiesToEven);
|
||||
// Only do this if it was exact and therefore not dependent on the
|
||||
// rounding mode.
|
||||
if (Status == APFloat::opOK)
|
||||
return ReplaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
case Intrinsic::stackrestore: {
|
||||
// If the save is right next to the restore, remove the restore. This can
|
||||
// happen when variable allocas are DCE'd.
|
||||
|
|
|
@ -101,7 +101,7 @@ IF137: ; preds = %main_body
|
|||
%88 = insertelement <4 x float> %87, float %32, i32 2
|
||||
%89 = insertelement <4 x float> %88, float 0.000000e+00, i32 3
|
||||
%90 = call float @llvm.AMDGPU.dp4(<4 x float> %85, <4 x float> %89)
|
||||
%91 = call float @llvm.AMDGPU.rsq(float %90)
|
||||
%91 = call float @llvm.AMDGPU.rsq.f32(float %90)
|
||||
%92 = fmul float %30, %91
|
||||
%93 = fmul float %31, %91
|
||||
%94 = fmul float %32, %91
|
||||
|
@ -344,7 +344,7 @@ ENDIF136: ; preds = %main_body, %ENDIF15
|
|||
%325 = insertelement <4 x float> %324, float %318, i32 2
|
||||
%326 = insertelement <4 x float> %325, float 0.000000e+00, i32 3
|
||||
%327 = call float @llvm.AMDGPU.dp4(<4 x float> %322, <4 x float> %326)
|
||||
%328 = call float @llvm.AMDGPU.rsq(float %327)
|
||||
%328 = call float @llvm.AMDGPU.rsq.f32(float %327)
|
||||
%329 = fmul float %314, %328
|
||||
%330 = fmul float %316, %328
|
||||
%331 = fmul float %318, %328
|
||||
|
@ -377,7 +377,7 @@ ENDIF136: ; preds = %main_body, %ENDIF15
|
|||
%358 = insertelement <4 x float> %357, float %45, i32 2
|
||||
%359 = insertelement <4 x float> %358, float 0.000000e+00, i32 3
|
||||
%360 = call float @llvm.AMDGPU.dp4(<4 x float> %355, <4 x float> %359)
|
||||
%361 = call float @llvm.AMDGPU.rsq(float %360)
|
||||
%361 = call float @llvm.AMDGPU.rsq.f32(float %360)
|
||||
%362 = fmul float %45, %361
|
||||
%363 = call float @fabs(float %362)
|
||||
%364 = fmul float %176, 0x3FECCCCCC0000000
|
||||
|
@ -403,7 +403,7 @@ ENDIF136: ; preds = %main_body, %ENDIF15
|
|||
%384 = insertelement <4 x float> %383, float %45, i32 2
|
||||
%385 = insertelement <4 x float> %384, float 0.000000e+00, i32 3
|
||||
%386 = call float @llvm.AMDGPU.dp4(<4 x float> %381, <4 x float> %385)
|
||||
%387 = call float @llvm.AMDGPU.rsq(float %386)
|
||||
%387 = call float @llvm.AMDGPU.rsq.f32(float %386)
|
||||
%388 = fmul float %45, %387
|
||||
%389 = call float @fabs(float %388)
|
||||
%390 = fmul float %176, 0x3FF51EB860000000
|
||||
|
@ -1041,7 +1041,7 @@ IF179: ; preds = %ENDIF175
|
|||
%896 = insertelement <4 x float> %895, float %45, i32 2
|
||||
%897 = insertelement <4 x float> %896, float 0.000000e+00, i32 3
|
||||
%898 = call float @llvm.AMDGPU.dp4(<4 x float> %893, <4 x float> %897)
|
||||
%899 = call float @llvm.AMDGPU.rsq(float %898)
|
||||
%899 = call float @llvm.AMDGPU.rsq.f32(float %898)
|
||||
%900 = fmul float %45, %899
|
||||
%901 = call float @fabs(float %900)
|
||||
%902 = fmul float %176, 0x3FECCCCCC0000000
|
||||
|
@ -1150,7 +1150,7 @@ ENDIF178: ; preds = %ENDIF175, %IF179
|
|||
declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @llvm.AMDGPU.rsq(float) #1
|
||||
declare float @llvm.AMDGPU.rsq.f32(float) #1
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare <4 x float> @llvm.AMDGPU.tex(<4 x float>, i32, i32, i32) #1
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
|
||||
|
||||
declare float @llvm.AMDGPU.div.fixup.f32(float, float, float) nounwind readnone
|
||||
declare double @llvm.AMDGPU.div.fixup.f64(double, double, double) nounwind readnone
|
||||
|
||||
; SI-LABEL: @test_div_fixup_f32:
|
||||
; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||
; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]]
|
||||
; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
||||
; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||
; SI: V_DIV_FIXUP_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
|
||||
; SI: BUFFER_STORE_DWORD [[RESULT]],
|
||||
; SI: S_ENDPGM
|
||||
define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
|
||||
%result = call float @llvm.AMDGPU.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
|
||||
store float %result, float addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @test_div_fixup_f64:
|
||||
; SI: V_DIV_FIXUP_F64
|
||||
define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
|
||||
%result = call double @llvm.AMDGPU.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
|
||||
store double %result, double addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,27 @@
|
|||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
|
||||
|
||||
declare float @llvm.AMDGPU.div.fmas.f32(float, float, float) nounwind readnone
|
||||
declare double @llvm.AMDGPU.div.fmas.f64(double, double, double) nounwind readnone
|
||||
|
||||
; SI-LABEL: @test_div_fmas_f32:
|
||||
; SI-DAG: S_LOAD_DWORD [[SA:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xb
|
||||
; SI-DAG: S_LOAD_DWORD [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xd
|
||||
; SI-DAG: V_MOV_B32_e32 [[VC:v[0-9]+]], [[SC]]
|
||||
; SI-DAG: S_LOAD_DWORD [[SB:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0xc
|
||||
; SI: V_MOV_B32_e32 [[VB:v[0-9]+]], [[SB]]
|
||||
; SI: V_DIV_FMAS_F32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
|
||||
; SI: BUFFER_STORE_DWORD [[RESULT]],
|
||||
; SI: S_ENDPGM
|
||||
define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
|
||||
%result = call float @llvm.AMDGPU.div.fmas.f32(float %a, float %b, float %c) nounwind readnone
|
||||
store float %result, float addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @test_div_fmas_f64:
|
||||
; SI: V_DIV_FMAS_F64
|
||||
define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
|
||||
%result = call double @llvm.AMDGPU.div.fmas.f64(double %a, double %b, double %c) nounwind readnone
|
||||
store double %result, double addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,23 @@
|
|||
; XFAIL: *
|
||||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
|
||||
|
||||
declare float @llvm.AMDGPU.div.scale.f32(float, float) nounwind readnone
|
||||
declare double @llvm.AMDGPU.div.scale.f64(double, double) nounwind readnone
|
||||
|
||||
; SI-LABEL: @test_div_scale_f32:
|
||||
define void @test_div_scale_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr) nounwind {
|
||||
%a = load float addrspace(1)* %aptr, align 4
|
||||
%b = load float addrspace(1)* %bptr, align 4
|
||||
%result = call float @llvm.AMDGPU.div.scale.f32(float %a, float %b) nounwind readnone
|
||||
store float %result, float addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @test_div_scale_f64:
|
||||
define void @test_div_scale_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %bptr) nounwind {
|
||||
%a = load double addrspace(1)* %aptr, align 8
|
||||
%b = load double addrspace(1)* %bptr, align 8
|
||||
%result = call double @llvm.AMDGPU.div.scale.f64(double %a, double %b) nounwind readnone
|
||||
store double %result, double addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,58 @@
|
|||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
|
||||
declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone
|
||||
declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
|
||||
|
||||
|
||||
declare float @llvm.sqrt.f32(float) nounwind readnone
|
||||
declare double @llvm.sqrt.f64(double) nounwind readnone
|
||||
|
||||
; FUNC-LABEL: @rcp_f32
|
||||
; SI: V_RCP_F32_e32
|
||||
define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind {
|
||||
%rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone
|
||||
store float %rcp, float addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @rcp_f64
|
||||
; SI: V_RCP_F64_e32
|
||||
define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind {
|
||||
%rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone
|
||||
store double %rcp, double addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @rcp_pat_f32
|
||||
; SI: V_RCP_F32_e32
|
||||
define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
|
||||
%rcp = fdiv float 1.0, %src
|
||||
store float %rcp, float addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @rcp_pat_f64
|
||||
; SI: V_RCP_F64_e32
|
||||
define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
|
||||
%rcp = fdiv double 1.0, %src
|
||||
store double %rcp, double addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @rsq_rcp_pat_f32
|
||||
; SI: V_RSQ_F32_e32
|
||||
define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind {
|
||||
%sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone
|
||||
%rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone
|
||||
store float %rcp, float addrspace(1)* %out, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
; FUNC-LABEL: @rsq_rcp_pat_f64
|
||||
; SI: V_RSQ_F64_e32
|
||||
define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind {
|
||||
%sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone
|
||||
%rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone
|
||||
store double %rcp, double addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
|
@ -0,0 +1,29 @@
|
|||
; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
|
||||
|
||||
declare double @llvm.AMDGPU.trig.preop.f64(double, i32) nounwind readnone
|
||||
|
||||
; SI-LABEL: @test_trig_preop_f64:
|
||||
; SI-DAG: BUFFER_LOAD_DWORD [[SEG:v[0-9]+]]
|
||||
; SI-DAG: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]],
|
||||
; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]]
|
||||
; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
|
||||
; SI: S_ENDPGM
|
||||
define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
|
||||
%a = load double addrspace(1)* %aptr, align 8
|
||||
%b = load i32 addrspace(1)* %bptr, align 4
|
||||
%result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 %b) nounwind readnone
|
||||
store double %result, double addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; SI-LABEL: @test_trig_preop_f64_imm_segment:
|
||||
; SI: BUFFER_LOAD_DWORDX2 [[SRC:v\[[0-9]+:[0-9]+\]]],
|
||||
; SI: V_TRIG_PREOP_F64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7
|
||||
; SI: BUFFER_STORE_DWORDX2 [[RESULT]],
|
||||
; SI: S_ENDPGM
|
||||
define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
|
||||
%a = load double addrspace(1)* %aptr, align 8
|
||||
%result = call double @llvm.AMDGPU.trig.preop.f64(double %a, i32 7) nounwind readnone
|
||||
store double %result, double addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
|
@ -103,7 +103,7 @@ main_body:
|
|||
%95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3
|
||||
%96 = call float @llvm.AMDGPU.dp4(<4 x float> %91, <4 x float> %95)
|
||||
%97 = call float @fabs(float %96)
|
||||
%98 = call float @llvm.AMDGPU.rsq(float %97)
|
||||
%98 = call float @llvm.AMDGPU.rsq.f32(float %97)
|
||||
%99 = fmul float %4, %98
|
||||
%100 = fmul float %5, %98
|
||||
%101 = fmul float %6, %98
|
||||
|
@ -225,7 +225,7 @@ declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) #1
|
|||
declare float @fabs(float) #2
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @llvm.AMDGPU.rsq(float) #1
|
||||
declare float @llvm.AMDGPU.rsq.f32(float) #1
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @llvm.AMDIL.clamp.(float, float, float) #1
|
||||
|
|
|
@ -70,7 +70,7 @@ main_body:
|
|||
%55 = fadd float %54, %53
|
||||
%56 = fmul float %45, %45
|
||||
%57 = fadd float %55, %56
|
||||
%58 = call float @llvm.AMDGPU.rsq(float %57)
|
||||
%58 = call float @llvm.AMDGPU.rsq.f32(float %57)
|
||||
%59 = fmul float %43, %58
|
||||
%60 = fmul float %44, %58
|
||||
%61 = fmul float %45, %58
|
||||
|
@ -212,7 +212,7 @@ declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
|
|||
declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <32 x i8>, <16 x i8>, i32) #1
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @llvm.AMDGPU.rsq(float) #3
|
||||
declare float @llvm.AMDGPU.rsq.f32(float) #3
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @llvm.AMDIL.exp.(float) #3
|
||||
|
|
|
@ -203,7 +203,7 @@ main_body:
|
|||
%198 = fadd float %197, %196
|
||||
%199 = fmul float %97, %97
|
||||
%200 = fadd float %198, %199
|
||||
%201 = call float @llvm.AMDGPU.rsq(float %200)
|
||||
%201 = call float @llvm.AMDGPU.rsq.f32(float %200)
|
||||
%202 = fmul float %95, %201
|
||||
%203 = fmul float %96, %201
|
||||
%204 = fmul float %202, %29
|
||||
|
@ -384,7 +384,7 @@ IF67: ; preds = %LOOP65
|
|||
%355 = fadd float %354, %353
|
||||
%356 = fmul float %352, %352
|
||||
%357 = fadd float %355, %356
|
||||
%358 = call float @llvm.AMDGPU.rsq(float %357)
|
||||
%358 = call float @llvm.AMDGPU.rsq.f32(float %357)
|
||||
%359 = fmul float %350, %358
|
||||
%360 = fmul float %351, %358
|
||||
%361 = fmul float %352, %358
|
||||
|
@ -512,7 +512,7 @@ IF67: ; preds = %LOOP65
|
|||
%483 = fadd float %482, %481
|
||||
%484 = fmul float %109, %109
|
||||
%485 = fadd float %483, %484
|
||||
%486 = call float @llvm.AMDGPU.rsq(float %485)
|
||||
%486 = call float @llvm.AMDGPU.rsq.f32(float %485)
|
||||
%487 = fmul float %107, %486
|
||||
%488 = fmul float %108, %486
|
||||
%489 = fmul float %109, %486
|
||||
|
@ -541,7 +541,7 @@ IF67: ; preds = %LOOP65
|
|||
%512 = fadd float %511, %510
|
||||
%513 = fmul float %97, %97
|
||||
%514 = fadd float %512, %513
|
||||
%515 = call float @llvm.AMDGPU.rsq(float %514)
|
||||
%515 = call float @llvm.AMDGPU.rsq.f32(float %514)
|
||||
%516 = fmul float %95, %515
|
||||
%517 = fmul float %96, %515
|
||||
%518 = fmul float %97, %515
|
||||
|
@ -658,7 +658,7 @@ declare i32 @llvm.SI.tid() #2
|
|||
declare float @ceil(float) #3
|
||||
|
||||
; Function Attrs: readnone
|
||||
declare float @llvm.AMDGPU.rsq(float) #2
|
||||
declare float @llvm.AMDGPU.rsq.f32(float) #2
|
||||
|
||||
; Function Attrs: nounwind readnone
|
||||
declare <4 x float> @llvm.SI.sampled.v8i32(<8 x i32>, <32 x i8>, <16 x i8>, i32) #1
|
||||
|
@ -887,7 +887,7 @@ main_body:
|
|||
%212 = fadd float %211, %210
|
||||
%213 = fmul float %209, %209
|
||||
%214 = fadd float %212, %213
|
||||
%215 = call float @llvm.AMDGPU.rsq(float %214)
|
||||
%215 = call float @llvm.AMDGPU.rsq.f32(float %214)
|
||||
%216 = fmul float %205, %215
|
||||
%217 = fmul float %207, %215
|
||||
%218 = fmul float %209, %215
|
||||
|
@ -1123,7 +1123,7 @@ IF189: ; preds = %LOOP
|
|||
%434 = fsub float -0.000000e+00, %433
|
||||
%435 = fadd float 0x3FF00068E0000000, %434
|
||||
%436 = call float @llvm.AMDIL.clamp.(float %435, float 0.000000e+00, float 1.000000e+00)
|
||||
%437 = call float @llvm.AMDGPU.rsq(float %436)
|
||||
%437 = call float @llvm.AMDGPU.rsq.f32(float %436)
|
||||
%438 = fmul float %437, %436
|
||||
%439 = fsub float -0.000000e+00, %436
|
||||
%440 = call float @llvm.AMDGPU.cndlt(float %439, float %438, float 0.000000e+00)
|
||||
|
@ -1147,7 +1147,7 @@ IF189: ; preds = %LOOP
|
|||
%458 = fadd float %457, %456
|
||||
%459 = fmul float %455, %455
|
||||
%460 = fadd float %458, %459
|
||||
%461 = call float @llvm.AMDGPU.rsq(float %460)
|
||||
%461 = call float @llvm.AMDGPU.rsq.f32(float %460)
|
||||
%462 = fmul float %451, %461
|
||||
%463 = fmul float %453, %461
|
||||
%464 = fmul float %455, %461
|
||||
|
@ -1257,7 +1257,7 @@ ENDIF197: ; preds = %IF189, %IF198
|
|||
%559 = fadd float %558, %557
|
||||
%560 = fmul float %556, %556
|
||||
%561 = fadd float %559, %560
|
||||
%562 = call float @llvm.AMDGPU.rsq(float %561)
|
||||
%562 = call float @llvm.AMDGPU.rsq.f32(float %561)
|
||||
%563 = fmul float %562, %561
|
||||
%564 = fsub float -0.000000e+00, %561
|
||||
%565 = call float @llvm.AMDGPU.cndlt(float %564, float %563, float 0.000000e+00)
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
; RUN: opt -instcombine -S < %s | FileCheck %s
|
||||
|
||||
declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone
|
||||
declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone
|
||||
|
||||
; CHECK-LABEL: @test_constant_fold_rcp_f32_1
|
||||
; CHECK-NEXT: ret float 1.000000e+00
|
||||
define float @test_constant_fold_rcp_f32_1() nounwind {
|
||||
%val = call float @llvm.AMDGPU.rcp.f32(float 1.0) nounwind readnone
|
||||
ret float %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @test_constant_fold_rcp_f64_1
|
||||
; CHECK-NEXT: ret double 1.000000e+00
|
||||
define double @test_constant_fold_rcp_f64_1() nounwind {
|
||||
%val = call double @llvm.AMDGPU.rcp.f64(double 1.0) nounwind readnone
|
||||
ret double %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @test_constant_fold_rcp_f32_half
|
||||
; CHECK-NEXT: ret float 2.000000e+00
|
||||
define float @test_constant_fold_rcp_f32_half() nounwind {
|
||||
%val = call float @llvm.AMDGPU.rcp.f32(float 0.5) nounwind readnone
|
||||
ret float %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @test_constant_fold_rcp_f64_half
|
||||
; CHECK-NEXT: ret double 2.000000e+00
|
||||
define double @test_constant_fold_rcp_f64_half() nounwind {
|
||||
%val = call double @llvm.AMDGPU.rcp.f64(double 0.5) nounwind readnone
|
||||
ret double %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @test_constant_fold_rcp_f32_43
|
||||
; CHECK-NEXT: call float @llvm.AMDGPU.rcp.f32(float 4.300000e+01)
|
||||
define float @test_constant_fold_rcp_f32_43() nounwind {
|
||||
%val = call float @llvm.AMDGPU.rcp.f32(float 4.300000e+01) nounwind readnone
|
||||
ret float %val
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @test_constant_fold_rcp_f64_43
|
||||
; CHECK-NEXT: call double @llvm.AMDGPU.rcp.f64(double 4.300000e+01)
|
||||
define double @test_constant_fold_rcp_f64_43() nounwind {
|
||||
%val = call double @llvm.AMDGPU.rcp.f64(double 4.300000e+01) nounwind readnone
|
||||
ret double %val
|
||||
}
|
||||
|
Loading…
Reference in New Issue