AMDGPU: Make v4i16/v4f16 legal

Some image loads return these, and it's awkward working around them not
being legal.

llvm-svn: 334835
commit 02dc7e19e2 (parent fa5597b24d)
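For illustration only (not part of the commit; the function name is hypothetical), this is the kind of IR that previously had to be rewritten in terms of i32 vectors and, with this change, can be handled as a legal type — binary ops like the fadd below are lowered by the splitBinaryVectorOp helper added in this commit:

; Hypothetical sketch: a v4f16 op kept as a vector, split into two packed
; v2f16 halves during custom lowering instead of being scalarized.
define <4 x half> @example_v4f16_add(<4 x half> %a, <4 x half> %b) {
  %sum = fadd <4 x half> %a, %b
  ret <4 x half> %sum
}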
@@ -127,7 +127,7 @@ def CC_AMDGPU_Func : CallingConv<[
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
     VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
-  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>,
+  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>,
   CCIfType<[i32, f32, v2i16, v2f16, i16, f16, i1], CCAssignToStack<4, 4>>,
   CCIfType<[i64, f64, v2i32, v2f32], CCAssignToStack<8, 4>>,
   CCIfType<[v4i32, v4f32, v2i64, v2f64], CCAssignToStack<16, 4>>,
@@ -144,7 +144,7 @@ def RetCC_AMDGPU_Func : CallingConv<[
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
     VGPR24, VGPR25, VGPR26, VGPR27, VGPR28, VGPR29, VGPR30, VGPR31]>>,
-  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64], CCCustom<"allocateVGPRTuple">>
+  CCIfType<[i64, f64, v2i32, v2f32, v4i32, v4f32, v8i32, v8f32, v16i32, v16f32, v2i64, v2f64, v4i16, v4f16], CCCustom<"allocateVGPRTuple">>
 ]>;

 def CC_AMDGPU : CallingConv<[
@@ -73,7 +73,9 @@ static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
   case MVT::i64:
   case MVT::f64:
   case MVT::v2i32:
-  case MVT::v2f32: {
+  case MVT::v2f32:
+  case MVT::v4i16:
+  case MVT::v4f16: {
     // Up to SGPR0-SGPR39
     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                           &AMDGPU::SGPR_64RegClass, 20);
@@ -94,7 +96,9 @@ static bool allocateVGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
   case MVT::i64:
   case MVT::f64:
   case MVT::v2i32:
-  case MVT::v2f32: {
+  case MVT::v2f32:
+  case MVT::v4i16:
+  case MVT::v4f16: {
     return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
                           &AMDGPU::VReg_64RegClass, 31);
   }
@@ -1234,6 +1238,16 @@ SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                   SelectionDAG &DAG) const {
   SmallVector<SDValue, 8> Args;

+  EVT VT = Op.getValueType();
+  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+    SDLoc SL(Op);
+    SDValue Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(0));
+    SDValue Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Op.getOperand(1));
+
+    SDValue BV = DAG.getBuildVector(MVT::v2i32, SL, { Lo, Hi });
+    return DAG.getNode(ISD::BITCAST, SL, VT, BV);
+  }
+
   for (const SDUse &U : Op->ops())
     DAG.ExtractVectorElements(U.get(), Args);
@@ -1084,8 +1084,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
 let SubtargetPredicate = HasPackedD16VMem in {
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
-  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_XY">;
-  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XYZW">;
+  defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
 } // End HasPackedD16VMem.

 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
@@ -1145,8 +1144,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
 let SubtargetPredicate = HasPackedD16VMem in {
   defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">;
   defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
-  defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_XY">;
-  defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XYZW">;
+  defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">;
 } // End HasPackedD16VMem.

 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
@@ -1571,8 +1569,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
 let SubtargetPredicate = HasPackedD16VMem in {
   defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
   defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
-  defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_XY">;
-  defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
+  defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
 } // End HasPackedD16VMem.

 multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
@@ -1633,8 +1630,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
 let SubtargetPredicate = HasPackedD16VMem in {
   defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
   defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
-  defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_XY">;
-  defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XYZW">;
+  defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">;
 } // End HasPackedD16VMem.

 //===----------------------------------------------------------------------===//
@@ -594,12 +594,6 @@ foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
   def intr#_pat_v4 : ImageDimPattern<intr, "_V4", v4f32>;
 }

-// v2f16 and v4f16 are used as data types to signal that D16 should be used.
-// However, they are not (always) legal types, and the SelectionDAG requires us
-// to legalize them before running any patterns. So we legalize them by
-// converting to an int type of equal size and using an internal 'd16helper'
-// intrinsic instead which signifies both the use of D16 and actually allows
-// this integer-based return type.
 multiclass ImageDimD16Helper<AMDGPUImageDimIntrinsic I,
                              AMDGPUImageDimIntrinsic d16helper> {
   let SubtargetPredicate = HasUnpackedD16VMem in {
@@ -611,7 +605,7 @@ multiclass ImageDimD16Helper<AMDGPUImageDimIntrinsic I,
   let SubtargetPredicate = HasPackedD16VMem in {
     def _packed_v1 : ImageDimPattern<I, "_V1", f16, "_D16">;
     def _packed_v2 : ImageDimPattern<I, "_V1", v2f16, "_D16">;
-    def _packed_v4 : ImageDimPattern<d16helper, "_V2", v2i32, "_D16">;
+    def _packed_v4 : ImageDimPattern<I, "_V2", v4f16, "_D16">;
   } // End HasPackedD16VMem.
 }

@@ -653,10 +647,7 @@ foreach intr = AMDGPUImageDimGatherIntrinsics in {
   } // End HasUnpackedD16VMem.

   let SubtargetPredicate = HasPackedD16VMem in {
-    def intr#_packed_v4 :
-        ImageDimPattern<!cast<AMDGPUImageDimIntrinsic>(
-          "int_SI_image_d16helper_" # intr.P.OpMod # intr.P.Dim.Name),
-        "_V2", v2i32, "_D16">;
+    def intr#_packed_v4 : ImageDimPattern<intr, "_V2", v4f16, "_D16">;
   } // End HasPackedD16VMem.
 }

@@ -703,6 +694,7 @@ multiclass ImageSamplePatterns<SDPatternOperator name, string opcode> {
   let SubtargetPredicate = HasPackedD16VMem in {
     defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
     defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), v2f16, "_D16">;
+    defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v4f16, "_D16">;
   } // End HasPackedD16VMem.
 }

@@ -712,16 +704,15 @@ multiclass ImageSampleAltPatterns<SDPatternOperator name, string opcode> {
     defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16_gfx80">;
     defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V4), v4i32, "_D16_gfx80">;
   } // End HasUnpackedD16VMem.
-
-  let SubtargetPredicate = HasPackedD16VMem in {
-    defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
-    defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">;
-  } // End HasPackedD16VMem.
 }

 // ImageGather4 patterns.
 multiclass ImageGather4Patterns<SDPatternOperator name, string opcode> {
   defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V4), v4f32>;
+
+  let SubtargetPredicate = HasPackedD16VMem in {
+    defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v4f16, "_D16">;
+  } // End HasPackedD16VMem.
 }

 // ImageGather4 alternative patterns for illegal vector half Types.
@@ -730,9 +721,6 @@ multiclass ImageGather4AltPatterns<SDPatternOperator name, string opcode> {
     defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V4), v4i32, "_D16_gfx80">;
   } // End HasUnpackedD16VMem.
-
-  let SubtargetPredicate = HasPackedD16VMem in {
-    defm : ImageSampleDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">;
-  } // End HasPackedD16VMem.
 }

 // ImageLoad for amdgcn.
@@ -766,6 +754,7 @@ multiclass ImageLoadPatterns<SDPatternOperator name, string opcode> {
   let SubtargetPredicate = HasPackedD16VMem in {
     defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
     defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), v2f16, "_D16">;
+    defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v4f16, "_D16">;
   } // End HasPackedD16VMem.
 }

@@ -775,11 +764,6 @@ multiclass ImageLoadAltPatterns<SDPatternOperator name, string opcode> {
     defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16_gfx80">;
     defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V4), v4i32, "_D16_gfx80">;
   } // End HasUnPackedD16VMem.
-
-  let SubtargetPredicate = HasPackedD16VMem in {
-    defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
-    defm : ImageLoadDataPatterns<name, !cast<string>(opcode # _V2), v2i32, "_D16">;
-  } // End HasPackedD16VMem.
 }

 // ImageStore for amdgcn.
@@ -813,6 +797,7 @@ multiclass ImageStorePatterns<SDPatternOperator name, string opcode> {
   let SubtargetPredicate = HasPackedD16VMem in {
     defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), f16, "_D16">;
     defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V1), v2f16, "_D16">;
+    defm : ImageStoreDataPatterns<name, !cast<string>(opcode # _V2), v4f16, "_D16">;
   } // End HasPackedD16VMem.
 }

@@ -143,6 +143,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     // Unless there are also VOP3P operations, not operations are really legal.
     addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
     addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
+    addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
+    addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
   }

   computeRegisterProperties(STI.getRegisterInfo());
@@ -237,7 +239,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
   for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
-                 MVT::v2i64, MVT::v2f64}) {
+                 MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16 }) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -260,6 +262,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     }
   }

+  setOperationAction(ISD::FP_EXTEND, MVT::v4f32, Expand);
+
   // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
   // is expanded to avoid having two separate loops in case the index is a VGPR.

@@ -426,7 +430,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     if (!Subtarget->hasFP16Denormals())
       setOperationAction(ISD::FMAD, MVT::f16, Legal);

-    for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
+    for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
       for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
         switch (Op) {
         case ISD::LOAD:
@@ -488,6 +492,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
     setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);

+    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Expand);
+
    if (!Subtarget->hasVOP3PInsts()) {
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2i16, Custom);
      setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
@@ -520,8 +528,31 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,

     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
+    setOperationAction(ISD::SHL, MVT::v4i16, Custom);
+    setOperationAction(ISD::SRA, MVT::v4i16, Custom);
+    setOperationAction(ISD::SRL, MVT::v4i16, Custom);
+    setOperationAction(ISD::ADD, MVT::v4i16, Custom);
+    setOperationAction(ISD::SUB, MVT::v4i16, Custom);
+    setOperationAction(ISD::MUL, MVT::v4i16, Custom);
+
+    setOperationAction(ISD::SMIN, MVT::v4i16, Custom);
+    setOperationAction(ISD::SMAX, MVT::v4i16, Custom);
+    setOperationAction(ISD::UMIN, MVT::v4i16, Custom);
+    setOperationAction(ISD::UMAX, MVT::v4i16, Custom);
+
+    setOperationAction(ISD::FADD, MVT::v4f16, Custom);
+    setOperationAction(ISD::FMUL, MVT::v4f16, Custom);
+    setOperationAction(ISD::FMINNUM, MVT::v4f16, Custom);
+    setOperationAction(ISD::FMAXNUM, MVT::v4f16, Custom);
+
+    setOperationAction(ISD::SELECT, MVT::v4i16, Custom);
+    setOperationAction(ISD::SELECT, MVT::v4f16, Custom);
   }

+  setOperationAction(ISD::FNEG, MVT::v4f16, Custom);
+  setOperationAction(ISD::FABS, MVT::v4f16, Custom);
+
   if (Subtarget->has16BitInsts()) {
     setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
     AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
@@ -3383,6 +3414,49 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
 // Custom DAG Lowering Operations
 //===----------------------------------------------------------------------===//

+// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
+// wider vector type is legal.
+SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  unsigned Opc = Op.getOpcode();
+  EVT VT = Op.getValueType();
+  assert(VT == MVT::v4f16);
+
+  SDValue Lo, Hi;
+  std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
+
+  SDLoc SL(Op);
+  SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo,
+                             Op->getFlags());
+  SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi,
+                             Op->getFlags());
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
+// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
+// wider vector type is legal.
+SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  unsigned Opc = Op.getOpcode();
+  EVT VT = Op.getValueType();
+  assert(VT == MVT::v4i16 || VT == MVT::v4f16);
+
+  SDValue Lo0, Hi0;
+  std::tie(Lo0, Hi0) = DAG.SplitVectorOperand(Op.getNode(), 0);
+  SDValue Lo1, Hi1;
+  std::tie(Lo1, Hi1) = DAG.SplitVectorOperand(Op.getNode(), 1);
+
+  SDLoc SL(Op);
+
+  SDValue OpLo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1,
+                             Op->getFlags());
+  SDValue OpHi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1,
+                             Op->getFlags());
+
+  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
+}
+
 SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   switch (Op.getOpcode()) {
   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
@@ -3423,6 +3497,24 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerTRAP(Op, DAG);
   case ISD::DEBUGTRAP:
     return lowerDEBUGTRAP(Op, DAG);
+  case ISD::FABS:
+  case ISD::FNEG:
+    return splitUnaryVectorOp(Op, DAG);
+  case ISD::SHL:
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::MUL:
+  case ISD::SMIN:
+  case ISD::SMAX:
+  case ISD::UMIN:
+  case ISD::UMAX:
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+  case ISD::FADD:
+  case ISD::FMUL:
+    return splitBinaryVectorOp(Op, DAG);
   }
   return SDValue();
 }
@@ -3630,21 +3722,23 @@ SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode,
   bool Unpacked = Subtarget->hasUnpackedD16VMem();
   EVT LoadVT = M->getValueType(0);

-  EVT UnpackedLoadVT = LoadVT.isVector() ?
-    EVT::getVectorVT(*DAG.getContext(), MVT::i32,
-                     LoadVT.getVectorNumElements()) : LoadVT;
   EVT EquivLoadVT = LoadVT;
-  if (LoadVT.isVector()) {
-    EquivLoadVT = Unpacked ? UnpackedLoadVT :
-                  getEquivalentMemType(*DAG.getContext(), LoadVT);
+  if (Unpacked && LoadVT.isVector()) {
+    EquivLoadVT = LoadVT.isVector() ?
+      EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                       LoadVT.getVectorNumElements()) : LoadVT;
   }

   // Change from v4f16/v2f16 to EquivLoadVT.
   SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);

-  SDValue Load = DAG.getMemIntrinsicNode(
-    IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode,
-    DL, VTList, Ops, M->getMemoryVT(), M->getMemOperand());
+  SDValue Load
+    = DAG.getMemIntrinsicNode(
+      IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL,
+      VTList, Ops, M->getMemoryVT(),
+      M->getMemOperand());
   if (!Unpacked) // Just adjusted the opcode.
     return Load;

   SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
|
|||
return;
|
||||
}
|
||||
case ISD::FNEG: {
|
||||
if (N->getValueType(0) != MVT::v2f16)
|
||||
break;
|
||||
|
||||
SDLoc SL(N);
|
||||
assert(N->getValueType(0) == MVT::v2f16);
|
||||
SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
|
||||
|
||||
SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32,
|
||||
|
@@ -3745,8 +3841,10 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
     return;
   }
   case ISD::FABS: {
+    if (N->getValueType(0) != MVT::v2f16)
+      break;
+
     SDLoc SL(N);
-    assert(N->getValueType(0) == MVT::v2f16);
     SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));

     SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32,
@@ -4247,6 +4345,23 @@ SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
   SDLoc SL(Op);
   EVT VT = Op.getValueType();

+  if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+    EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
+
+    // Turn into pair of packed build_vectors.
+    // TODO: Special case for constants that can be materialized with s_mov_b64.
+    SDValue Lo = DAG.getBuildVector(HalfVT, SL,
+                                    { Op.getOperand(0), Op.getOperand(1) });
+    SDValue Hi = DAG.getBuildVector(HalfVT, SL,
+                                    { Op.getOperand(2), Op.getOperand(3) });
+
+    SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
+    SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
+
+    SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
+    return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
+  }
+
   assert(VT == MVT::v2f16 || VT == MVT::v2i16);

   SDValue Lo = Op.getOperand(0);
@@ -4913,11 +5028,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,

   case Intrinsic::amdgcn_image_load:
   case Intrinsic::amdgcn_image_load_mip: {
-    EVT LoadVT = Op.getValueType();
-    if ((Subtarget->hasUnpackedD16VMem() && LoadVT == MVT::v2f16) ||
-        LoadVT == MVT::v4f16) {
-      MemSDNode *M = cast<MemSDNode>(Op);
-      return adjustLoadValueType(getImageOpcode(IntrID), M, DAG);
+    EVT VT = Op.getValueType();
+    if (Subtarget->hasUnpackedD16VMem() &&
+        VT.isVector() && VT.getScalarSizeInBits() == 16) {
+      return adjustLoadValueType(getImageOpcode(IntrID), cast<MemSDNode>(Op),
+                                 DAG);
     }

     return SDValue();
@@ -5009,8 +5124,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
       return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
     }

-    if ((Subtarget->hasUnpackedD16VMem() && Op.getValueType() == MVT::v2f16) ||
-        Op.getValueType() == MVT::v4f16) {
+    if (Subtarget->hasUnpackedD16VMem() &&
+        Op.getValueType().isVector() &&
+        Op.getValueType().getScalarSizeInBits() == 16) {
       return adjustLoadValueType(getImageOpcode(IntrID), cast<MemSDNode>(Op),
                                  DAG);
     }
@@ -5018,21 +5134,14 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
     return SDValue();
   }
   default:
-    EVT LoadVT = Op.getValueType();
-    if (LoadVT.getScalarSizeInBits() != 16)
-      return SDValue();
-
-    const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
-        AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID);
-    if (D16ImageDimIntr) {
-      bool Unpacked = Subtarget->hasUnpackedD16VMem();
-      MemSDNode *M = cast<MemSDNode>(Op);
-
-      if (isTypeLegal(LoadVT) && (!Unpacked || LoadVT == MVT::f16))
-        return SDValue();
-
-      return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr,
-                                 M, DAG, true);
+    if (Subtarget->hasUnpackedD16VMem() &&
+        Op.getValueType().isVector() &&
+        Op.getValueType().getScalarSizeInBits() == 16) {
+      if (const AMDGPU::D16ImageDimIntrinsic *D16ImageDimIntr =
+              AMDGPU::lookupD16ImageDimIntrinsicByIntr(IntrID)) {
+        return adjustLoadValueType(D16ImageDimIntr->D16HelperIntr,
+                                   cast<MemSDNode>(Op), DAG, true);
+      }
     }

     return SDValue();
@@ -5061,13 +5170,8 @@ SDValue SITargetLowering::handleD16VData(SDValue VData,
     return DAG.UnrollVectorOp(ZExt.getNode());
   }

-  if (isTypeLegal(StoreVT))
-    return VData;
-
-  // If target supports packed vmem, we just need to workaround
-  // the illegal type by casting to an equivalent one.
-  EVT EquivStoreVT = getEquivalentMemType(*DAG.getContext(), StoreVT);
-  return DAG.getNode(ISD::BITCAST, DL, EquivStoreVT, VData);
+  assert(isTypeLegal(StoreVT));
+  return VData;
 }

 SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
@@ -5261,9 +5365,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   case Intrinsic::amdgcn_image_store:
   case Intrinsic::amdgcn_image_store_mip: {
     SDValue VData = Op.getOperand(2);
-    if ((Subtarget->hasUnpackedD16VMem() &&
-         VData.getValueType() == MVT::v2f16) ||
-        VData.getValueType() == MVT::v4f16) {
+    EVT VT = VData.getValueType();
+    if (Subtarget->hasUnpackedD16VMem() &&
+        VT.isVector() && VT.getScalarSizeInBits() == 16) {
       SDValue Chain = Op.getOperand(0);

       VData = handleD16VData(VData, DAG);
@@ -5293,9 +5397,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     if (D16ImageDimIntr) {
       SDValue VData = Op.getOperand(2);
       EVT StoreVT = VData.getValueType();
-      if (((StoreVT == MVT::v2f16 || StoreVT == MVT::v4f16) &&
-           Subtarget->hasUnpackedD16VMem()) ||
-          !isTypeLegal(StoreVT)) {
+      if (Subtarget->hasUnpackedD16VMem() &&
+          StoreVT.isVector() &&
+          StoreVT.getScalarSizeInBits() == 16) {
         SmallVector<SDValue, 12> Ops(Op.getNode()->op_values());

         Ops[1] = DAG.getConstant(D16ImageDimIntr->D16HelperIntr, DL, MVT::i32);
@@ -5521,8 +5625,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 }

 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
-  if (Op.getValueType() != MVT::i64)
-    return SDValue();
+  EVT VT = Op.getValueType();
+  assert(VT.getSizeInBits() == 64);

   SDLoc DL(Op);
   SDValue Cond = Op.getOperand(0);
@@ -5544,7 +5648,7 @@ SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);

   SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
-  return DAG.getNode(ISD::BITCAST, DL, MVT::i64, Res);
+  return DAG.getNode(ISD::BITCAST, DL, VT, Res);
 }

 // Catch division cases where we can use shortcuts with rcp and rsq
@@ -268,7 +268,10 @@ public:
                          EVT VT) const override;
   MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
   bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+  SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
+  SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;

   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
@@ -795,6 +795,27 @@ foreach Index = 0-15 in {
   >;
 }

+
+def : Pat <
+  (extract_subvector v4i16:$vec, (i32 0)),
+  (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub0))
+>;
+
+def : Pat <
+  (extract_subvector v4i16:$vec, (i32 2)),
+  (v2i16 (EXTRACT_SUBREG v4i16:$vec, sub1))
+>;
+
+def : Pat <
+  (extract_subvector v4f16:$vec, (i32 0)),
+  (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub0))
+>;
+
+def : Pat <
+  (extract_subvector v4f16:$vec, (i32 2)),
+  (v2f16 (EXTRACT_SUBREG v4f16:$vec, sub1))
+>;
+
 let SubtargetPredicate = isGCN in {

 // FIXME: Why do only some of these type combinations for SReg and
@@ -834,6 +855,26 @@ def : BitConvert <f64, v2f32, VReg_64>;
 def : BitConvert <v2f32, f64, VReg_64>;
 def : BitConvert <f64, v2i32, VReg_64>;
 def : BitConvert <v2i32, f64, VReg_64>;
+
+// FIXME: Make SGPR
+def : BitConvert <v2i32, v4f16, VReg_64>;
+def : BitConvert <v4f16, v2i32, VReg_64>;
+def : BitConvert <v2i32, v4f16, VReg_64>;
+def : BitConvert <v2i32, v4i16, VReg_64>;
+def : BitConvert <v4i16, v2i32, VReg_64>;
+def : BitConvert <v2f32, v4f16, VReg_64>;
+def : BitConvert <v4f16, v2f32, VReg_64>;
+def : BitConvert <v2f32, v4i16, VReg_64>;
+def : BitConvert <v4i16, v2f32, VReg_64>;
+def : BitConvert <v4i16, f64, VReg_64>;
+def : BitConvert <v4f16, f64, VReg_64>;
+def : BitConvert <f64, v4i16, VReg_64>;
+def : BitConvert <f64, v4f16, VReg_64>;
+def : BitConvert <v4i16, i64, VReg_64>;
+def : BitConvert <v4f16, i64, VReg_64>;
+def : BitConvert <i64, v4i16, VReg_64>;
+def : BitConvert <i64, v4f16, VReg_64>;
+
 def : BitConvert <v4i32, v4f32, VReg_128>;
 def : BitConvert <v4f32, v4i32, VReg_128>;
@@ -435,22 +435,22 @@ def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
   let AllocationPriority = 7;
 }

-def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add SGPR_64Regs)> {
+def SGPR_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add SGPR_64Regs)> {
   let CopyCost = 1;
   let AllocationPriority = 8;
 }

-def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64], 32, (add TTMP_64Regs)> {
+def TTMP_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, v4i16, v4f16], 32, (add TTMP_64Regs)> {
   let isAllocatable = 0;
 }

-def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+def SReg_64_XEXEC : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
   (add SGPR_64, VCC, FLAT_SCR, XNACK_MASK, TTMP_64, TBA, TMA)> {
   let CopyCost = 1;
   let AllocationPriority = 8;
 }

-def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1], 32,
+def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, f64, i1, v4i16, v4f16], 32,
   (add SReg_64_XEXEC, EXEC)> {
   let CopyCost = 1;
   let AllocationPriority = 8;
@@ -505,7 +505,7 @@ def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
 }

 // Register class for all vector registers (VGPRs + Interploation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32], 32, (add VGPR_64)> {
+def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32, (add VGPR_64)> {
   let Size = 64;

   // Requires 2 v_mov_b32 to copy
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s

 ; This test just checks that the compiler doesn't crash.

@@ -126,3 +127,163 @@ end:
   store <2 x i64> %phi, <2 x i64> addrspace(1)* %out
   ret void
 }
+
+; FUNC-LABEL: {{^}}v4i16_to_f64:
+define amdgpu_kernel void @v4i16_to_f64(double addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
+  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 4
+  %add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
+  %bc = bitcast <4 x i16> %add.v4i16 to double
+  %fadd.bitcast = fadd double %bc, 1.0
+  store double %fadd.bitcast, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v4f16_to_f64:
+define amdgpu_kernel void @v4f16_to_f64(double addrspace(1)* %out, <4 x half> addrspace(1)* %in) nounwind {
+  %load = load <4 x half>, <4 x half> addrspace(1)* %in, align 4
+  %add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
+  %bc = bitcast <4 x half> %add.v4half to double
+  %fadd.bitcast = fadd double %bc, 1.0
+  store double %fadd.bitcast, double addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}f64_to_v4f16:
+define amdgpu_kernel void @f64_to_v4f16(<4 x half> addrspace(1)* %out, double addrspace(1)* %in) nounwind {
+  %load = load double, double addrspace(1)* %in, align 4
+  %fadd32 = fadd double %load, 1.0
+  %bc = bitcast double %fadd32 to <4 x half>
+  %add.bitcast = fadd <4 x half> %bc, <half 2.0, half 2.0, half 2.0, half 2.0>
+  store <4 x half> %add.bitcast, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}f64_to_v4i16:
+define amdgpu_kernel void @f64_to_v4i16(<4 x i16> addrspace(1)* %out, double addrspace(1)* %in) nounwind {
+  %load = load double, double addrspace(1)* %in, align 4
+  %fadd32 = fadd double %load, 1.0
+  %bc = bitcast double %fadd32 to <4 x i16>
+  %add.bitcast = add <4 x i16> %bc, <i16 2, i16 2, i16 2, i16 2>
+  store <4 x i16> %add.bitcast, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v4i16_to_i64:
+define amdgpu_kernel void @v4i16_to_i64(i64 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
+  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 4
+  %add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
+  %bc = bitcast <4 x i16> %add.v4i16 to i64
+  %add.bitcast = add i64 %bc, 1
+  store i64 %add.bitcast, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v4f16_to_i64:
+define amdgpu_kernel void @v4f16_to_i64(i64 addrspace(1)* %out, <4 x half> addrspace(1)* %in) nounwind {
+  %load = load <4 x half>, <4 x half> addrspace(1)* %in, align 4
+  %add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
+  %bc = bitcast <4 x half> %add.v4half to i64
+  %add.bitcast = add i64 %bc, 1
+  store i64 %add.bitcast, i64 addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_i64_to_v4i16:
+define amdgpu_kernel void @bitcast_i64_to_v4i16(<4 x i16> addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in, align 8
+  %add = add i64 %val, 4
+  %bc = bitcast i64 %add to <4 x i16>
+  %add.v4i16 = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
+  store <4 x i16> %add.v4i16, <4 x i16> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}bitcast_i64_to_v4f16:
+define amdgpu_kernel void @bitcast_i64_to_v4f16(<4 x half> addrspace(1)* %out, i64 addrspace(1)* %in) {
+  %val = load i64, i64 addrspace(1)* %in, align 8
+  %add = add i64 %val, 4
+  %bc = bitcast i64 %add to <4 x half>
+  %add.v4i16 = fadd <4 x half> %bc, <half 1.0, half 2.0, half 4.0, half 8.0>
+  store <4 x half> %add.v4i16, <4 x half> addrspace(1)* %out, align 8
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v4i16_to_v2f32:
+define amdgpu_kernel void @v4i16_to_v2f32(<2 x float> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
+  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 4
+  %add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
+  %bc = bitcast <4 x i16> %add.v4i16 to <2 x float>
+  %fadd.bitcast = fadd <2 x float> %bc, <float 1.0, float 1.0>
+  store <2 x float> %fadd.bitcast, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v4f16_to_v2f32:
+define amdgpu_kernel void @v4f16_to_v2f32(<2 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) nounwind {
+  %load = load <4 x half>, <4 x half> addrspace(1)* %in, align 4
+  %add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
+  %bc = bitcast <4 x half> %add.v4half to <2 x float>
+  %fadd.bitcast = fadd <2 x float> %bc, <float 1.0, float 1.0>
+  store <2 x float> %fadd.bitcast, <2 x float> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v2f32_to_v4i16:
+define amdgpu_kernel void @v2f32_to_v4i16(<4 x i16> addrspace(1)* %out, <2 x float> addrspace(1)* %in) nounwind {
+  %load = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
+  %add.v2f32 = fadd <2 x float> %load, <float 2.0, float 4.0>
+  %bc = bitcast <2 x float> %add.v2f32 to <4 x i16>
+  %add.bitcast = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
+  store <4 x i16> %add.bitcast, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v2f32_to_v4f16:
+define amdgpu_kernel void @v2f32_to_v4f16(<4 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) nounwind {
+  %load = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
+  %add.v2f32 = fadd <2 x float> %load, <float 2.0, float 4.0>
+  %bc = bitcast <2 x float> %add.v2f32 to <4 x half>
+  %add.bitcast = fadd <4 x half> %bc, <half 1.0, half 2.0, half 4.0, half 8.0>
+  store <4 x half> %add.bitcast, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v4i16_to_v2i32:
+define amdgpu_kernel void @v4i16_to_v2i32(<2 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
+  %load = load <4 x i16>, <4 x i16> addrspace(1)* %in, align 4
+  %add.v4i16 = add <4 x i16> %load, <i16 4, i16 4, i16 4, i16 4>
+  %bc = bitcast <4 x i16> %add.v4i16 to <2 x i32>
+  %add.bitcast = add <2 x i32> %bc, <i32 1, i32 1>
+  store <2 x i32> %add.bitcast, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v4f16_to_v2i32:
+define amdgpu_kernel void @v4f16_to_v2i32(<2 x i32> addrspace(1)* %out, <4 x half> addrspace(1)* %in) nounwind {
+  %load = load <4 x half>, <4 x half> addrspace(1)* %in, align 4
+  %add.v4half = fadd <4 x half> %load, <half 4.0, half 4.0, half 4.0, half 4.0>
+  %bc = bitcast <4 x half> %add.v4half to <2 x i32>
+  %add.bitcast = add <2 x i32> %bc, <i32 1, i32 1>
+  store <2 x i32> %add.bitcast, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v2i32_to_v4i16:
+define amdgpu_kernel void @v2i32_to_v4i16(<4 x i16> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+  %load = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 4
+  %add.v2i32 = add <2 x i32> %load, <i32 2, i32 4>
+  %bc = bitcast <2 x i32> %add.v2i32 to <4 x i16>
+  %add.bitcast = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
+  store <4 x i16> %add.bitcast, <4 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}v2i32_to_v4f16:
+define amdgpu_kernel void @v2i32_to_v4f16(<4 x half> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+  %load = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 4
+  %add.v2i32 = add <2 x i32> %load, <i32 2, i32 4>
+  %bc = bitcast <2 x i32> %add.v2i32 to <4 x half>
+  %add.bitcast = fadd <4 x half> %bc, <half 1.0, half 2.0, half 4.0, half 8.0>
+  store <4 x half> %add.bitcast, <4 x half> addrspace(1)* %out
+  ret void
+}
@@ -58,8 +58,14 @@ define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(
 }

 ; GCN-LABEL: {{^}}extract_vector_elt_v3f16:
-; GCN: s_load_dword s
-; GCN: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+
+; GFX89: s_load_dwordx2
+; GFX89: s_load_dwordx2
+
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
 define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 {
   %p0 = extractelement <3 x half> %foo, i32 0
   %p1 = extractelement <3 x half> %foo, i32 2
@@ -71,12 +77,14 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3

 ; FIXME: Why sometimes vector shift?
 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:
-; GCN: s_load_dword s
-; GCN: s_load_dword s
-; GCN: s_load_dword s
-; GFX9-DAG: global_load_short_d16_hi v
-; GFX9-DAG: global_load_short_d16 v
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+
+; GFX89: s_load_dwordx2 s
+; GFX89: s_load_dwordx2 s
+; GFX89: s_load_dword s

 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
 ; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v
|
@ -58,8 +58,15 @@ define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}extract_vector_elt_v3i16:
|
||||
; GCN: s_load_dword s
|
||||
; GCN: s_load_dword s
|
||||
; SI: s_load_dword s
|
||||
; SI: s_load_dwordx2 s
|
||||
; SI: s_load_dword s
|
||||
|
||||
; GFX89: s_load_dwordx2
|
||||
; GFX89: s_load_dwordx2
|
||||
|
||||
; GCN-NOT: {{buffer|flat|global}}_load
|
||||
|
||||
; GCN: buffer_store_short
|
||||
; GCN: buffer_store_short
|
||||
define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
|
||||
|
@@ -77,17 +84,11 @@ define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x
 ; SI: buffer_store_short
 ; SI: buffer_store_short

-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: buffer_store_short
-; VI: buffer_store_short
-
-; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c
-; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30
-; GFX9-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], [[LOAD0]]
-; GFX9-DAG: buffer_store_short [[VLOAD0]], off
-; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]]
-; GFX9-DAG: buffer_store_short [[VLOAD1]], off
+; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x2c
+; GFX89-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], s[[LOAD0]]
+; GFX89-DAG: buffer_store_short [[VLOAD0]], off
+; GFX89-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], s[[LOAD1]]
+; GFX89-DAG: buffer_store_short [[VLOAD1]], off
 define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
   %p0 = extractelement <4 x i16> %foo, i32 0
   %p1 = extractelement <4 x i16> %foo, i32 2
@@ -98,19 +99,28 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x
 }

 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3i16:
-; GCN: s_load_dword s
-; GCN: s_load_dword s
-; GCN: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+
+; GFX89-DAG: s_load_dwordx2
+; GFX89-DAG: s_load_dwordx2 s{{\[}}[[LOAD0:[0-9]+]]:[[LOAD1:[0-9]+]]{{\]}}, s[0:1], 0x2c
+; GFX89-DAG: s_load_dword s
+
+; GCN-NOT: {{buffer|flat|global}}

-; FIXME: Unnecessary repacking
-; GFX9: s_pack_ll_b32_b16
-; GFX9: s_pack_lh_b32_b16
+; SICI: buffer_store_short
+; SICI: buffer_store_short
+; SICI: buffer_store_short
+
+; SICI: buffer_load_ushort
+; SICI: buffer_store_short
+
+; GFX9-NOT: s_pack_ll_b32_b16
+; GFX9-NOT: s_pack_lh_b32_b16

 ; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
-; GCN: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s
+; GFX89: s_lshr_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LOAD0]]:[[LOAD1]]{{\]}}, s{{[0-9]+}}

 ; GCN: {{buffer|global}}_store_short
 define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
|
@ -39,11 +39,12 @@ define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}s_fabs_v4f16:
|
||||
; GCN: s_load_dword s
|
||||
; GCN: s_load_dword s
|
||||
; CI: s_load_dword s[[LO:[0-9]+]]
|
||||
; CI: s_load_dword s[[HI:[0-9]+]]
|
||||
; GFX89: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
|
||||
; GCN: s_mov_b32 [[MASK:s[0-9]+]], 0x7fff7fff
|
||||
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
|
||||
; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, [[MASK]]
|
||||
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[LO]], [[MASK]]
|
||||
; GCN-DAG: s_and_b32 s{{[0-9]+}}, s[[HI]], [[MASK]]
|
||||
; GCN: {{flat|global}}_store_dwordx2
|
||||
define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
|
||||
%fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
|
||||
|
|
|
@ -297,8 +297,8 @@ define void @void_func_v2i16(<2 x i16> %arg0) #0 {
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}void_func_v3i16:
|
||||
; GCN-DAG: buffer_store_dword v0, off
|
||||
; GCN-DAG: buffer_store_short v2, off
|
||||
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off
|
||||
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off
|
||||
define void @void_func_v3i16(<3 x i16> %arg0) #0 {
|
||||
store <3 x i16> %arg0, <3 x i16> addrspace(1)* undef
|
||||
ret void
|
||||
|
@@ -434,10 +434,17 @@ define void @void_func_v2f16(<2 x half> %arg0) #0 {
   ret void
 }

 ; FIXME: Different abi if f16 legal
 ; GCN-LABEL: {{^}}void_func_v3f16:
-; GFX9-NOT: v0
-; GCN-DAG: buffer_store_dword v0, off
-; GCN-DAG: buffer_store_short v2, off
+; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v0
+; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v1
+; CI-DAG: v_cvt_f16_f32_e32 v{{[0-9]+}}, v2
+
+; GFX89-DAG: v0
+; GFX89-DAG: v1
+
+; GCN-DAG: buffer_store_short
+; GCN-DAG: buffer_store_dword
 define void @void_func_v3f16(<3 x half> %arg0) #0 {
   store <3 x half> %arg0, <3 x half> addrspace(1)* undef
   ret void
|
|
@ -292,9 +292,8 @@ define <2 x i16> @v2i16_func_void() #0 {
|
|||
|
||||
; GCN-LABEL: {{^}}v3i16_func_void:
|
||||
; GFX9: buffer_load_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
|
||||
; GFX9: s_waitcnt vmcnt(0)
|
||||
; GFX9: v_lshrrev_b32
|
||||
; GFX9: s_setpc_b64
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX9-NEXT: s_setpc_b64
|
||||
define <3 x i16> @v3i16_func_void() #0 {
|
||||
%val = load <3 x i16>, <3 x i16> addrspace(1)* undef
|
||||
ret <3 x i16> %val
|
||||
|
|
|
@@ -22,14 +22,20 @@ define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x ha
 }

 ; GCN-LABEL: {{^}}load_v3f16_arg:
-; GCN: s_load_dword s
-; GCN: s_load_dword s
-; GCN-NOT: {buffer|flat|global}}_load_
-; GCN-NOT: {{flat|global}}_load
-; GCN-DAG: {{flat|global}}_store_dword
-; GCN-DAG: {{flat|global}}_store_short
-; GCN-NOT: {{flat|global}}_store
+; SI: s_load_dwordx2
+; SI: s_load_dword s
+; SI: s_load_dword s
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
+
+; GCN-NOT: _load
+; GCN-DAG: _store_dword
+; GCN-DAG: _store_short
+; GCN-NOT: _store
 ; GCN: s_endpgm
 define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
   store <3 x half> %arg, <3 x half> addrspace(1)* %out
@@ -39,10 +45,13 @@ define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x ha

+; FIXME: Why not one load?
 ; GCN-LABEL: {{^}}load_v4f16_arg:
-; GCN-DAG: s_load_dword [[ARG0_LO:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x2|0x8}}
-; GCN-DAG: s_load_dword [[ARG0_HI:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0x3|0xc}}
-; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], [[ARG0_LO]]
-; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], [[ARG0_HI]]
+; SI-DAG: s_load_dword s[[ARG0_LO:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x2
+; SI-DAG: s_load_dword s[[ARG0_HI:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x3
+
+; VI: s_load_dwordx2 s{{\[}}[[ARG0_LO:[0-9]+]]:[[ARG0_HI:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
+
+; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_LO:[0-9]+]], s[[ARG0_LO]]
+; GCN-DAG: v_mov_b32_e32 v[[V_ARG0_HI:[0-9]+]], s[[ARG0_HI]]
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[V_ARG0_LO]]:[[V_ARG0_HI]]{{\]}}
 define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
   store <4 x half> %arg, <4 x half> addrspace(1)* %out
@@ -77,8 +86,14 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)*
 }

 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f32_arg:
-; GCN: s_load_dword s
-; GCN: s_load_dword s
+; SI: s_load_dwordx2 s
+; SI: s_load_dword s
+; SI: s_load_dword s
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
+; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+
 ; GCN-NOT: _load
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
@@ -101,10 +116,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)*
 }

 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f32_arg:
-; GCN: s_load_dword s
-; GCN: s_load_dword s
-; GCN: s_load_dword s
-; GCN: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+
+; VI: s_load_dwordx2 s
+; VI: s_load_dwordx2 s
+; VI: s_load_dwordx2 s
+
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: v_cvt_f32_f16_e32
@@ -150,8 +169,12 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)*
 }

 ; GCN-LABEL: {{^}}extload_v3f16_to_v3f64_arg:
-; GCN: s_load_dword
-; GCN: s_load_dword
+; SI: s_load_dword
+; SI: s_load_dword
+
+; VI: s_load_dwordx2
+; VI: s_load_dwordx2
+
 ; GCN: s_lshr_b32

 ; GCN-DAG: v_cvt_f32_f16_e32
@@ -168,8 +191,10 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)*
 }

 ; GCN-LABEL: {{^}}extload_v4f16_to_v4f64_arg:
-; GCN: s_load_dword s
-; GCN: s_load_dword s
+; SI: s_load_dword s
+; SI: s_load_dword s
+
+; VI: s_load_dwordx2 s
+
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_cvt_f32_f16_e32
@@ -187,11 +212,14 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)*
 }

 ; GCN-LABEL: {{^}}extload_v8f16_to_v8f64_arg:
-; GCN: s_load_dword s
-; GCN-NEXT: s_load_dword s
-; GCN-NEXT: s_load_dword s
-; GCN-NEXT: s_load_dword s
-; GCN-NOT: _load_
+; SI: s_load_dword s
+; SI-NEXT: s_load_dword s
+; SI-NEXT: s_load_dword s
+; SI-NEXT: s_load_dword s
+; SI-NOT: _load_
+
+; VI: s_load_dwordx2 s
+; VI: s_load_dwordx2 s
+
 ; GCN-DAG: v_cvt_f32_f16_e32
 ; GCN-DAG: v_cvt_f32_f16_e32
|
@ -226,8 +226,11 @@ entry:
|
|||
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 46
|
||||
; EG-DAG: VTX_READ_16 T{{[0-9]}}.X, T{{[0-9]}}.X, 48
|
||||
|
||||
; GCN: s_load_dword s
|
||||
; GCN: s_load_dword s
|
||||
; SI: s_load_dword s
|
||||
; SI: s_load_dword s
|
||||
|
||||
; VI-HSA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x8
|
||||
; VI-MESA: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
|
||||
define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
|
||||
entry:
|
||||
store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
|
||||
|
@@ -291,11 +294,8 @@ entry:
 ; SI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
 ; SI-DAG: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x9

-; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
-; MESA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x30
-
-; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x8
-; HSA-VI-DAG: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xc
+; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x2c
+; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
 define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
 entry:
   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
@@ -391,11 +391,11 @@ entry:
 ; SI-NOT: {{buffer|flat|global}}_load


-; VI: s_load_dwordx2
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
+; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x34
+; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x3c
+
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x10
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x18
 define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
 entry:
   store <8 x i16> %in, <8 x i16> addrspace(1)* %out
@@ -528,14 +528,15 @@ entry:
 ; SI-NOT: {{buffer|flat|global}}_load


-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
-; VI: s_load_dword s
+; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
+; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x4c
+; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x54
+; MESA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x5c
+
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x20
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x28
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x30
+; HSA-VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[4:5], 0x38
 define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
 entry:
   store <16 x i16> %in, <16 x i16> addrspace(1)* %out
|
@ -29,27 +29,21 @@ main_body:
|
|||
}
|
||||
|
||||
; GCN-LABEL: {{^}}buffer_store_format_d16_xyzw:
|
||||
|
||||
; UNPACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
|
||||
; UNPACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x14
|
||||
; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10
|
||||
|
||||
; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
|
||||
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16
|
||||
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]]
|
||||
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16
|
||||
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]]
|
||||
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
|
||||
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
|
||||
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
|
||||
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]
|
||||
|
||||
; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
|
||||
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
|
||||
|
||||
; UNPACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
|
||||
|
||||
|
||||
; PACKED-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
|
||||
; PACKED-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x14
|
||||
|
||||
; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]]
|
||||
; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]]
|
||||
; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
|
||||
; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
|
||||
|
||||
; PACKED: buffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], 0 idxen
|
||||
define amdgpu_kernel void @buffer_store_format_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %index) {
|
||||
|
|
|
@@ -66,9 +66,6 @@ main_body:
 ; UNPACKED-DAG: s_and_b32 [[UNPACK_0:s[0-9]+]], [[DATA]], 0xffff
 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_0:[0-9]+]], [[UNPACK_0]]
 ; UNPACKED-DAG: v_mov_b32_e32 v[[V_UNPACK_1:[0-9]+]], [[UNPACK_1]]
-
-
-
 ; UNPACKED: image_store v{{\[}}[[V_UNPACK_0]]:[[V_UNPACK_1]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16

 ; PACKED: image_store v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x3 unorm d16
@@ -78,19 +75,17 @@ main_body:
   ret void
 }

-; GCN-LABEL: {{^}}image_store_v4f16
-; UNPACKED: s_load_dword s
-; UNPACKED: s_load_dword s
-; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
-; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
+; GCN-LABEL: {{^}}image_store_v4f16:
+; UNPACKED: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
+; UNPACKED-DAG: s_lshr_b32 s{{[0-9]+}}, s[[LO]], 16
+; UNPACKED-DAG: s_lshr_b32 s{{[0-9]+}}, s[[HI]], 16
 ; UNPACKED: s_and_b32
 ; UNPACKED: s_and_b32
 ; UNPACKED: image_store v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16

-; PACKED: s_load_dword [[DATA0:s[0-9]+]]
-; PACKED: s_load_dword [[DATA1:s[0-9]+]]
-; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]]
-; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]]
+; PACKED: s_load_dwordx2 s{{\[}}[[DATA0:[0-9]+]]:[[DATA1:[0-9]+]]{{\]}}
+; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[DATA0]]
+; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[DATA1]]
 ; PACKED: image_store v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
 define amdgpu_kernel void @image_store_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
 main_body:
@ -98,19 +93,17 @@ main_body:
  ret void
}

; GCN-LABEL: {{^}}image_store_mip_v4f16
; UNPACKED: s_load_dword s
; UNPACKED: s_load_dword s
; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; UNPACKED: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 16
; GCN-LABEL: {{^}}image_store_mip_v4f16:
; UNPACKED: s_load_dwordx2 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
; UNPACKED-DAG: s_lshr_b32 s{{[0-9]+}}, s[[LO]], 16
; UNPACKED-DAG: s_lshr_b32 s{{[0-9]+}}, s[[HI]], 16
; UNPACKED: s_and_b32
; UNPACKED: s_and_b32
; UNPACKED: image_store_mip v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16

; PACKED: s_load_dword [[DATA0:s[0-9]+]]
; PACKED: s_load_dword [[DATA1:s[0-9]+]]
; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[DATA0]]
; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], [[DATA1]]
; PACKED: s_load_dwordx2 s{{\[}}[[DATA0:[0-9]+]]:[[DATA1:[0-9]+]]{{\]}}
; PACKED: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[DATA0]]
; PACKED: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[DATA1]]
; PACKED: image_store_mip v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf unorm d16
define amdgpu_kernel void @image_store_mip_v4f16(<4 x half> %data, <4 x i32> %coords, <8 x i32> inreg %rsrc) {
main_body:
@ -29,22 +29,21 @@ main_body:
}

; GCN-LABEL: {{^}}tbuffer_store_d16_xyzw:
; GCN-DAG: s_load_dword [[S_DATA_0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x10
; GCN-DAG: s_load_dword [[S_DATA_1:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x14
; GCN-DAG: s_load_dwordx2 s{{\[}}[[S_DATA_0:[0-9]+]]:[[S_DATA_1:[0-9]+]]{{\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x10

; UNPACKED-DAG: s_mov_b32 [[K:s[0-9]+]], 0xffff{{$}}
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], [[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], [[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], [[S_DATA_1]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], [[S_DATA_1]], [[K]]
; UNPACKED-DAG: s_lshr_b32 [[SHR0:s[0-9]+]], s[[S_DATA_0]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED0:s[0-9]+]], s[[S_DATA_0]], [[K]]
; UNPACKED-DAG: s_lshr_b32 [[SHR1:s[0-9]+]], s[[S_DATA_1]], 16
; UNPACKED-DAG: s_and_b32 [[MASKED1:s[0-9]+]], s[[S_DATA_1]], [[K]]

; UNPACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], [[MASKED0]]
; UNPACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SHR1]]
; UNPACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen

; PACKED: v_mov_b32_e32 v[[LO:[0-9]+]], [[S_DATA_0]]
; PACKED: v_mov_b32_e32 v[[HI:[0-9]+]], [[S_DATA_1]]
; PACKED-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s[[S_DATA_0]]
; PACKED-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s[[S_DATA_1]]
; PACKED: tbuffer_store_format_d16_xyzw v{{\[}}[[LO]]:[[HI]]{{\]}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], dfmt:1, nfmt:2, 0 idxen
define amdgpu_kernel void @tbuffer_store_d16_xyzw(<4 x i32> %rsrc, <4 x half> %data, i32 %vindex) {
main_body:
@ -94,9 +94,12 @@ define <2 x half> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half

; GCN-LABEL: {{^}}v_mad_mix_v3f32:
; GCN: s_waitcnt
; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v3, v6 op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixlo_f16 v1, v1, v4, v7 op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixlo_f16 v2, v2, v5, v8 op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mov_b32_e32 v0, v6
; GFX9-NEXT: v_mov_b32_e32 v1, v7
; GFX9-NEXT: s_setpc_b64
define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
  %src0.ext = fpext <3 x half> %src0 to <3 x float>
@ -110,11 +113,11 @@ define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half
; GCN-LABEL: {{^}}v_mad_mix_v4f32:
; GCN: s_waitcnt
; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mov_b32_e32 v0, v6
; GFX9-NEXT: v_mov_b32_e32 v1, v2
; GFX9-NEXT: v_mov_b32_e32 v1, v7
; GFX9-NEXT: s_setpc_b64
define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
  %src0.ext = fpext <4 x half> %src0 to <4 x float>
@ -145,14 +148,12 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; FIXME: Should be packed into 2 registers per argument?
; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt:
; GCN: s_waitcnt
; GFX9-NEXT: v_mad_mixlo_f16 v2, v2, v5, v8 op_sel_hi:[1,1,1]
; GFX9-NEXT: v_mad_mixlo_f16 v0, v0, v3, v6 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: s_movk_i32 s6, 0x7e00
; GFX9-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-NEXT: v_lshl_or_b32 v2, s6, 16, v2
; GFX9-NEXT: v_mad_mixhi_f16 v0, v1, v4, v7 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_pk_max_f16 v2, v2, v2 clamp
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mov_b32_e32 v0, v6
; GFX9-NEXT: v_mov_b32_e32 v1, v7
; GFX9-NEXT: s_setpc_b64
define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
  %src0.ext = fpext <3 x half> %src0 to <3 x float>
@ -168,12 +169,12 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt:
; GCN: s_waitcnt
; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9-DAG: v_mov_b32_e32 v0, v6
; GFX9-DAG: v_mov_b32_e32 v1, v2
; GFX9: s_setpc_b64
; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9-NEXT: v_mov_b32_e32 v0, v6
; GFX9-NEXT: v_mov_b32_e32 v1, v7
; GFX9-NEXT: s_setpc_b64
define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
  %src0.ext = fpext <4 x half> %src0 to <4 x float>
  %src1.ext = fpext <4 x half> %src1 to <4 x float>
@ -243,10 +244,14 @@ define <2 x half> @v_mad_mix_v2f32_clamp_precvt(<2 x half> %src0, <2 x half> %sr
  ret <2 x half> %cvt.result
}

; FIXME: Handling undef 4th component
; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt:
; GFX9: v_mad_mix_f32 v0, v0, v3, v6 op_sel_hi:[1,1,1] clamp
; GFX9: v_mad_mix_f32 v1, v1, v4, v7 op_sel_hi:[1,1,1] clamp
; GFX9: v_mad_mix_f32 v2, v2, v5, v8 op_sel_hi:[1,1,1] clamp
; GFX9: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
; GFX9: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; GFX9: v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
; GFX9: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]

; GFX9: v_cvt_f16_f32
; GFX9: v_cvt_f16_f32
; GFX9: v_cvt_f16_f32
; GFX9: v_cvt_f16_f32
@ -66,15 +66,10 @@ define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) {
; VI: v_mul_lo_u16
; VI: v_mul_lo_u16

; GFX9: v_and_b32
; GFX9: v_and_b32
; GFX9: v_lshl_or_b32
; GFX9: v_lshl_or_b32
; GFX9: v_lshl_or_b32

; GFX9: v_pk_mul_lo_u16
; GFX9: v_pk_mul_lo_u16
; GFX9: s_setpc_b64
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16
; GFX9-NEXT: v_pk_mul_lo_u16
; GFX9-NEXT: s_setpc_b64
define <3 x i16> @v_mul_v3i16(<3 x i16> %a, <3 x i16> %b) {
  %r.val = mul <3 x i16> %a, %b
  ret <3 x i16> %r.val
@ -94,8 +89,8 @@ define <3 x i16> @v_mul_v3i16(<3 x i16> %a, <3 x i16> %b) {
; VI: v_or_b32_e32

; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX9-NEXT: s_setpc_b64
define <4 x i16> @v_mul_v4i16(<4 x i16> %a, <4 x i16> %b) {
  %r.val = mul <4 x i16> %a, %b
@ -0,0 +1,20 @@
; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s

; Make sure reduceBuildVecExtToExtBuildVec combine doesn't regress
; code with legal v4i16. The v4i16 build_vector it produces will be
; custom lowered into an i32 based build_vector, producing a mess that
; nothing manages to put back together. (An illustrative sketch of the
; rewritten form follows this test.)

; GCN-LABEL: {{^}}v2i16_to_i64:
; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_add_u16 v0, v0, v1
; GFX9-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX9-NEXT: s_setpc_b64
define i64 @v2i16_to_i64(<2 x i16> %x, <2 x i16> %y) {
  %x.add = add <2 x i16> %x, %y
  %zext = zext <2 x i16> %x.add to <2 x i32>
  %arst = bitcast <2 x i32> %zext to i64
  ret i64 %arst
}
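For context on the comment in the new test above, this is roughly the shape being guarded against. The rewritten IR below is an illustrative assumption about what the DAG combine would produce on this input (little-endian lane layout assumed); it is not text from the commit.

; Hypothetical sketch: the combine can rewrite the widening zext in
; @v2i16_to_i64 from
;   %zext = zext <2 x i16> %x.add to <2 x i32>
; into the equivalent of a <4 x i16> build_vector that interleaves the
; elements with zeros and then reinterprets the bits:
;   %widened = shufflevector <2 x i16> %x.add, <2 x i16> zeroinitializer,
;                            <4 x i32> <i32 0, i32 2, i32 1, i32 3>
;   %zext = bitcast <4 x i16> %widened to <2 x i32>
; With v4i16 legal, that build_vector survives to the custom lowering, so
; the checks above guard that the final code still folds down to
; v_pk_add_u16 plus a shift and a mask rather than per-element inserts.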
@ -110,13 +110,9 @@ define amdgpu_kernel void @v_select_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16
; SI: cndmask
; SI-NOT: cndmask

; GFX9: v_cndmask_b32_e32
; GFX9: cndmask
; GFX9-NOT: cndmask

; VI: v_cndmask_b32
; VI: v_cndmask_b32
; VI: v_cndmask_b32
; GFX89: v_cndmask_b32_e32
; GFX89: cndmask
; GFX89-NOT: cndmask
define amdgpu_kernel void @v_select_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr, i32 %c) #0 {
  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.ptr
  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.ptr
@ -112,15 +112,15 @@ define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16>
}

; GCN-LABEL: {{^}}s_abs_v4i16:
; GFX9: s_load_dword [[VAL0:s[0-9]+]]
; GFX9: s_load_dword [[VAL1:s[0-9]+]]
; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, [[VAL0]]
; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], [[VAL0]], [[SUB0]]
; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2
; GFX9: s_load_dwordx2 s{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}, s[0:1], 0x2c
; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, s[[VAL0]]
; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, s[[VAL1]]

; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, [[VAL1]]
; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], [[VAL1]], [[SUB1]]
; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2
; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], s[[VAL0]], [[SUB0]]
; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], s[[VAL1]], [[SUB1]]

; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2 op_sel_hi:[1,0]
; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2 op_sel_hi:[1,0]
define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 {
  %z0 = insertelement <4 x i16> undef, i16 0, i16 0
  %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
@ -197,8 +197,8 @@ define amdgpu_kernel void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i

; GCN-LABEL: {{^}}s_min_max_v4i16:
; GFX9: v_pk_max_i16
; GFX9: v_pk_max_i16
; GFX9: v_pk_min_i16
; GFX9: v_pk_max_i16
; GFX9: v_pk_min_i16
define amdgpu_kernel void @s_min_max_v4i16(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 {
  %cond0 = icmp sgt <4 x i16> %val0, %val1