forked from OSchip/llvm-project
AMDGPU: Custom lower v4i16/v4f16 vector operations
Avoids stack access. Also handles the pattern that extracts the high element via truncate + shift, to avoid a couple of test regressions. llvm-svn: 332453
This commit is contained in:
parent
ca22d427b9
commit
67a9815a5c
|
@ -3144,6 +3144,28 @@ SDValue AMDGPUTargetLowering::performTruncateCombine(
|
|||
}
|
||||
}
|
||||
|
||||
// Equivalent of above for accessing the high element of a vector as an
|
||||
// integer operation.
|
||||
// trunc (srl (bitcast (build_vector x, y)), 16) -> trunc (bitcast y)
|
||||
if (Src.getOpcode() == ISD::SRL) {
|
||||
if (auto K = isConstOrConstSplat(Src.getOperand(1))) {
|
||||
if (2 * K->getZExtValue() == Src.getValueType().getScalarSizeInBits()) {
|
||||
SDValue BV = stripBitcast(Src.getOperand(0));
|
||||
if (BV.getOpcode() == ISD::BUILD_VECTOR &&
|
||||
BV.getValueType().getVectorNumElements() == 2) {
|
||||
SDValue SrcElt = BV.getOperand(1);
|
||||
EVT SrcEltVT = SrcElt.getValueType();
|
||||
if (SrcEltVT.isFloatingPoint()) {
|
||||
SrcElt = DAG.getNode(ISD::BITCAST, SL,
|
||||
SrcEltVT.changeTypeToInteger(), SrcElt);
|
||||
}
|
||||
|
||||
return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
|
||||
//
|
||||
// i16 (trunc (srl i64:x, K)), K <= 16 ->
|
||||
|
|
|
@ -137,6 +137,10 @@ public:
|
|||
return false;
|
||||
}
|
||||
|
||||
// Look through a single ISD::BITCAST: if Val is a BITCAST node, return its
// source operand; otherwise return Val unchanged. Only one level is removed;
// a bitcast of a bitcast is not stripped recursively.
static inline SDValue stripBitcast(SDValue Val) {
|
||||
return Val.getOpcode() == ISD::BITCAST ? Val.getOperand(0) : Val;
|
||||
}
|
||||
|
||||
static bool allUsesHaveSourceMods(const SDNode *N,
|
||||
unsigned CostThreshold = 4);
|
||||
bool isFAbsFree(EVT VT) const override;
|
||||
|
|
|
@ -288,13 +288,24 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
|
|||
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
|
||||
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16f32, Expand);
|
||||
|
||||
setOperationAction(ISD::BUILD_VECTOR, MVT::v4f16, Custom);
|
||||
setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
|
||||
|
||||
// Avoid stack access for these.
|
||||
// TODO: Generalize to more vector types.
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i16, Custom);
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f16, Custom);
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
|
||||
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
|
||||
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i16, Custom);
|
||||
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f16, Custom);
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i16, Custom);
|
||||
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f16, Custom);
|
||||
|
||||
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
|
||||
// and output demarshalling
|
||||
setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i32, Custom);
|
||||
|
@ -3333,6 +3344,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
|
|||
return lowerINSERT_VECTOR_ELT(Op, DAG);
|
||||
case ISD::EXTRACT_VECTOR_ELT:
|
||||
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
|
||||
case ISD::BUILD_VECTOR:
|
||||
return lowerBUILD_VECTOR(Op, DAG);
|
||||
case ISD::FP_ROUND:
|
||||
return lowerFP_ROUND(Op, DAG);
|
||||
case ISD::TRAP:
|
||||
|
@ -4157,34 +4170,72 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
|
|||
|
||||
SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
|
||||
SelectionDAG &DAG) const {
|
||||
SDValue Vec = Op.getOperand(0);
|
||||
SDValue InsVal = Op.getOperand(1);
|
||||
SDValue Idx = Op.getOperand(2);
|
||||
EVT VecVT = Vec.getValueType();
|
||||
|
||||
assert(VecVT.getScalarSizeInBits() == 16);
|
||||
|
||||
unsigned NumElts = VecVT.getVectorNumElements();
|
||||
SDLoc SL(Op);
|
||||
auto KIdx = dyn_cast<ConstantSDNode>(Idx);
|
||||
|
||||
if (NumElts == 4 && KIdx) {
|
||||
SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
|
||||
|
||||
SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
|
||||
DAG.getConstant(0, SL, MVT::i32));
|
||||
SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
|
||||
DAG.getConstant(1, SL, MVT::i32));
|
||||
|
||||
SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
|
||||
SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
|
||||
|
||||
unsigned Idx = KIdx->getZExtValue();
|
||||
bool InsertLo = Idx < 2;
|
||||
SDValue InsHalf = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16,
|
||||
InsertLo ? LoVec : HiVec,
|
||||
DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
|
||||
DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
|
||||
|
||||
InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
|
||||
|
||||
SDValue Concat = InsertLo ?
|
||||
DAG.getBuildVector(MVT::v2i32, SL, { InsHalf, HiHalf }) :
|
||||
DAG.getBuildVector(MVT::v2i32, SL, { LoHalf, InsHalf });
|
||||
|
||||
return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
|
||||
}
|
||||
|
||||
assert(NumElts == 2 || NumElts == 4);
|
||||
|
||||
if (isa<ConstantSDNode>(Idx))
|
||||
return SDValue();
|
||||
|
||||
EVT IntVT = NumElts == 2 ? MVT::i32 : MVT::i64;
|
||||
|
||||
// Avoid stack access for dynamic indexing.
|
||||
SDLoc SL(Op);
|
||||
SDValue Vec = Op.getOperand(0);
|
||||
SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Op.getOperand(1));
|
||||
SDValue Val = DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal);
|
||||
|
||||
// v_bfi_b32 (v_bfm_b32 16, (shl idx, 4)), val, vec
|
||||
SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Val);
|
||||
SDValue ExtVal = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Val);
|
||||
|
||||
// Convert vector index to bit-index.
|
||||
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx,
|
||||
DAG.getConstant(4, SL, MVT::i32));
|
||||
|
||||
SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
|
||||
|
||||
SDValue BFM = DAG.getNode(ISD::SHL, SL, MVT::i32,
|
||||
DAG.getConstant(0xffff, SL, MVT::i32),
|
||||
SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
|
||||
SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
|
||||
DAG.getConstant(0xffff, SL, IntVT),
|
||||
ScaledIdx);
|
||||
|
||||
SDValue LHS = DAG.getNode(ISD::AND, SL, MVT::i32, BFM, ExtVal);
|
||||
SDValue RHS = DAG.getNode(ISD::AND, SL, MVT::i32,
|
||||
DAG.getNOT(SL, BFM, MVT::i32), BCVec);
|
||||
SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
|
||||
SDValue RHS = DAG.getNode(ISD::AND, SL, IntVT,
|
||||
DAG.getNOT(SL, BFM, IntVT), BCVec);
|
||||
|
||||
SDValue BFI = DAG.getNode(ISD::OR, SL, MVT::i32, LHS, RHS);
|
||||
return DAG.getNode(ISD::BITCAST, SL, Op.getValueType(), BFI);
|
||||
SDValue BFI = DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS);
|
||||
return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
|
||||
}
|
||||
|
||||
SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
|
||||
|
@ -4194,6 +4245,9 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
|
|||
EVT ResultVT = Op.getValueType();
|
||||
SDValue Vec = Op.getOperand(0);
|
||||
SDValue Idx = Op.getOperand(1);
|
||||
EVT VecVT = Vec.getValueType();
|
||||
unsigned NumElts = VecVT.getVectorNumElements();
|
||||
assert(VecVT.getScalarSizeInBits() == 16 && (NumElts == 2 || NumElts == 4));
|
||||
|
||||
DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
|
||||
|
||||
|
@ -4204,19 +4258,43 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
|
|||
if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
|
||||
return Combined;
|
||||
|
||||
EVT IntVT = NumElts == 2 ? MVT::i32 : MVT::i64;
|
||||
SDValue Four = DAG.getConstant(4, SL, MVT::i32);
|
||||
|
||||
// Convert vector index to bit-index (* 16)
|
||||
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, Four);
|
||||
|
||||
SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
|
||||
SDValue Elt = DAG.getNode(ISD::SRL, SL, MVT::i32, BC, ScaledIdx);
|
||||
SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
|
||||
SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
|
||||
|
||||
SDValue Result = Elt;
|
||||
if (ResultVT.bitsLT(MVT::i32))
|
||||
Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Result);
|
||||
if (ResultVT == MVT::f16) {
|
||||
SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
|
||||
return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
|
||||
}
|
||||
|
||||
return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
|
||||
return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
|
||||
}
|
||||
|
||||
// Custom lowering for a four-element 16-bit BUILD_VECTOR (v4i16 / v4f16).
//
// The four operands are regrouped as two 2-element build_vectors with the
// same element type (operands 0,1 and operands 2,3). Each pair is bitcast to
// i32, the two i32 values are combined into a v2i32 build_vector, and that
// is bitcast back to the original vector type.
//
// Op:  the BUILD_VECTOR node; its type must be v4i16 or v4f16 (asserted).
// DAG: the SelectionDAG in which the replacement nodes are created.
// Returns the bitcast of the v2i32 build_vector, typed as Op's value type.
SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
|
||||
SelectionDAG &DAG) const {
|
||||
SDLoc SL(Op);
|
||||
EVT VT = Op.getValueType();
|
||||
assert(VT == MVT::v4i16 || VT == MVT::v4f16);
|
||||
|
||||
// Two-element vector type with the same element type as VT.
EVT HalfVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
|
||||
|
||||
// Turn into pair of packed build_vectors.
|
||||
// TODO: Special case for constants that can be materialized with s_mov_b64.
|
||||
SDValue Lo = DAG.getBuildVector(HalfVT, SL,
|
||||
{ Op.getOperand(0), Op.getOperand(1) });
|
||||
SDValue Hi = DAG.getBuildVector(HalfVT, SL,
|
||||
{ Op.getOperand(2), Op.getOperand(3) });
|
||||
|
||||
// Reinterpret each packed pair as one i32 so the halves can be combined
// as elements of a v2i32.
SDValue CastLo = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Lo);
|
||||
SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Hi);
|
||||
|
||||
// Element 0 holds operands 0-1, element 1 holds operands 2-3.
SDValue Blend = DAG.getBuildVector(MVT::v2i32, SL, { CastLo, CastHi });
|
||||
return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
|
||||
}
|
||||
|
||||
bool
|
||||
|
|
|
@ -84,6 +84,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
|
|||
SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
|
||||
SDValue lowerTRAP(SDValue Op, SelectionDAG &DAG) const;
|
||||
|
||||
SDNode *adjustWritemask(MachineSDNode *&N, SelectionDAG &DAG) const;
|
||||
|
|
|
@ -7,17 +7,17 @@ target datalayout = "A5"
|
|||
; size and not 4 corresponding to the sign-extended size (i32).
|
||||
|
||||
; DEBUG: {{^}}# Machine code for function extload_align:
|
||||
; DEBUG: (load 2, addrspace 5)
|
||||
; DEBUG: (volatile load 2 from %ir.a, addrspace 5)
|
||||
; DEBUG: {{^}}# End machine code for function extload_align.
|
||||
|
||||
define amdgpu_kernel void @extload_align(i32 addrspace(5)* %out, i32 %index) #0 {
|
||||
%v0 = alloca [4 x i16], addrspace(5)
|
||||
%a1 = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 0
|
||||
%a2 = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 1
|
||||
store i16 0, i16 addrspace(5)* %a1
|
||||
store i16 1, i16 addrspace(5)* %a2
|
||||
store volatile i16 0, i16 addrspace(5)* %a1
|
||||
store volatile i16 1, i16 addrspace(5)* %a2
|
||||
%a = getelementptr inbounds [4 x i16], [4 x i16] addrspace(5)* %v0, i32 0, i32 %index
|
||||
%val = load i16, i16 addrspace(5)* %a
|
||||
%val = load volatile i16, i16 addrspace(5)* %a
|
||||
%eval = sext i16 %val to i32
|
||||
store i32 %eval, i32 addrspace(5)* %out
|
||||
ret void
|
||||
|
|
|
@ -70,31 +70,20 @@ define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}extract_vector_elt_v4f16:
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_store_short
|
||||
; GCN: buffer_store_short
|
||||
define amdgpu_kernel void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 {
|
||||
%p0 = extractelement <4 x half> %foo, i32 0
|
||||
%p1 = extractelement <4 x half> %foo, i32 2
|
||||
%out1 = getelementptr half, half addrspace(1)* %out, i32 10
|
||||
store half %p1, half addrspace(1)* %out, align 2
|
||||
store half %p0, half addrspace(1)* %out1, align 2
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v3f16:
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; SICIVI: buffer_load_ushort
|
||||
; SICIVI: buffer_load_ushort
|
||||
; SICIVI: buffer_load_ushort
|
||||
|
||||
; GCN: buffer_store_short
|
||||
; GCN: buffer_store_short
|
||||
; GCN: buffer_store_short
|
||||
; GFX9-DAG: global_load_short_d16_hi v
|
||||
; GFX9-DAG: global_load_short_d16 v
|
||||
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_store_short
|
||||
; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
|
||||
; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v
|
||||
|
||||
; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
|
||||
|
||||
; GCN: {{buffer|global}}_store_short
|
||||
define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
|
||||
%p0 = extractelement <3 x half> %foo, i32 %idx
|
||||
%out1 = getelementptr half, half addrspace(1)* %out, i32 1
|
||||
|
@ -102,23 +91,45 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4f16:
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN-LABEL: {{^}}v_extractelement_v4f16_2:
|
||||
; SI: buffer_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
|
||||
; SI: buffer_store_short [[LOAD]]
|
||||
|
||||
; GCN: buffer_store_short
|
||||
; GCN: buffer_store_short
|
||||
; GCN: buffer_store_short
|
||||
; GCN: buffer_store_short
|
||||
; VI: flat_load_dword v
|
||||
; VI: flat_store_short
|
||||
|
||||
; GCN: buffer_load_ushort
|
||||
; GCN: buffer_store_short
|
||||
define amdgpu_kernel void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 {
|
||||
%p0 = extractelement <4 x half> %foo, i32 %idx
|
||||
%out1 = getelementptr half, half addrspace(1)* %out, i32 1
|
||||
store half %p0, half addrspace(1)* %out
|
||||
; GFX9: global_load_dword [[LOAD:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, off offset:4
|
||||
; GFX9: global_store_short_d16_hi v{{\[[0-9]+:[0-9]+\]}}, [[LOAD]]
|
||||
define amdgpu_kernel void @v_extractelement_v4f16_2(half addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
|
||||
%vec.extract = extractelement <4 x half> %vec, i32 2
|
||||
store half %vec.extract, half addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_insertelement_v4f16_dynamic_vgpr:
|
||||
; GCN-DAG: {{flat|global|buffer}}_load_dword [[IDX:v[0-9]+]],
|
||||
; GCN-DAG: {{flat|global|buffer}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
|
||||
|
||||
; GFX89: v_lshrrev_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, [[SCALED_IDX]], v{{\[}}[[LO]]:[[HI]]{{\]}}
|
||||
; GFX89: {{flat|global}}_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[SHIFT_LO]]
|
||||
|
||||
; SI: v_lshr_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}, [[SCALED_IDX]]
|
||||
; SI: buffer_store_short v[[SHIFT_LO]]
|
||||
define amdgpu_kernel void @v_insertelement_v4f16_dynamic_vgpr(half addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds half, half addrspace(1)* %out, i64 %tid.ext
|
||||
%idx.val = load volatile i32, i32 addrspace(1)* undef
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
|
||||
%vec.extract = extractelement <4 x half> %vec, i32 %idx.val
|
||||
store half %vec.extract, half addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI,SICIVI %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI,SICIVI,GFX89 %s
|
||||
; RUN: llc -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9,GFX89 %s
|
||||
|
||||
; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
|
||||
; GCN: s_load_dword [[VEC:s[0-9]+]]
|
||||
|
@ -96,20 +96,15 @@ define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x
|
|||
; SICIVI: buffer_load_ushort
|
||||
; SICIVI: buffer_load_ushort
|
||||
|
||||
; SICIVI: buffer_store_short
|
||||
; SICIVI: buffer_store_short
|
||||
; SICIVI: buffer_store_short
|
||||
; GFX9-DAG: global_load_short_d16_hi v
|
||||
; GFX9-DAG: global_load_short_d16 v
|
||||
|
||||
; SICIVI: buffer_load_ushort
|
||||
; SICIVI: buffer_store_short
|
||||
; GCN-DAG: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 4
|
||||
; GFX89: v_lshrrev_b64 v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, v
|
||||
|
||||
; GFX9: buffer_load_ushort
|
||||
; GFX9: global_load_short_d16_hi
|
||||
; GFX9: global_load_short_d16 v
|
||||
; GFX9: buffer_store_dword
|
||||
; GFX9: buffer_store_dword
|
||||
; GFX9: buffer_load_ushort
|
||||
; GFX9: buffer_store_short
|
||||
; SI: v_lshr_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
|
||||
|
||||
; GCN: {{buffer|global}}_store_short
|
||||
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
|
||||
%p0 = extractelement <3 x i16> %foo, i32 %idx
|
||||
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
|
||||
|
@ -117,29 +112,15 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %o
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i16:
|
||||
; SICIVI: buffer_load_ushort
|
||||
; SICIVI: buffer_load_ushort
|
||||
; SICIVI: buffer_load_ushort
|
||||
; SICIVI: buffer_load_ushort
|
||||
|
||||
; SICIVI: buffer_store_short
|
||||
; SICIVI: buffer_store_short
|
||||
; SICIVI: buffer_store_short
|
||||
; SICIVI: buffer_store_short
|
||||
|
||||
; SICIVI: buffer_load_ushort
|
||||
; SICIVI: buffer_store_short
|
||||
|
||||
; GFX9: s_load_dword
|
||||
; GFX9: buffer_store_dword
|
||||
; GFX9: buffer_store_dword
|
||||
; GFX9: buffer_load_ushort
|
||||
; GFX9: buffer_store_short
|
||||
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
|
||||
%p0 = extractelement <4 x i16> %foo, i32 %idx
|
||||
%out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
|
||||
store i16 %p0, i16 addrspace(1)* %out
|
||||
; GCN-LABEL: {{^}}v_insertelement_v4i16_dynamic_sgpr:
|
||||
define amdgpu_kernel void @v_insertelement_v4i16_dynamic_sgpr(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %idx) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i64 %tid.ext
|
||||
%vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
|
||||
%vec.extract = extractelement <4 x i16> %vec, i32 %idx
|
||||
store i16 %vec.extract, i16 addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
|
@ -201,33 +201,6 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}dynamic_insertelement_v4i16:
|
||||
; GCN: buffer_load_ushort v{{[0-9]+}}, off
|
||||
; GCN: buffer_load_ushort v{{[0-9]+}}, off
|
||||
; GCN: buffer_load_ushort v{{[0-9]+}}, off
|
||||
; GCN: buffer_load_ushort v{{[0-9]+}}, off
|
||||
|
||||
; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 8{{$}}
|
||||
; GCN-DAG: s_and_b32 [[MASK_IDX:s[0-9]+]], s{{[0-9]+}}, 3{{$}}
|
||||
; GCN-DAG: v_or_b32_e32 [[IDX:v[0-9]+]], [[MASK_IDX]], [[BASE_FI]]{{$}}
|
||||
|
||||
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:14
|
||||
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12
|
||||
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:10
|
||||
; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8
|
||||
; GCN: buffer_store_short v{{[0-9]+}}, [[IDX]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
|
||||
|
||||
; GCN-NO-TONGA: s_waitcnt expcnt
|
||||
|
||||
; GCN: buffer_load_dwordx2
|
||||
|
||||
; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
|
||||
define amdgpu_kernel void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
|
||||
%vecins = insertelement <4 x i16> %a, i16 5, i32 %b
|
||||
store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}dynamic_insertelement_v2i8:
|
||||
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
|
||||
; GCN: buffer_load_ubyte v{{[0-9]+}}, off
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
|
||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
|
||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
|
||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=gfx900 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
|
||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
|
||||
; RUN: llc -verify-machineinstrs -march=amdgcn -mtriple=amdgcn---amdgiz -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
|
||||
|
||||
; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
|
||||
; GCN: s_load_dword [[VEC:s[0-9]+]]
|
||||
|
@ -484,6 +484,187 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspa
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_insertelement_v4f16_0:
|
||||
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
||||
|
||||
; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
|
||||
; GFX9: v_bfi_b32 v[[INS_LO:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[LO]]
|
||||
|
||||
; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[LO]]
|
||||
; CIVI: v_or_b32_e32 v[[INS_LO:[0-9]+]], [[VAL]], [[AND]]
|
||||
|
||||
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_LO]]:[[HI]]{{\]}}
|
||||
define amdgpu_kernel void @v_insertelement_v4f16_0(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
|
||||
%val.trunc = trunc i32 %val to i16
|
||||
%val.cvt = bitcast i16 %val.trunc to half
|
||||
%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 0
|
||||
store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_insertelement_v4f16_1:
|
||||
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
||||
|
||||
; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
|
||||
; GFX9: v_lshl_or_b32 v[[INS_HALF:[0-9]+]], [[VAL]], 16, [[AND]]
|
||||
|
||||
; VI: s_lshl_b32 [[VAL]], [[VAL]], 16
|
||||
; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]]
|
||||
; VI: v_or_b32_sdwa v[[INS_HALF:[0-9]+]], v[[LO]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[LO]]
|
||||
; CI: v_or_b32_e32 v[[INS_HALF:[0-9]+]], [[VAL]], [[AND]]
|
||||
|
||||
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[INS_HALF]]:[[HI]]{{\]}}
|
||||
define amdgpu_kernel void @v_insertelement_v4f16_1(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
|
||||
%val.trunc = trunc i32 %val to i16
|
||||
%val.cvt = bitcast i16 %val.trunc to half
|
||||
%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 1
|
||||
store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_insertelement_v4f16_2:
|
||||
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
||||
|
||||
; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
|
||||
; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
|
||||
|
||||
; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
|
||||
; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
|
||||
|
||||
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
|
||||
define amdgpu_kernel void @v_insertelement_v4f16_2(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
|
||||
%val.trunc = trunc i32 %val to i16
|
||||
%val.cvt = bitcast i16 %val.trunc to half
|
||||
%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 2
|
||||
store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_insertelement_v4f16_3:
|
||||
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
||||
|
||||
; GFX9: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
|
||||
; GFX9: v_lshl_or_b32 v[[INS_HI:[0-9]+]], [[VAL]], 16, [[AND]]
|
||||
|
||||
; VI: s_lshl_b32 [[VAL]], [[VAL]], 16
|
||||
; VI-DAG: v_mov_b32_e32 [[COPY_VAL:v[0-9]+]], [[VAL]]
|
||||
; VI: v_or_b32_sdwa v[[INS_HI:[0-9]+]], v[[HI]], [[COPY_VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
; CI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff, v[[HI]]
|
||||
; CI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
|
||||
|
||||
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
|
||||
define amdgpu_kernel void @v_insertelement_v4f16_3(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
|
||||
%val.trunc = trunc i32 %val to i16
|
||||
%val.cvt = bitcast i16 %val.trunc to half
|
||||
%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 3
|
||||
store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_insertelement_v4i16_2:
|
||||
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
||||
|
||||
; GFX9-DAG: v_mov_b32_e32 [[BFI_MASK:v[0-9]+]], 0xffff{{$}}
|
||||
; GFX9: v_bfi_b32 v[[INS_HI:[0-9]+]], [[BFI_MASK]], [[VAL]], v[[HI]]
|
||||
|
||||
; CIVI: v_and_b32_e32 [[AND:v[0-9]+]], 0xffff0000, v[[HI]]
|
||||
; CIVI: v_or_b32_e32 v[[INS_HI:[0-9]+]], [[VAL]], [[AND]]
|
||||
|
||||
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[INS_HI]]{{\]}}
|
||||
define amdgpu_kernel void @v_insertelement_v4i16_2(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
|
||||
%vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
|
||||
%val.trunc = trunc i32 %val to i16
|
||||
%val.cvt = bitcast i16 %val.trunc to i16
|
||||
%vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 2
|
||||
store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Better code on CI?
|
||||
; GCN-LABEL: {{^}}v_insertelement_v4i16_dynamic_vgpr:
|
||||
; GCN-DAG: {{flat|global}}_load_dword [[IDX:v[0-9]+]],
|
||||
; GCN-DAG: s_load_dword [[VAL:s[0-9]+]]
|
||||
; GCN-DAG: {{flat|global}}_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
|
||||
|
||||
; GCN-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 4, [[IDX]]
|
||||
; GCN-DAG: s_mov_b32 s[[MASK_HI:[0-9]+]], 0
|
||||
; GCN-DAG: s_mov_b32 s[[MASK_LO:[0-9]+]], 0xffff{{$}}
|
||||
|
||||
; GFX89: v_lshlrev_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, [[SCALED_IDX]], s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}
|
||||
; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_LO:[0-9]+]], v[[SHIFT_LO]]
|
||||
; GFX89-DAG: v_not_b32_e32 v[[NOT_SHIFT_HI:[0-9]+]], v[[SHIFT_HI]]
|
||||
; GFX89-DAG: v_and_b32_e32 v[[MASK:[0-9]+]], [[VAL]], v[[SHIFT_LO]]
|
||||
|
||||
; GFX89-DAG: v_and_b32_e32 v[[AND0:[0-9]+]], v[[NOT_SHIFT_LO]], v[[LO]]
|
||||
; GFX89-DAG: v_and_b32_e32 v[[AND1:[0-9]+]], v[[NOT_SHIFT_HI]], v[[HI]]
|
||||
; GFX89: v_or_b32_sdwa v[[OR_SDWA:[0-9]+]], v[[MASK]], v[[AND0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
|
||||
|
||||
|
||||
; CI: v_lshl_b64 v{{\[}}[[SHIFT_LO:[0-9]+]]:[[SHIFT_HI:[0-9]+]]{{\]}}, s{{\[}}[[MASK_LO]]:[[MASK_HI]]{{\]}}, [[SCALED_IDX]]
|
||||
; CI-DAG: v_bfi_b32 v[[OR_SDWA:[0-9]+]], v[[SHIFT_LO]],
|
||||
; CI-DAG: v_bfi_b32 v[[AND1:[0-9]+]], v[[SHIFT_HI]], 0,
|
||||
|
||||
; GCN: {{flat|global}}_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[OR_SDWA]]:[[AND1]]{{\]}}
|
||||
define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in, i32 %val) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
|
||||
%idx.val = load volatile i32, i32 addrspace(1)* undef
|
||||
%vec = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
|
||||
%val.trunc = trunc i32 %val to i16
|
||||
%val.cvt = bitcast i16 %val.trunc to i16
|
||||
%vecins = insertelement <4 x i16> %vec, i16 %val.cvt, i32 %idx.val
|
||||
store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}v_insertelement_v4f16_dynamic_sgpr:
|
||||
define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in, i32 %val, i32 %idxval) #0 {
|
||||
%tid = call i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
%in.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %in, i64 %tid.ext
|
||||
%out.gep = getelementptr inbounds <4 x half>, <4 x half> addrspace(1)* %out, i64 %tid.ext
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %in.gep
|
||||
%val.trunc = trunc i32 %val to i16
|
||||
%val.cvt = bitcast i16 %val.trunc to half
|
||||
%vecins = insertelement <4 x half> %vec, half %val.cvt, i32 %idxval
|
||||
store <4 x half> %vecins, <4 x half> addrspace(1)* %out.gep
|
||||
ret void
|
||||
}
|
||||
|
||||
declare i32 @llvm.amdgcn.workitem.id.x() #1
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
|
|
@ -289,9 +289,9 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <
|
|||
; SI-NOT: v_min_u32_e32
|
||||
|
||||
; VI: v_min_u16_e32
|
||||
; VI: v_min_u16_sdwa
|
||||
; VI: v_min_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
|
||||
; VI: v_min_u16_e32
|
||||
; VI-NOT: v_min_u16_e32
|
||||
; VI-NOT: v_min_u16
|
||||
|
||||
; GFX9: v_pk_min_u16
|
||||
; GFX9: v_pk_min_u16
|
||||
|
|
Loading…
Reference in New Issue