diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index fcaa6f907e5c..23c9352ce273 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -129,10 +129,7 @@ private: bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const; bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const; bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const; - bool selectMOVRELOffsetImpl(SDValue Index, SDValue &Base, - SDValue &Offset, bool IsInsert) const; - bool selectMOVRELSOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; - bool selectMOVRELDOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; + bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const; bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods, @@ -1193,10 +1190,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr, !isa(Offset); } -bool AMDGPUDAGToDAGISel::selectMOVRELOffsetImpl(SDValue Index, - SDValue &Base, - SDValue &Offset, - bool IsInsert) const { +bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index, + SDValue &Base, + SDValue &Offset) const { SDLoc DL(Index); if (CurDAG->isBaseWithConstantOffset(Index)) { @@ -1210,34 +1206,14 @@ bool AMDGPUDAGToDAGISel::selectMOVRELOffsetImpl(SDValue Index, return true; } - if (IsInsert) { - if (ConstantSDNode *CBase = dyn_cast(Index)) { - Base = CurDAG->getRegister(AMDGPU::NoRegister, MVT::i32); - Offset = CurDAG->getTargetConstant(CBase->getZExtValue(), DL, MVT::i32); - return true; - } - } else { - if (isa(Index)) - return false; - } + if (isa(Index)) + return false; Base = Index; Offset = CurDAG->getTargetConstant(0, DL, MVT::i32); return true; } -bool AMDGPUDAGToDAGISel::selectMOVRELSOffset(SDValue Index, - SDValue &Base, - SDValue &Offset) const { - return selectMOVRELOffsetImpl(Index, Base, Offset, false); -} - -bool AMDGPUDAGToDAGISel::selectMOVRELDOffset(SDValue Index, - SDValue &Base, - SDValue &Offset) const { - return selectMOVRELOffsetImpl(Index, Base, Offset, true); -} - SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL, SDValue Val, uint32_t Offset, uint32_t Width) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index b31a804987ca..f133eb3270ad 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -624,8 +624,7 @@ def SMRDBufferImm : ComplexPattern; def SMRDBufferImm32 : ComplexPattern; def SMRDBufferSgpr : ComplexPattern; -def MOVRELSOffset : ComplexPattern; -def MOVRELDOffset : ComplexPattern; +def MOVRELOffset : ComplexPattern; def VOP3Mods0 : ComplexPattern; def VOP3NoMods0 : ComplexPattern; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index cf1f4f86df32..05e0e3e504ca 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3308,13 +3308,13 @@ def : MTBUF_StoreResource ; multiclass SI_INDIRECT_Pattern { // Extract with offset def : Pat< - (eltvt (extractelt vt:$src, (MOVRELSOffset i32:$idx, (i32 imm:$offset)))), + (eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))), (!cast("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset) >; // Insert with offset def : Pat< - (insertelt vt:$src, eltvt:$val, (MOVRELDOffset i32:$idx, (i32 imm:$offset))), + (insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))), (!cast("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val) >; } diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 3e6905f887fb..367e7f734556 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -73,6 +73,15 @@ define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> % ret void } +; GCN-LABEL: {{^}}insertelement_to_sgpr: +; GCN-NOT: v_readfirstlane +define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind { + %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef + %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0 + %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0) + ret <4 x float> %tmp2 +} + ; GCN-LABEL: {{^}}dynamic_insertelement_v2f32: ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]] @@ -432,3 +441,5 @@ define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x d store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16 ret void } + +declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone diff --git a/llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll b/llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll index 7eb9eb253b3b..aef9f660436e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.SI.gather4.ll @@ -467,12 +467,9 @@ main_body: ; This crashed at some point due to a bug in FixSGPRCopies. Derived from the ; report in https://bugs.freedesktop.org/show_bug.cgi?id=96877 ; -;TODO: the readfirstlanes are unnecessary, see http://reviews.llvm.org/D22217 -; -;CHECK: v_readfirstlane_b32 s[[LO:[0-9]+]], v{{[0-9]+}} -;CHECK: v_readfirstlane_b32 -;CHECK: v_readfirstlane_b32 -;CHECK: v_readfirstlane_b32 s[[HI:[0-9]+]], v{{[0-9]+}} +;CHECK: s_load_dwordx4 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0 +;CHECK: s_waitcnt lgkmcnt(0) +;CHECK: s_mov_b32 s[[LO]], 0 ;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]] dmask:0x8 define amdgpu_ps float @gather4_sgpr_bug() { main_body: