forked from OSchip/llvm-project
AMDGPU: Unify MOVRELSOffset and MOVRELDOffset
Summary: Previously, constant index insertelements would be turned into SI_INDIRECT_DST, which is bound to prevent some optimization opportunities. Worse, it misled the heuristic that decides whether immediates should be lowered to S_MOV_B32 or V_MOV_B32 in a way that resulted in unnecessary v_readfirstlanes. Reviewers: arsenm, tstellarAMD Subscribers: arsenm, kzhuravl, llvm-commits Differential Revision: http://reviews.llvm.org/D22217 llvm-svn: 275160
This commit is contained in:
parent
7899d48dff
commit
7968c34586
|
@ -129,10 +129,7 @@ private:
|
|||
bool SelectSMRDBufferImm(SDValue Addr, SDValue &Offset) const;
|
||||
bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
|
||||
bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
|
||||
bool selectMOVRELOffsetImpl(SDValue Index, SDValue &Base,
|
||||
SDValue &Offset, bool IsInsert) const;
|
||||
bool selectMOVRELSOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
|
||||
bool selectMOVRELDOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
|
||||
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
|
||||
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
||||
bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
|
||||
bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
|
||||
|
@ -1193,10 +1190,9 @@ bool AMDGPUDAGToDAGISel::SelectSMRDBufferSgpr(SDValue Addr,
|
|||
!isa<ConstantSDNode>(Offset);
|
||||
}
|
||||
|
||||
bool AMDGPUDAGToDAGISel::selectMOVRELOffsetImpl(SDValue Index,
|
||||
SDValue &Base,
|
||||
SDValue &Offset,
|
||||
bool IsInsert) const {
|
||||
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
|
||||
SDValue &Base,
|
||||
SDValue &Offset) const {
|
||||
SDLoc DL(Index);
|
||||
|
||||
if (CurDAG->isBaseWithConstantOffset(Index)) {
|
||||
|
@ -1210,34 +1206,14 @@ bool AMDGPUDAGToDAGISel::selectMOVRELOffsetImpl(SDValue Index,
|
|||
return true;
|
||||
}
|
||||
|
||||
if (IsInsert) {
|
||||
if (ConstantSDNode *CBase = dyn_cast<ConstantSDNode>(Index)) {
|
||||
Base = CurDAG->getRegister(AMDGPU::NoRegister, MVT::i32);
|
||||
Offset = CurDAG->getTargetConstant(CBase->getZExtValue(), DL, MVT::i32);
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
if (isa<ConstantSDNode>(Index))
|
||||
return false;
|
||||
}
|
||||
if (isa<ConstantSDNode>(Index))
|
||||
return false;
|
||||
|
||||
Base = Index;
|
||||
Offset = CurDAG->getTargetConstant(0, DL, MVT::i32);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool AMDGPUDAGToDAGISel::selectMOVRELSOffset(SDValue Index,
|
||||
SDValue &Base,
|
||||
SDValue &Offset) const {
|
||||
return selectMOVRELOffsetImpl(Index, Base, Offset, false);
|
||||
}
|
||||
|
||||
bool AMDGPUDAGToDAGISel::selectMOVRELDOffset(SDValue Index,
|
||||
SDValue &Base,
|
||||
SDValue &Offset) const {
|
||||
return selectMOVRELOffsetImpl(Index, Base, Offset, true);
|
||||
}
|
||||
|
||||
SDNode *AMDGPUDAGToDAGISel::getS_BFE(unsigned Opcode, const SDLoc &DL,
|
||||
SDValue Val, uint32_t Offset,
|
||||
uint32_t Width) {
|
||||
|
|
|
@ -624,8 +624,7 @@ def SMRDBufferImm : ComplexPattern<i32, 1, "SelectSMRDBufferImm">;
|
|||
def SMRDBufferImm32 : ComplexPattern<i32, 1, "SelectSMRDBufferImm32">;
|
||||
def SMRDBufferSgpr : ComplexPattern<i32, 1, "SelectSMRDBufferSgpr">;
|
||||
|
||||
def MOVRELSOffset : ComplexPattern<i32, 2, "selectMOVRELSOffset">;
|
||||
def MOVRELDOffset : ComplexPattern<i32, 2, "selectMOVRELDOffset">;
|
||||
def MOVRELOffset : ComplexPattern<i32, 2, "SelectMOVRELOffset">;
|
||||
|
||||
def VOP3Mods0 : ComplexPattern<untyped, 4, "SelectVOP3Mods0">;
|
||||
def VOP3NoMods0 : ComplexPattern<untyped, 4, "SelectVOP3NoMods0">;
|
||||
|
|
|
@ -3308,13 +3308,13 @@ def : MTBUF_StoreResource <v4i32, 4, TBUFFER_STORE_FORMAT_XYZW>;
|
|||
multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, string VecSize> {
|
||||
// Extract with offset
|
||||
def : Pat<
|
||||
(eltvt (extractelt vt:$src, (MOVRELSOffset i32:$idx, (i32 imm:$offset)))),
|
||||
(eltvt (extractelt vt:$src, (MOVRELOffset i32:$idx, (i32 imm:$offset)))),
|
||||
(!cast<Instruction>("SI_INDIRECT_SRC_"#VecSize) $src, $idx, imm:$offset)
|
||||
>;
|
||||
|
||||
// Insert with offset
|
||||
def : Pat<
|
||||
(insertelt vt:$src, eltvt:$val, (MOVRELDOffset i32:$idx, (i32 imm:$offset))),
|
||||
(insertelt vt:$src, eltvt:$val, (MOVRELOffset i32:$idx, (i32 imm:$offset))),
|
||||
(!cast<Instruction>("SI_INDIRECT_DST_"#VecSize) $src, $idx, imm:$offset, $val)
|
||||
>;
|
||||
}
|
||||
|
|
|
@ -73,6 +73,15 @@ define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %
|
|||
ret void
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}insertelement_to_sgpr:
|
||||
; GCN-NOT: v_readfirstlane
|
||||
define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind {
|
||||
%tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef
|
||||
%tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
|
||||
%tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
|
||||
ret <4 x float> %tmp2
|
||||
}
|
||||
|
||||
; GCN-LABEL: {{^}}dynamic_insertelement_v2f32:
|
||||
; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
|
||||
; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
|
||||
|
@ -432,3 +441,5 @@ define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x d
|
|||
store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone
|
||||
|
|
|
@ -467,12 +467,9 @@ main_body:
|
|||
; This crashed at some point due to a bug in FixSGPRCopies. Derived from the
|
||||
; report in https://bugs.freedesktop.org/show_bug.cgi?id=96877
|
||||
;
|
||||
;TODO: the readfirstlanes are unnecessary, see http://reviews.llvm.org/D22217
|
||||
;
|
||||
;CHECK: v_readfirstlane_b32 s[[LO:[0-9]+]], v{{[0-9]+}}
|
||||
;CHECK: v_readfirstlane_b32
|
||||
;CHECK: v_readfirstlane_b32
|
||||
;CHECK: v_readfirstlane_b32 s[[HI:[0-9]+]], v{{[0-9]+}}
|
||||
;CHECK: s_load_dwordx4 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
|
||||
;CHECK: s_waitcnt lgkmcnt(0)
|
||||
;CHECK: s_mov_b32 s[[LO]], 0
|
||||
;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]] dmask:0x8
|
||||
define amdgpu_ps float @gather4_sgpr_bug() {
|
||||
main_body:
|
||||
|
|
Loading…
Reference in New Issue