AMDGPU: Look through casted selects to constant fold bin ops

The promotion of the uniform select to i32 interfered with this fold.
This commit is contained in:
Matt Arsenault 2020-01-20 19:27:21 -05:00 committed by Matt Arsenault
parent bcd91778fe
commit 2fe500ab5b
3 changed files with 88 additions and 14 deletions

View File

@ -530,14 +530,32 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
return true;
}
// Find a select instruction, looking through at most one cast. This is mostly
// to deal with cases where i16 selects were promoted here to i32.
//
// \param V    The value to inspect (typically a binary operator operand).
// \param Cast Out-parameter. Set to the cast that was looked through when the
//             select was found behind one; set to nullptr when V is itself the
//             select, and also when no select is found at all.
// \returns the select if V is a select or a cast whose operand is a select,
//          otherwise nullptr.
static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
  Cast = nullptr;

  // Direct case: the value is the select itself, no cast involved.
  if (auto *Sel = dyn_cast<SelectInst>(V))
    return Sel;

  // Casted case: only publish the cast through the out-parameter once we know
  // there really is a select underneath it, so a failed lookup never leaves
  // the caller holding an unrelated cast instruction.
  if (auto *CI = dyn_cast<CastInst>(V)) {
    if (auto *Sel = dyn_cast<SelectInst>(CI->getOperand(0))) {
      Cast = CI;
      return Sel;
    }
  }

  return nullptr;
}
bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
// Don't do this unless the old select is going away. We want to eliminate the
// binary operator, not replace a binop with a select.
int SelOpNo = 0;
SelectInst *Sel = dyn_cast<SelectInst>(BO.getOperand(0));
CastInst *CastOp;
SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
if (!Sel || !Sel->hasOneUse()) {
SelOpNo = 1;
Sel = dyn_cast<SelectInst>(BO.getOperand(1));
Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
}
if (!Sel || !Sel->hasOneUse())
@ -549,6 +567,11 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
if (!CBO || !CT || !CF)
return false;
if (CastOp) {
CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
}
// TODO: Handle special 0/-1 cases DAG combine does, although we only really
// need to handle divisions here.
Constant *FoldedT = SelOpNo ?
@ -573,6 +596,8 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
NewSelect->takeName(&BO);
BO.replaceAllUsesWith(NewSelect);
BO.eraseFromParent();
if (CastOp)
CastOp->eraseFromParent();
Sel->eraseFromParent();
return true;
}

View File

@ -410,13 +410,18 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) {
; IR-LABEL: @select_mul_rhs_const_i32(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
; IR-NEXT: ret i32 [[OP]]
;
%select = select i1 %cond, i32 5, i32 8
%op = mul i32 %select, 1000
ret i32 %op
}
; FIXME: Truncate from promoted select blocks this.
define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
; IR-LABEL: @select_add_lhs_const_i16(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 128, i16 131
; IR-NEXT: store i16 [[OP]], i16 addrspace(1)* undef
; IR-NEXT: ret void
; GCN-LABEL: select_add_lhs_const_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[4:5], 0x0
@ -428,16 +433,62 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; GCN-NEXT: flat_store_short v[0:1], v0
; GCN-NEXT: s_endpgm
; IR-LABEL: @select_add_lhs_const_i16(
; IR-NEXT: [[TMP1:%.*]] = select i1 [[COND:%.*]], i32 5, i32 8
; IR-NEXT: [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
; IR-NEXT: [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
; IR-NEXT: [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 123
; IR-NEXT: [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
; IR-NEXT: store i16 [[TMP5]], i16 addrspace(1)* undef
; IR-NEXT: ret void
;
%select = select i1 %cond, i16 5, i16 8
%op = add i16 %select, 123
store i16 %op, i16 addrspace(1)* undef
ret void
}
; An i32 select of the constants 5 and 8 is truncated to i16 and then added to
; 42. The IR checks expect the trunc and the add to be folded away into a
; single i16 select of 47 and 50.
define i16 @select_add_trunc_select(i1 %cond) {
; GCN-LABEL: select_add_trunc_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 47, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
; IR-LABEL: @select_add_trunc_select(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50
; IR-NEXT: ret i16 [[OP]]
;
%select = select i1 %cond, i32 5, i32 8
%trunc = trunc i32 %select to i16
%op = add i16 %trunc, 42
ret i16 %op
}
; An i16 select of the constants -13 and 8 is sign-extended to i32 and then
; added to 42. The IR checks expect the sext and the add to be folded away into
; a single i32 select of 29 and 50.
define i32 @select_add_sext_select(i1 %cond) {
; IR-LABEL: @select_add_sext_select(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 29, i32 50
; IR-NEXT: ret i32 [[OP]]
; GCN-LABEL: select_add_sext_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 29, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
  %select = select i1 %cond, i16 -13, i16 8
  %sext = sext i16 %select to i32
  %op = add i32 %sext, 42
  ret i32 %op
}
; An i16 select of the constants 5 and 8 is zero-extended to i32 and then added
; to 42. The IR checks expect the zext and the add to be folded away into a
; single i32 select of 47 and 50.
define i32 @select_add_zext_select(i1 %cond) {
; IR-LABEL: @select_add_zext_select(
; IR-NEXT: [[OP:%.*]] = select i1 [[COND:%.*]], i32 47, i32 50
; IR-NEXT: ret i32 [[OP]]
; GCN-LABEL: select_add_zext_select:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e64 v0, 50, 47, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
  %select = select i1 %cond, i16 5, i16 8
  %zext = zext i16 %select to i32
  %op = add i32 %zext, 42
  ret i32 %op
}

View File

@ -112,9 +112,7 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(i32 addrspac
}
; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16:
; TODO: shrink i16 constant. This is correct but suboptimal.
; GCN: v_mov_b32_e32 [[T:v[0-9]+]], 0xffff0009
; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[T]],
; GCN: v_cndmask_b32_e64 v2, 2, 9,
define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(i16 addrspace(1)* %p, i1 %cond) {
%sel = select i1 %cond, i16 -4, i16 3
%bo = sub i16 5, %sel