AMDGPU: Look through casted selects to constant fold bin ops

The promotion of the uniform select to i32 interfered with this fold.
2020-01-20 19:27:21 -05:00 · 2020-01-20 19:27:21 -05:00 · 2fe500ab5b
parent bcd91778fe
commit 2fe500ab5b
3 changed files with 88 additions and 14 deletions
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@ -530,14 +530,32 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
  return true;
 }

+// Find a select instruction, which may have been casted. This is mostly to deal
+// with cases where i16 selects were promoted here to i32.
+static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
+  Cast = nullptr;
+  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
+    return Sel;
+
+  if ((Cast = dyn_cast<CastInst>(V))) {
+    if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
+      return Sel;
+  }
+
+  return nullptr;
+}
+
 bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;
-  SelectInst *Sel = dyn_cast<SelectInst>(BO.getOperand(0));
+
+  CastInst *CastOp;
+
+  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    SelOpNo = 1;
-    Sel = dyn_cast<SelectInst>(BO.getOperand(1));
+    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
@ -549,6 +567,11 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
  if (!CBO || !CT || !CF)
    return false;

+  if (CastOp) {
+    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
+    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
+  }
+
  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  Constant *FoldedT = SelOpNo ?
@ -573,6 +596,8 @@ bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
  NewSelect->takeName(&BO);
  BO.replaceAllUsesWith(NewSelect);
  BO.eraseFromParent();
+  if (CastOp)
+    CastOp->eraseFromParent();
  Sel->eraseFromParent();
  return true;
 }
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@ -410,13 +410,18 @@ define i32 @select_mul_rhs_const_i32(i1 %cond) {
 ; IR-LABEL: @select_mul_rhs_const_i32(
 ; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 5000, i32 8000
 ; IR-NEXT:    ret i32 [[OP]]
+;
  %select = select i1 %cond, i32 5, i32 8
  %op = mul i32 %select, 1000
  ret i32 %op
 }

-; FIXME: Truncate from promoted select blocks this.
 define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
+; IR-LABEL: @select_add_lhs_const_i16(
+; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i16 128, i16 131
+; IR-NEXT:    store i16 [[OP]], i16 addrspace(1)* undef
+; IR-NEXT:    ret void
+
 ; GCN-LABEL: select_add_lhs_const_i16:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_load_dword s0, s[4:5], 0x0
@ -428,16 +433,62 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT:    flat_store_short v[0:1], v0
 ; GCN-NEXT:    s_endpgm
-; IR-LABEL: @select_add_lhs_const_i16(
-; IR-NEXT:    [[TMP1:%.*]] = select i1 [[COND:%.*]], i32 5, i32 8
-; IR-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i16
-; IR-NEXT:    [[TMP3:%.*]] = zext i16 [[TMP2]] to i32
-; IR-NEXT:    [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 123
-; IR-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
-; IR-NEXT:    store i16 [[TMP5]], i16 addrspace(1)* undef
-; IR-NEXT:    ret void
+;
  %select = select i1 %cond, i16 5, i16 8
  %op = add i16 %select, 123
  store i16 %op, i16 addrspace(1)* undef
  ret void
 }
+
+define i16 @select_add_trunc_select(i1 %cond) {
+; GCN-LABEL: select_add_trunc_select:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+; IR-LABEL: @select_add_trunc_select(
+; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i16 47, i16 50
+; IR-NEXT:    ret i16 [[OP]]
+;
+  %select = select i1 %cond, i32 5, i32 8
+  %trunc = trunc i32 %select to i16
+  %op = add i16 %trunc, 42
+  ret i16 %op
+}
+
+define i32 @select_add_sext_select(i1 %cond) {
+; IR-LABEL: @select_add_sext_select(
+; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 29, i32 50
+; IR-NEXT:    ret i32 [[OP]]
+; GCN-LABEL: select_add_sext_select:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 29, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %select = select i1 %cond, i16 -13, i16 8
+  %trunc = sext i16 %select to i32
+  %op = add i32 %trunc, 42
+  ret i32 %op
+}
+
+define i32 @select_add_zext_select(i1 %cond) {
+; IR-LABEL: @select_add_zext_select(
+; IR-NEXT:    [[OP:%.*]] = select i1 [[COND:%.*]], i32 47, i32 50
+; IR-NEXT:    ret i32 [[OP]]
+
+; GCN-LABEL: select_add_zext_select:
+; GCN:       ; %bb.0:
+; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT:    v_and_b32_e32 v0, 1, v0
+; GCN-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GCN-NEXT:    v_cndmask_b32_e64 v0, 50, 47, vcc
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+  %select = select i1 %cond, i16 5, i16 8
+  %trunc = zext i16 %select to i32
+  %op = add i32 %trunc, 42
+  ret i32 %op
+}
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
@ -112,9 +112,7 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants(i32 addrspac
 }

 ; GCN-LABEL: {{^}}sel_constants_sub_constant_sel_constants_i16:
-; TODO: shrink i16 constant. This is correct but suboptimal.
-; GCN: v_mov_b32_e32 [[T:v[0-9]+]], 0xffff0009
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[T]],
+; GCN: v_cndmask_b32_e64 v2, 2, 9,
 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_i16(i16 addrspace(1)* %p, i1 %cond) {
  %sel = select i1 %cond, i16 -4, i16 3
  %bo = sub i16 5, %sel