AMDGPU: Add a fast path for icmp.i1(src, false, NE)

Summary:
This allows the condition to be moved from the intrinsic to the standard
ICmp opcode, so that LLVM can run its usual simplifications on it. The
icmp.i1(src, false, NE) form of the intrinsic is then an identity that
simply retrieves the SGPR lane mask for src.

This also lets us get the mask from "and i1", "or i1", and "xor i1".
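
For illustration, a minimal IR sketch of the resulting idiom (the function
name is made up; this assumes a wave64 target, where lane masks are i64, as
in the tests below):

    declare i64 @llvm.amdgcn.icmp.i1(i1, i1, i32)

    define i64 @lane_mask(i32 %a, i32 %b) {
      %c0 = icmp ugt i32 %a, 1
      %c1 = icmp ugt i32 %b, 2
      %src = and i1 %c0, %c1
      ; 33 is the numeric value of ICmpInst::ICMP_NE, so this asks for
      ; "%src != false", i.e. the SGPR lane mask holding %src.
      %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %src, i1 false, i32 33)
      ret i64 %mask
    }

Here the compares and the "and" already produce the mask in an SGPR pair,
so the intrinsic call can be selected to a plain copy of that mask.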

Reviewers: arsenm, nhaehnle

Subscribers: kzhuravl, jvesely, wdng, yaxunl, dstuttard, tpr, t-tye, llvm-commits

Differential Revision: https://reviews.llvm.org/D52060

llvm-svn: 351150
Marek Olsak 2019-01-15 02:13:18 +00:00
parent f793fe1402
commit 33eb4d947d
5 changed files with 226 additions and 0 deletions


@@ -5355,6 +5355,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
                        Denominator, Numerator);
   }
   case Intrinsic::amdgcn_icmp: {
+    // There is a Pat that handles this variant, so return it as-is.
+    if (Op.getOperand(1).getValueType() == MVT::i1 &&
+        Op.getConstantOperandVal(2) == 0 &&
+        Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
+      return Op;
     return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
   }
   case Intrinsic::amdgcn_fcmp: {


@@ -583,6 +583,11 @@ def : Pat <
+// TODO: we could add more variants for other types of conditionals
+def : Pat <
+  (int_amdgcn_icmp i1:$src, (i1 0), (i32 33)),
+  (COPY $src) // Return the SGPRs representing i1 src
+>;
+
 //===----------------------------------------------------------------------===//
 // VOP1 Patterns
 //===----------------------------------------------------------------------===//
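
For reference, a sketch of what this pattern matches: (i32 33) is the
numeric value of ICmpInst::ICMP_NE and (i1 0) is false, so the matched
IR-level call is

    %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %src, i1 false, i32 33)

and, together with the lowering fast path above that returns the intrinsic
node unchanged, selection reduces the call to a copy of the SGPR pair that
already holds %src.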


@@ -3760,6 +3760,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     // Promote to next legal integer type.
     unsigned Width = CmpType->getBitWidth();
     unsigned NewWidth = Width;
+
+    // Don't do anything for i1 comparisons.
+    if (Width == 1)
+      break;
+
     if (Width <= 16)
       NewWidth = 16;
     else if (Width <= 32)
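
The early break means InstCombine now leaves the i1 variant of the
intrinsic untouched instead of promoting it to a wider compare. A sketch of
the shape that survives (the tests below cover many source compare types):

    %cmp = fcmp oeq float %a, %b
    ; Width == 1 hits the break above, so no promotion to i16 occurs and
    ; the backend pattern can still match this call.
    %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)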


@@ -4,6 +4,7 @@
 declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0
 declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #0
 declare i64 @llvm.amdgcn.icmp.i16(i16, i16, i32) #0
+declare i64 @llvm.amdgcn.icmp.i1(i1, i1, i32) #0
 
 ; No crash on invalid input
 ; GCN-LABEL: {{^}}v_icmp_i32_dynamic_cc:
@@ -314,4 +315,21 @@ define amdgpu_kernel void @v_icmp_i16_sle(i64 addrspace(1)* %out, i16 %src) {
   ret void
 }
 
+; GCN-LABEL: {{^}}v_icmp_i1_ne0:
+; GCN: v_cmp_gt_u32_e64 s[[C0:\[[0-9]+:[0-9]+\]]],
+; GCN: v_cmp_gt_u32_e64 s[[C1:\[[0-9]+:[0-9]+\]]],
+; GCN: s_and_b64 s[[SRC:\[[0-9]+:[0-9]+\]]], s[[C0]], s[[C1]]
+; SI-NEXT: s_mov_b32 s{{[0-9]+}}, -1
+; GCN-NEXT: v_mov_b32_e32
+; GCN-NEXT: v_mov_b32_e32
+; GCN-NEXT: {{global|flat|buffer}}_store_dwordx2
+define amdgpu_kernel void @v_icmp_i1_ne0(i64 addrspace(1)* %out, i32 %a, i32 %b) {
+  %c0 = icmp ugt i32 %a, 1
+  %c1 = icmp ugt i32 %b, 2
+  %src = and i1 %c0, %c1
+  %result = call i64 @llvm.amdgcn.icmp.i1(i1 %src, i1 false, i32 33)
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
 attributes #0 = { nounwind readnone convergent }


@@ -1406,6 +1406,7 @@ define float @fmed3_0_1_undef_f32() {
 declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) nounwind readnone convergent
 declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) nounwind readnone convergent
+declare i64 @llvm.amdgcn.icmp.i1(i1, i1, i32) nounwind readnone convergent
 
 ; Make sure there's no crash for invalid input
 ; CHECK-LABEL: @invalid_nonconstant_icmp_code(
@@ -1815,6 +1816,198 @@ define i64 @fold_icmp_ne_0_zext_icmp_ult_i16(i16 %a, i16 %b) {
   ret void
 }
 
+; 1-bit NE comparisons
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i1(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_eq_i1(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ne_i1(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_ne_i1(i32 %a, i32 %b) {
+  %cmp = icmp ne i32 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_sle_i1(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_sle_i1(i32 %a, i32 %b) {
+  %cmp = icmp sle i32 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ugt_i64(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_ugt_i64(i64 %a, i64 %b) {
+  %cmp = icmp ugt i64 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_swap_i64(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_ult_swap_i64(i64 %a, i64 %b) {
+  %cmp = icmp ugt i64 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 false, i1 %cmp, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f32(
+; CHECK-NEXT: fcmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f32(float %a, float %b) {
+  %cmp = fcmp oeq float %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_une_f32(
+; CHECK-NEXT: fcmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_fcmp_une_f32(float %a, float %b) {
+  %cmp = fcmp une float %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_olt_f64(
+; CHECK-NEXT: fcmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_fcmp_olt_f64(double %a, double %b) {
+  %cmp = fcmp olt double %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i4(
+; CHECK-NEXT: icmp
+; CHECK: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_eq_i4(i4 %a, i4 %b) {
+  %cmp = icmp eq i4 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i8(
+; CHECK-NEXT: icmp
+; CHECK: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_eq_i8(i8 %a, i8 %b) {
+  %cmp = icmp eq i8 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i16(
+; CHECK-NEXT: icmp
+; CHECK: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_eq_i16(i16 %a, i16 %b) {
+  %cmp = icmp eq i16 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i36(
+; CHECK-NEXT: icmp
+; CHECK: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_eq_i36(i36 %a, i36 %b) {
+  %cmp = icmp eq i36 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_eq_i128(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_eq_i128(i128 %a, i128 %b) {
+  %cmp = icmp eq i128 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f16(
+; CHECK-NEXT: fcmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f16(half %a, half %b) {
+  %cmp = fcmp oeq half %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_fcmp_oeq_f128(
+; CHECK-NEXT: fcmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_fcmp_oeq_f128(fp128 %a, fp128 %b) {
+;
+  %cmp = fcmp oeq fp128 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i4(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_slt_i4(i4 %a, i4 %b) {
+  %cmp = icmp slt i4 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i8(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_slt_i8(i8 %a, i8 %b) {
+  %cmp = icmp slt i8 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_slt_i16(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_slt_i16(i16 %a, i16 %b) {
+  %cmp = icmp slt i16 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i4(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_ult_i4(i4 %a, i4 %b) {
+  %cmp = icmp ult i4 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i8(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_ult_i8(i8 %a, i8 %b) {
+  %cmp = icmp ult i8 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_i1_ne_0_icmp_ult_i16(
+; CHECK-NEXT: icmp
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+define i64 @fold_icmp_i1_ne_0_icmp_ult_i16(i16 %a, i16 %b) {
+  %cmp = icmp ult i16 %a, %b
+  %mask = call i64 @llvm.amdgcn.icmp.i1(i1 %cmp, i1 false, i32 33)
+  ret i64 %mask
+}
+
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.fcmp
 ; --------------------------------------------------------------------