From 22c017f0f902598505b57a9a7147278a7b4dad87 Mon Sep 17 00:00:00 2001
From: Bradley Smith
Date: Mon, 12 Apr 2021 13:06:25 +0100
Subject: [PATCH] [AArch64][NEON] Match (or (and -a b) (and (a+1) b)) => bit select

With this patch vbslq_f32(vnegq_s32(a), b, c) lowers to a BIT instruction.

Co-authored-by: Paul Walker
Differential Revision: https://reviews.llvm.org/D100304
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  38 +++
 llvm/test/CodeGen/AArch64/neon-bitselect.ll   | 238 ++++++++++++++++++
 2 files changed, 276 insertions(+)
 create mode 100644 llvm/test/CodeGen/AArch64/neon-bitselect.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2152256e4cb0..5b94a770556a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12582,6 +12582,44 @@ static SDValue tryCombineToBSL(SDNode *N,
   if (N1.getOpcode() != ISD::AND)
     return SDValue();
 
+  // InstCombine does (not (neg a)) => (add a -1).
+  // Try: (or (and (neg a) b) (and (add a -1) c)) => (bsl (neg a) b c)
+  // Loop over all combinations of AND operands.
+  for (int i = 1; i >= 0; --i) {
+    for (int j = 1; j >= 0; --j) {
+      SDValue O0 = N0->getOperand(i);
+      SDValue O1 = N1->getOperand(j);
+      SDValue Sub, Add, SubSibling, AddSibling;
+
+      // Find a SUB and an ADD operand, one from each AND.
+      if (O0.getOpcode() == ISD::SUB && O1.getOpcode() == ISD::ADD) {
+        Sub = O0;
+        Add = O1;
+        SubSibling = N0->getOperand(1 - i);
+        AddSibling = N1->getOperand(1 - j);
+      } else if (O0.getOpcode() == ISD::ADD && O1.getOpcode() == ISD::SUB) {
+        Add = O0;
+        Sub = O1;
+        AddSibling = N0->getOperand(1 - i);
+        SubSibling = N1->getOperand(1 - j);
+      } else
+        continue;
+
+      if (!ISD::isBuildVectorAllZeros(Sub.getOperand(0).getNode()))
+        continue;
+
+      // Constant ones is always righthand operand of the Add.
+      if (!ISD::isBuildVectorAllOnes(Add.getOperand(1).getNode()))
+        continue;
+
+      if (Sub.getOperand(1) != Add.getOperand(0))
+        continue;
+
+      return DAG.getNode(AArch64ISD::BSP, DL, VT, Sub, SubSibling, AddSibling);
+    }
+  }
+
+  // (or (and a b) (and (not a) c)) => (bsl a b c)
   // We only have to look for constant vectors here since the general, variable
   // case can be handled in TableGen.
   unsigned Bits = VT.getScalarSizeInBits();
diff --git a/llvm/test/CodeGen/AArch64/neon-bitselect.ll b/llvm/test/CodeGen/AArch64/neon-bitselect.ll
new file mode 100644
index 000000000000..9496b3193bf6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/neon-bitselect.ll
@@ -0,0 +1,238 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target triple = "aarch64"
+
+; Check that an expanded vbsl(vneg(pre_cond), left, right) lowers to a VBSL
+; during ISEL.
+;
+; Subtly different from a plain vector bit select: operand representing the
+; condition has been negated (-v, not to be confused with bitwise_not(v)).
+
+; Each vbsl_neg_cond_xxxx tests one of the 16 permutations of the operands.
+
+define <4 x i32> @vbsl_neg_cond_0000(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0000:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %left_bits_0 = and <4 x i32> %neg_cond, %left
+  %right_bits_0 = and <4 x i32> %min_cond, %right
+  %bsl0000 = or <4 x i32> %right_bits_0, %left_bits_0
+  ret <4 x i32> %bsl0000
+}
+
+define <4 x i32> @vbsl_neg_cond_0001(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0001:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %left_bits_1 = and <4 x i32> %left, %neg_cond
+  %right_bits_0 = and <4 x i32> %min_cond, %right
+  %bsl0001 = or <4 x i32> %right_bits_0, %left_bits_1
+  ret <4 x i32> %bsl0001
+}
+
+define <4 x i32> @vbsl_neg_cond_0010(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0010:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %left_bits_0 = and <4 x i32> %neg_cond, %left
+  %right_bits_1 = and <4 x i32> %right, %min_cond
+  %bsl0010 = or <4 x i32> %right_bits_1, %left_bits_0
+  ret <4 x i32> %bsl0010
+}
+
+define <4 x i32> @vbsl_neg_cond_0011(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0011:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %left_bits_1 = and <4 x i32> %left, %neg_cond
+  %right_bits_1 = and <4 x i32> %right, %min_cond
+  %bsl0011 = or <4 x i32> %right_bits_1, %left_bits_1
+  ret <4 x i32> %bsl0011
+}
+
+define <4 x i32> @vbsl_neg_cond_0100(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0100:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %left_bits_0 = and <4 x i32> %neg_cond, %left
+  %right_bits_0 = and <4 x i32> %min_cond, %right
+  %bsl0100 = or <4 x i32> %left_bits_0, %right_bits_0
+  ret <4 x i32> %bsl0100
+}
+
+define <4 x i32> @vbsl_neg_cond_0101(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0101:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %left_bits_0 = and <4 x i32> %neg_cond, %left
+  %right_bits_1 = and <4 x i32> %right, %min_cond
+  %bsl0101 = or <4 x i32> %left_bits_0, %right_bits_1
+  ret <4 x i32> %bsl0101
+}
+
+define <4 x i32> @vbsl_neg_cond_0110(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0110:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %left_bits_1 = and <4 x i32> %left, %neg_cond
+  %right_bits_0 = and <4 x i32> %min_cond, %right
+  %bsl0110 = or <4 x i32> %left_bits_1, %right_bits_0
+  ret <4 x i32> %bsl0110
+}
+
+define <4 x i32> @vbsl_neg_cond_0111(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_0111:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v1.16b, v2.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %left_bits_1 = and <4 x i32> %left, %neg_cond
+  %right_bits_1 = and <4 x i32> %right, %min_cond
+  %bsl0111 = or <4 x i32> %left_bits_1, %right_bits_1
+  ret <4 x i32> %bsl0111
+}
+
+define <4 x i32> @vbsl_neg_cond_1000(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1000:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %flip_cond_left_bits_0 = and <4 x i32> %min_cond, %left
+  %flip_cond_right_bits_0 = and <4 x i32> %neg_cond, %right
+  %bsl1000 = or <4 x i32> %flip_cond_right_bits_0, %flip_cond_left_bits_0
+  ret <4 x i32> %bsl1000
+}
+
+define <4 x i32> @vbsl_neg_cond_1001(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1001:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %flip_cond_left_bits_1 = and <4 x i32> %left, %min_cond
+  %flip_cond_right_bits_0 = and <4 x i32> %neg_cond, %right
+  %bsl1001 = or <4 x i32> %flip_cond_right_bits_0, %flip_cond_left_bits_1
+  ret <4 x i32> %bsl1001
+}
+
+define <4 x i32> @vbsl_neg_cond_1010(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1010:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %flip_cond_left_bits_0 = and <4 x i32> %min_cond, %left
+  %flip_cond_right_bits_1 = and <4 x i32> %right, %neg_cond
+  %bsl1010 = or <4 x i32> %flip_cond_right_bits_1, %flip_cond_left_bits_0
+  ret <4 x i32> %bsl1010
+}
+
+define <4 x i32> @vbsl_neg_cond_1011(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1011:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %flip_cond_left_bits_1 = and <4 x i32> %left, %min_cond
+  %flip_cond_right_bits_1 = and <4 x i32> %right, %neg_cond
+  %bsl1011 = or <4 x i32> %flip_cond_right_bits_1, %flip_cond_left_bits_1
+  ret <4 x i32> %bsl1011
+}
+
+define <4 x i32> @vbsl_neg_cond_1100(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1100:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %flip_cond_left_bits_0 = and <4 x i32> %min_cond, %left
+  %flip_cond_right_bits_0 = and <4 x i32> %neg_cond, %right
+  %bsl1100 = or <4 x i32> %flip_cond_left_bits_0, %flip_cond_right_bits_0
+  ret <4 x i32> %bsl1100
+}
+
+define <4 x i32> @vbsl_neg_cond_1101(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1101:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %flip_cond_left_bits_0 = and <4 x i32> %min_cond, %left
+  %flip_cond_right_bits_1 = and <4 x i32> %right, %neg_cond
+  %bsl1101 = or <4 x i32> %flip_cond_left_bits_0, %flip_cond_right_bits_1
+  ret <4 x i32> %bsl1101
+}
+
+define <4 x i32> @vbsl_neg_cond_1110(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1110:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %flip_cond_left_bits_1 = and <4 x i32> %left, %min_cond
+  %flip_cond_right_bits_0 = and <4 x i32> %neg_cond, %right
+  %bsl1110 = or <4 x i32> %flip_cond_left_bits_1, %flip_cond_right_bits_0
+  ret <4 x i32> %bsl1110
+}
+
+define <4 x i32> @vbsl_neg_cond_1111(<4 x i32> %pre_cond, <4 x i32> %left, <4 x i32> %right) #0 {
+; CHECK-LABEL: vbsl_neg_cond_1111:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    neg v0.4s, v0.4s
+; CHECK-NEXT:    bsl v0.16b, v2.16b, v1.16b
+; CHECK-NEXT:    ret
+  %neg_cond = sub <4 x i32> zeroinitializer, %pre_cond
+  %min_cond = add <4 x i32> %pre_cond, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %flip_cond_left_bits_1 = and <4 x i32> %left, %min_cond
+  %flip_cond_right_bits_1 = and <4 x i32> %right, %neg_cond
+  %bsl1111 = or <4 x i32> %flip_cond_left_bits_1, %flip_cond_right_bits_1
+  ret <4 x i32> %bsl1111
+}
+
+attributes #0 = { "target-features"="+neon" }