From 65f257a2152ce855f4e25faed9b43ac21933c14a Mon Sep 17 00:00:00 2001 From: Jessica Paquette Date: Tue, 13 Apr 2021 10:11:25 -0700 Subject: [PATCH] [AArch64][GlobalISel] Implement custom legalization for s32 and s64 G_CTPOP This is a partial port of AArch64TargetLowering::LowerCTPOP. This custom lowering tries to use NEON instructions to give a more efficient CTPOP lowering when possible. In the non-NEON/noimplicitfloat case, this should use the generic lowering (see: https://godbolt.org/z/GcaPvWe4x). I think that's worth implementing after implementing the widening code for s16/s8 though. Differential Revision: https://reviews.llvm.org/D100399 --- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 56 ++++++++++++++++++- .../AArch64/GISel/AArch64LegalizerInfo.h | 2 + .../legalize-ctpop-no-implicit-float.mir | 16 ++++++ .../AArch64/GlobalISel/legalize-ctpop.mir | 40 +++++++++++++ 4 files changed, 112 insertions(+), 2 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop-no-implicit-float.mir diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index f359d9600649..86e461c3c295 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -22,9 +22,10 @@ #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Type.h" -#include #include "llvm/Support/MathExtras.h" +#include #define DEBUG_TYPE "aarch64-legalinfo" @@ -718,7 +719,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_SBFX, G_UBFX}) .customFor({{s32, s32}, {s64, s64}}); - getActionDefinitionsBuilder(G_CTPOP).legalFor({{v8s8, v8s8}, {v16s8, v16s8}}); + // TODO: s8, s16, s128 + // TODO: v2s64, v2s32, v4s32, v4s16, v8s16 + // TODO: Use generic lowering when custom lowering is not 
possible. + getActionDefinitionsBuilder(G_CTPOP) + .legalFor({{v8s8, v8s8}, {v16s8, v16s8}}) + .customFor({{s32, s32}, {s64, s64}}); computeTables(); verify(*ST.getInstrInfo()); @@ -751,6 +757,8 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper, return legalizeBitfieldExtract(MI, MRI, Helper); case TargetOpcode::G_ROTR: return legalizeRotate(MI, MRI, Helper); + case TargetOpcode::G_CTPOP: + return legalizeCTPOP(MI, MRI, Helper); } llvm_unreachable("expected switch to return"); @@ -995,3 +1003,47 @@ bool AArch64LegalizerInfo::legalizeBitfieldExtract( return getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) && getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI); } + +bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI, + MachineRegisterInfo &MRI, + LegalizerHelper &Helper) const { + // While there is no integer popcount instruction, it can + // be more efficiently lowered to the following sequence that uses + // AdvSIMD registers/instructions as long as the copies to/from + // the AdvSIMD registers are cheap. + // FMOV D0, X0 // copy 64-bit int to vector, high bits zero'd + // CNT V0.8B, V0.8B // 8xbyte pop-counts + // ADDV B0, V0.8B // sum 8xbyte pop-counts + // UMOV X0, V0.B[0] // copy byte result back to integer reg + if (!ST->hasNEON() || + MI.getMF()->getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) + return false; + MachineIRBuilder &MIRBuilder = Helper.MIRBuilder; + Register Dst = MI.getOperand(0).getReg(); + Register Val = MI.getOperand(1).getReg(); + LLT Ty = MRI.getType(Val); + + // TODO: Handle vector types. + assert(!Ty.isVector() && "Vector types not handled yet!"); + assert(Ty == MRI.getType(Dst) && + "Expected src and dst to have the same type!"); + // TODO: Handle s128. 
+ unsigned Size = Ty.getSizeInBits(); + assert((Size == 32 || Size == 64) && "Expected only 32 or 64 bit scalars!"); + if (Size == 32) + Val = MIRBuilder.buildZExt(LLT::scalar(64), Val).getReg(0); + const LLT V8S8 = LLT::vector(8, LLT::scalar(8)); + Val = MIRBuilder.buildBitcast(V8S8, Val).getReg(0); + auto CTPOP = MIRBuilder.buildCTPOP(V8S8, Val); + auto UADDLV = + MIRBuilder + .buildIntrinsic(Intrinsic::aarch64_neon_uaddlv, {LLT::scalar(32)}, + /*HasSideEffects = */ false) + .addUse(CTPOP.getReg(0)); + if (Size == 64) + MIRBuilder.buildZExt(Dst, UADDLV); + else + UADDLV->getOperand(0).setReg(Dst); + MI.eraseFromParent(); + return true; +} diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h index 5d78dc64a2f1..51ae105195c5 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h @@ -52,6 +52,8 @@ private: LegalizerHelper &Helper) const; bool legalizeRotate(MachineInstr &MI, MachineRegisterInfo &MRI, LegalizerHelper &Helper) const; + bool legalizeCTPOP(MachineInstr &MI, MachineRegisterInfo &MRI, + LegalizerHelper &Helper) const; const AArch64Subtarget *ST; }; } // End llvm namespace. diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop-no-implicit-float.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop-no-implicit-float.mir new file mode 100644 index 000000000000..41e316e1453d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop-no-implicit-float.mir @@ -0,0 +1,16 @@ +# RUN: not --crash llc -mtriple=aarch64-unknown-unknown -verify-machineinstrs -run-pass=legalizer %s -o - 2>&1 | FileCheck %s +# CHECK: LLVM ERROR: unable to legalize instruction: %ctpop:_(s32) = G_CTPOP %copy:_(s32) (in function: s32) +--- | + define void @s32() noimplicitfloat { unreachable } + define void @s64() noimplicitfloat { unreachable } +... 
+--- +name: s32 +tracksRegLiveness: true +body: | + bb.0: + liveins: $w0 + %copy:_(s32) = COPY $w0 + %ctpop:_(s32) = G_CTPOP %copy(s32) + $w0 = COPY %ctpop(s32) + RET_ReallyLR implicit $w0 diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir index 3a8196a581d0..9cd631819d4b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir @@ -37,3 +37,43 @@ body: | RET_ReallyLR implicit $q0 ... +--- +name: s32_lower +tracksRegLiveness: true +body: | + bb.0: + liveins: $w0 + ; CHECK-LABEL: name: s32_lower + ; CHECK: liveins: $w0 + ; CHECK: %copy:_(s32) = COPY $w0 + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %copy(s32) + ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[ZEXT]](s64) + ; CHECK: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) + ; CHECK: %ctpop:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>) + ; CHECK: $w0 = COPY %ctpop(s32) + ; CHECK: RET_ReallyLR implicit $w0 + %copy:_(s32) = COPY $w0 + %ctpop:_(s32) = G_CTPOP %copy(s32) + $w0 = COPY %ctpop(s32) + RET_ReallyLR implicit $w0 + +... +--- +name: s64_lower +tracksRegLiveness: true +body: | + bb.0: + liveins: $x0 + ; CHECK-LABEL: name: s64_lower + ; CHECK: liveins: $x0 + ; CHECK: %copy:_(s64) = COPY $x0 + ; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST %copy(s64) + ; CHECK: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) + ; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>) + ; CHECK: %ctpop:_(s64) = G_ZEXT [[INT]](s32) + ; CHECK: $x0 = COPY %ctpop(s64) + ; CHECK: RET_ReallyLR implicit $x0 + %copy:_(s64) = COPY $x0 + %ctpop:_(s64) = G_CTPOP %copy(s64) + $x0 = COPY %ctpop(s64) + RET_ReallyLR implicit $x0