forked from OSchip/llvm-project
[AArch64][GlobalISel] Implement custom legalization for s32 and s64 G_CTPOP
This is a partial port of AArch64TargetLowering::LowerCTPOP. This custom lowering tries to use NEON instructions to give a more efficient CTPOP lowering when possible. In the non-NEON/noimplicitfloat case, this should use the generic lowering (see: https://godbolt.org/z/GcaPvWe4x). I think that's worth implementing after implementing the widening code for s16/s8, though. Differential Revision: https://reviews.llvm.org/D100399
This commit is contained in:
parent
c440b97d89
commit
65f257a215
|
@ -22,9 +22,10 @@
|
||||||
#include "llvm/CodeGen/TargetOpcodes.h"
|
#include "llvm/CodeGen/TargetOpcodes.h"
|
||||||
#include "llvm/CodeGen/ValueTypes.h"
|
#include "llvm/CodeGen/ValueTypes.h"
|
||||||
#include "llvm/IR/DerivedTypes.h"
|
#include "llvm/IR/DerivedTypes.h"
|
||||||
|
#include "llvm/IR/IntrinsicsAArch64.h"
|
||||||
#include "llvm/IR/Type.h"
|
#include "llvm/IR/Type.h"
|
||||||
#include <initializer_list>
|
|
||||||
#include "llvm/Support/MathExtras.h"
|
#include "llvm/Support/MathExtras.h"
|
||||||
|
#include <initializer_list>
|
||||||
|
|
||||||
#define DEBUG_TYPE "aarch64-legalinfo"
|
#define DEBUG_TYPE "aarch64-legalinfo"
|
||||||
|
|
||||||
|
@ -718,7 +719,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
|
||||||
getActionDefinitionsBuilder({G_SBFX, G_UBFX})
|
getActionDefinitionsBuilder({G_SBFX, G_UBFX})
|
||||||
.customFor({{s32, s32}, {s64, s64}});
|
.customFor({{s32, s32}, {s64, s64}});
|
||||||
|
|
||||||
getActionDefinitionsBuilder(G_CTPOP).legalFor({{v8s8, v8s8}, {v16s8, v16s8}});
|
// TODO: s8, s16, s128
|
||||||
|
// TODO: v2s64, v2s32, v4s32, v4s16, v8s16
|
||||||
|
// TODO: Use generic lowering when custom lowering is not possible.
|
||||||
|
getActionDefinitionsBuilder(G_CTPOP)
|
||||||
|
.legalFor({{v8s8, v8s8}, {v16s8, v16s8}})
|
||||||
|
.customFor({{s32, s32}, {s64, s64}});
|
||||||
|
|
||||||
computeTables();
|
computeTables();
|
||||||
verify(*ST.getInstrInfo());
|
verify(*ST.getInstrInfo());
|
||||||
|
@ -751,6 +757,8 @@ bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
|
||||||
return legalizeBitfieldExtract(MI, MRI, Helper);
|
return legalizeBitfieldExtract(MI, MRI, Helper);
|
||||||
case TargetOpcode::G_ROTR:
|
case TargetOpcode::G_ROTR:
|
||||||
return legalizeRotate(MI, MRI, Helper);
|
return legalizeRotate(MI, MRI, Helper);
|
||||||
|
case TargetOpcode::G_CTPOP:
|
||||||
|
return legalizeCTPOP(MI, MRI, Helper);
|
||||||
}
|
}
|
||||||
|
|
||||||
llvm_unreachable("expected switch to return");
|
llvm_unreachable("expected switch to return");
|
||||||
|
@ -995,3 +1003,47 @@ bool AArch64LegalizerInfo::legalizeBitfieldExtract(
|
||||||
return getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
|
return getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI) &&
|
||||||
getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
|
getConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool AArch64LegalizerInfo::legalizeCTPOP(MachineInstr &MI,
                                         MachineRegisterInfo &MRI,
                                         LegalizerHelper &Helper) const {
  // AArch64 has no scalar popcount instruction, but the operation can be
  // lowered efficiently through the AdvSIMD register file as long as the
  // copies between integer and vector registers are cheap:
  //   FMOV D0, X0        // copy 64-bit int to vector, high bits zero'd
  //   CNT  V0.8B, V0.8B  // 8x byte pop-counts
  //   ADDV B0, V0.8B     // sum the 8 byte pop-counts
  //   UMOV X0, V0.B[0]   // copy byte result back to integer reg
  // Bail out when NEON is unavailable or implicit FP use is forbidden for
  // this function; the caller then falls back to other lowerings.
  const Function &F = MI.getMF()->getFunction();
  if (!ST->hasNEON() || F.hasFnAttribute(Attribute::NoImplicitFloat))
    return false;

  MachineIRBuilder &B = Helper.MIRBuilder;
  Register Dst = MI.getOperand(0).getReg();
  Register Src = MI.getOperand(1).getReg();
  LLT Ty = MRI.getType(Src);

  // TODO: Handle vector types.
  assert(!Ty.isVector() && "Vector types not handled yet!");
  assert(Ty == MRI.getType(Dst) &&
         "Expected src and dst to have the same type!");
  // TODO: Handle s128.
  unsigned Size = Ty.getSizeInBits();
  assert((Size == 32 || Size == 64) && "Expected only 32 or 64 bit scalars!");

  // Widen a 32-bit source so it fills a 64-bit D register.
  if (Size == 32)
    Src = B.buildZExt(LLT::scalar(64), Src).getReg(0);

  // Reinterpret the 64 bits as 8 bytes, take per-byte pop-counts, and sum
  // them with the NEON uaddlv intrinsic, which yields an s32 result.
  const LLT V8S8 = LLT::vector(8, LLT::scalar(8));
  Register Bytes = B.buildBitcast(V8S8, Src).getReg(0);
  auto PerByteCnt = B.buildCTPOP(V8S8, Bytes);
  auto Sum = B.buildIntrinsic(Intrinsic::aarch64_neon_uaddlv,
                              {LLT::scalar(32)},
                              /*HasSideEffects = */ false)
                 .addUse(PerByteCnt.getReg(0));

  // Extend the s32 sum for 64-bit destinations; for 32-bit destinations
  // the intrinsic result type already matches, so write Dst directly.
  if (Size == 64)
    B.buildZExt(Dst, Sum);
  else
    Sum->getOperand(0).setReg(Dst);
  MI.eraseFromParent();
  return true;
}
||||||
|
|
|
@ -52,6 +52,8 @@ private:
|
||||||
LegalizerHelper &Helper) const;
|
LegalizerHelper &Helper) const;
|
||||||
bool legalizeRotate(MachineInstr &MI, MachineRegisterInfo &MRI,
|
bool legalizeRotate(MachineInstr &MI, MachineRegisterInfo &MRI,
|
||||||
LegalizerHelper &Helper) const;
|
LegalizerHelper &Helper) const;
|
||||||
|
bool legalizeCTPOP(MachineInstr &MI, MachineRegisterInfo &MRI,
|
||||||
|
LegalizerHelper &Helper) const;
|
||||||
const AArch64Subtarget *ST;
|
const AArch64Subtarget *ST;
|
||||||
};
|
};
|
||||||
} // End llvm namespace.
|
} // End llvm namespace.
|
||||||
|
|
|
@ -0,0 +1,16 @@
|
||||||
|
# RUN: not --crash llc -mtriple=aarch64-unknown-unknown -verify-machineinstrs -run-pass=legalizer %s -o - 2>&1 | FileCheck %s
|
||||||
|
# CHECK: LLVM ERROR: unable to legalize instruction: %ctpop:_(s32) = G_CTPOP %copy:_(s32) (in function: s32)
|
||||||
|
--- |
|
||||||
|
define void @s32() noimplicitfloat { unreachable }
|
||||||
|
define void @s64() noimplicitfloat { unreachable }
|
||||||
|
...
|
||||||
|
---
|
||||||
|
name: s32
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $w0
|
||||||
|
%copy:_(s32) = COPY $w0
|
||||||
|
%ctpop:_(s32) = G_CTPOP %copy(s32)
|
||||||
|
$w0 = COPY %ctpop(s32)
|
||||||
|
RET_ReallyLR implicit $w0
|
|
@ -37,3 +37,43 @@ body: |
|
||||||
RET_ReallyLR implicit $q0
|
RET_ReallyLR implicit $q0
|
||||||
|
|
||||||
...
|
...
|
||||||
|
---
|
||||||
|
name: s32_lower
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $w0
|
||||||
|
; CHECK-LABEL: name: s32_lower
|
||||||
|
; CHECK: liveins: $w0
|
||||||
|
; CHECK: %copy:_(s32) = COPY $w0
|
||||||
|
; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %copy(s32)
|
||||||
|
; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[ZEXT]](s64)
|
||||||
|
; CHECK: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
|
||||||
|
; CHECK: %ctpop:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
|
||||||
|
; CHECK: $w0 = COPY %ctpop(s32)
|
||||||
|
; CHECK: RET_ReallyLR implicit $w0
|
||||||
|
%copy:_(s32) = COPY $w0
|
||||||
|
%ctpop:_(s32) = G_CTPOP %copy(s32)
|
||||||
|
$w0 = COPY %ctpop(s32)
|
||||||
|
RET_ReallyLR implicit $w0
|
||||||
|
|
||||||
|
...
|
||||||
|
---
|
||||||
|
name: s64_lower
|
||||||
|
tracksRegLiveness: true
|
||||||
|
body: |
|
||||||
|
bb.0:
|
||||||
|
liveins: $x0
|
||||||
|
; CHECK-LABEL: name: s64_lower
|
||||||
|
; CHECK: liveins: $x0
|
||||||
|
; CHECK: %copy:_(s64) = COPY $x0
|
||||||
|
; CHECK: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST %copy(s64)
|
||||||
|
; CHECK: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>)
|
||||||
|
; CHECK: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>)
|
||||||
|
; CHECK: %ctpop:_(s64) = G_ZEXT [[INT]](s32)
|
||||||
|
; CHECK: $x0 = COPY %ctpop(s64)
|
||||||
|
; CHECK: RET_ReallyLR implicit $x0
|
||||||
|
%copy:_(s64) = COPY $x0
|
||||||
|
%ctpop:_(s64) = G_CTPOP %copy(s64)
|
||||||
|
$x0 = COPY %ctpop(s64)
|
||||||
|
RET_ReallyLR implicit $x0
|
||||||
|
|
Loading…
Reference in New Issue