[AArch64][SVE] Add a pass for SVE intrinsic optimisations
Summary: Creates the SVEIntrinsicOpts pass. In this patch, the pass tries to
remove unnecessary reinterpret intrinsics which convert to and from svbool_t
(llvm.aarch64.sve.convert.[to|from].svbool). For example, the reinterprets
below are redundant:

  %1 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
  %2 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)

The pass also looks for ptest intrinsics and phi instructions where the
operands are being needlessly converted to and from svbool_t.

Reviewers: sdesmalen, andwar, efriedma, cameron.mcinally, c-rhodes, rengolin

Reviewed By: efriedma

Subscribers: mgorny, tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, cfe-commits, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D76078
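For illustration, here is a minimal hand-written sketch of what the pass does with such a round trip (the reinterpret_test_* tests added below check exactly this behaviour):

; Before: %a is converted to svbool_t and straight back again.
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
  ret <vscale x 4 x i1> %2

; After: uses of %2 are replaced with %a and both converts become dead.
  ret <vscale x 4 x i1> %a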
This commit: 36c76de678 (parent: 31c8e11896)
llvm/lib/Target/AArch64/AArch64.h
@@ -52,6 +52,7 @@ FunctionPass *createAArch64BranchTargetsPass();
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();

FunctionPass *createAArch64CollectLOHPass();
ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
                                 AArch64Subtarget &, AArch64RegisterBankInfo &);
@@ -80,6 +81,7 @@ void initializeAArch64StorePairSuppressPass(PassRegistry&);
void initializeFalkorHWPFFixPass(PassRegistry&);
void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
void initializeLDTLSCleanupPass(PassRegistry&);
void initializeSVEIntrinsicOptsPass(PassRegistry&);
void initializeAArch64StackTaggingPass(PassRegistry&);
void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
} // end namespace llvm
llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -146,6 +146,11 @@ static cl::opt<int> EnableGlobalISelAtO(
    cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
    cl::init(0));

static cl::opt<bool> EnableSVEIntrinsicOpts(
    "aarch64-sve-intrinsic-opts", cl::Hidden,
    cl::desc("Enable SVE intrinsic opts"),
    cl::init(true));

static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
                                         cl::init(true), cl::Hidden);
@@ -182,6 +187,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
  initializeFalkorHWPFFixPass(*PR);
  initializeFalkorMarkStridedAccessesLegacyPass(*PR);
  initializeLDTLSCleanupPass(*PR);
  initializeSVEIntrinsicOptsPass(*PR);
  initializeAArch64SpeculationHardeningPass(*PR);
  initializeAArch64StackTaggingPass(*PR);
  initializeAArch64StackTaggingPreRAPass(*PR);
@@ -434,6 +440,10 @@ void AArch64PassConfig::addIRPasses() {
  // ourselves.
  addPass(createAtomicExpandPass());

  // Expand any SVE vector library calls that we can't code generate directly.
  if (EnableSVEIntrinsicOpts && TM->getOptLevel() == CodeGenOpt::Aggressive)
    addPass(createSVEIntrinsicOptsPass());

  // Cmpxchg instructions are often used with a subsequent comparison to
  // determine whether it succeeded. We can exploit existing control-flow in
  // ldrex/strex loops to simplify this, but it needs tidying up.
llvm/lib/Target/AArch64/CMakeLists.txt
@@ -64,6 +64,7 @@ add_llvm_target(AArch64CodeGen
  AArch64TargetMachine.cpp
  AArch64TargetObjectFile.cpp
  AArch64TargetTransformInfo.cpp
  SVEIntrinsicOpts.cpp
  AArch64SIMDInstrOpt.cpp

  DEPENDS
llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp (new file)
@@ -0,0 +1,277 @@
//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Performs general IR level optimizations on SVE intrinsics.
//
// The main goal of this pass is to remove unnecessary reinterpret
// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g.:
//
//   %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
//   %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
//
// This pass also looks for ptest intrinsics and phi instructions where the
// operands are being needlessly converted to and from svbool_t.
//
//===----------------------------------------------------------------------===//

#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "sve-intrinsic-opts"

namespace llvm {
void initializeSVEIntrinsicOptsPass(PassRegistry &);
}

namespace {
struct SVEIntrinsicOpts : public ModulePass {
  static char ID; // Pass identification, replacement for typeid
  SVEIntrinsicOpts() : ModulePass(ID) {
    initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
  }

  bool runOnModule(Module &M) override;
  void getAnalysisUsage(AnalysisUsage &AU) const override;

private:
  static IntrinsicInst *isReinterpretFromSVBool(Value *V);
  static IntrinsicInst *isReinterpretToSVBool(Value *V);

  static bool optimizeIntrinsic(Instruction *I);

  bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);

  static bool optimizeConvertFromSVBool(IntrinsicInst *I);
  static bool optimizePTest(IntrinsicInst *I);

  static bool processPhiNode(IntrinsicInst *I);
};
} // end anonymous namespace

void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
  AU.addRequired<DominatorTreeWrapperPass>();
  AU.setPreservesCFG();
}

char SVEIntrinsicOpts::ID = 0;
static const char *name = "SVE intrinsics optimizations";
INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)

namespace llvm {
ModulePass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); }
} // namespace llvm

/// Returns V if it's a cast to <n x 16 x i1> (aka svbool_t), nullptr
/// otherwise.
IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
  IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
  if (!I)
    return nullptr;

  if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
    return nullptr;

  return I;
}

/// Returns V if it's a cast from <n x 16 x i1> (aka svbool_t), nullptr
/// otherwise.
IntrinsicInst *SVEIntrinsicOpts::isReinterpretFromSVBool(Value *V) {
  IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
  if (!I)
    return nullptr;

  if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_from_svbool)
    return nullptr;

  return I;
}

/// Removes redundant reinterpret casts in the presence of control flow.
bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {

  SmallVector<Instruction *, 32> Worklist;
  auto RequiredType = X->getType();

  auto *PN = dyn_cast<PHINode>(X->getArgOperand(0));
  assert(PN && "Expected Phi Node!");

  // Don't create a new Phi unless we can remove the old one.
  if (!PN->hasOneUse())
    return false;

  for (Value *IncValPhi : PN->incoming_values()) {
    auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
    if (!Reinterpret ||
        RequiredType != Reinterpret->getArgOperand(0)->getType())
      return false;
  }

  // Create the new Phi.
  LLVMContext &Ctx = PN->getContext();
  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(PN);
  PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
  Worklist.push_back(PN);

  for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
    auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
    NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
    Worklist.push_back(Reinterpret);
  }

  // Clean up the Phi node and reinterprets.
  X->replaceAllUsesWith(NPN);
  X->eraseFromParent();

  for (auto &I : Worklist)
    if (I->use_empty())
      I->eraseFromParent();

  return true;
}
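To see processPhiNode's effect in IR (this mirrors the reinterpret_reductions test added below, abbreviated by hand):

; Before: every incoming value of %pg is a convert.to.svbool, and the phi's
; only use is a convert.from.svbool back to the original predicate type.
  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b)
  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ]
  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)

; After: a narrower phi replaces the converts and the old phi entirely.
  %pg1 = phi <vscale x 2 x i1> [ %a, %br_phi_a ], [ %b, %br_phi_b ]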
bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
  IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
  IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));

  if (Op1 && Op2 &&
      Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
      Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {

    Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
    Type *Tys[] = {Op1->getArgOperand(0)->getType()};
    Module *M = I->getParent()->getParent()->getParent();

    auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
    auto CI = CallInst::Create(Fn, Ops, I->getName(), I);

    I->replaceAllUsesWith(CI);
    I->eraseFromParent();
    if (Op1->use_empty())
      Op1->eraseFromParent();
    if (Op2->use_empty())
      Op2->eraseFromParent();

    return true;
  }

  return false;
}
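In IR terms (this mirrors what the ptest_any1 test below checks): when both ptest operands are convert.to.svbool calls from the same narrower predicate type, the ptest intrinsic is re-declared at that narrower type and applied to the original operands:

; Before
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
  %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)

; After
  %out = call i1 @llvm.aarch64.sve.ptest.any.nxv2i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %a)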
bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
  assert(isReinterpretFromSVBool(I));

  // If the reinterpret instruction operand is a PHI Node
  if (isa<PHINode>(I->getArgOperand(0)))
    return processPhiNode(I);

  // If we have a reinterpret intrinsic I of type A which is converting from
  // another reinterpret Y of type B, and the source type of Y is A, then we
  // can elide away both reinterprets if there are no other users of Y.
  auto *Y = isReinterpretToSVBool(I->getArgOperand(0));
  if (!Y)
    return false;

  Value *SourceVal = Y->getArgOperand(0);
  if (I->getType() != SourceVal->getType())
    return false;

  I->replaceAllUsesWith(SourceVal);
  I->eraseFromParent();
  if (Y->use_empty())
    Y->eraseFromParent();

  return true;
}
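A detail worth a hand-written sketch (this is not one of the patch's tests): uses of the from.svbool convert are always rewritten, but the to.svbool convert is only erased when nothing else uses it:

; %1 has a second user (the ptest), so only %2 is folded away.
  %1 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
  %2 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
  %t = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %1)
; Uses of %2 become uses of %a; %1 remains to feed the ptest.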
bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
  IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
  if (!IntrI)
    return false;

  switch (IntrI->getIntrinsicID()) {
  case Intrinsic::aarch64_sve_convert_from_svbool:
    return optimizeConvertFromSVBool(IntrI);
  case Intrinsic::aarch64_sve_ptest_any:
  case Intrinsic::aarch64_sve_ptest_first:
  case Intrinsic::aarch64_sve_ptest_last:
    return optimizePTest(IntrI);
  default:
    return false;
  }

  return true;
}

bool SVEIntrinsicOpts::optimizeFunctions(
    SmallSetVector<Function *, 4> &Functions) {
  bool Changed = false;
  for (auto *F : Functions) {
    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();

    // Traverse the DT with an rpo walk so we see defs before uses, allowing
    // simplification to be done incrementally.
    BasicBlock *Root = DT->getRoot();
    ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
    for (auto *BB : RPOT)
      for (Instruction &I : make_early_inc_range(*BB))
        Changed |= optimizeIntrinsic(&I);
  }
  return Changed;
}

bool SVEIntrinsicOpts::runOnModule(Module &M) {
  bool Changed = false;
  SmallSetVector<Function *, 4> Functions;

  // Check for SVE intrinsic declarations first so that we only iterate over
  // relevant functions. Where an appropriate declaration is found, store the
  // function(s) where it is used so we can target these only.
  for (auto &F : M.getFunctionList()) {
    if (!F.isDeclaration())
      continue;

    switch (F.getIntrinsicID()) {
    case Intrinsic::aarch64_sve_convert_from_svbool:
    case Intrinsic::aarch64_sve_ptest_any:
    case Intrinsic::aarch64_sve_ptest_first:
    case Intrinsic::aarch64_sve_ptest_last:
      for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
        auto *Inst = dyn_cast<Instruction>(*I++);
        Functions.insert(Inst->getFunction());
      }
      break;
    default:
      break;
    }
  }

  if (!Functions.empty())
    Changed |= optimizeFunctions(Functions);

  return Changed;
}
llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -18,6 +18,10 @@
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT:   Expand Atomic instructions
; CHECK-NEXT: SVE intrinsics optimizations
; CHECK-NEXT:   FunctionPass Manager
; CHECK-NEXT:     Dominator Tree Construction
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT:   Simplify the CFG
; CHECK-NEXT:   Dominator Tree Construction
; CHECK-NEXT:   Natural Loop Information
llvm/test/CodeGen/AArch64/sve-intrinsic-opts-ptest.ll (new file)
@@ -0,0 +1,67 @@
; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s

define i1 @ptest_any1(<vscale x 2 x i1> %a) {
; OPT-LABEL: ptest_any1
; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
; OPT-NOT: convert
; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv2i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %a)
; OPT-NEXT: ret i1 %[[OUT]]
  %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
  %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
  ret i1 %out
}

; No transform because the ptest is using differently sized operands.
define i1 @ptest_any2(<vscale x 4 x i1> %a) {
; OPT-LABEL: ptest_any2
; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
; OPT-NEXT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
; OPT-NEXT: %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
  %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
  %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
  ret i1 %out
}

define i1 @ptest_first(<vscale x 4 x i1> %a) {
; OPT-LABEL: ptest_first
; OPT: %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
; OPT-NOT: convert
; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %a)
; OPT-NEXT: ret i1 %[[OUT]]
  %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %mask)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
  %out = call i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
  ret i1 %out
}

define i1 @ptest_last(<vscale x 8 x i1> %a) {
; OPT-LABEL: ptest_last
; OPT: %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
; OPT-NOT: convert
; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.last.nxv8i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %a)
; OPT-NEXT: ret i1 %[[OUT]]
  %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %mask)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
  %out = call i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
  ret i1 %out
}

declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)

declare i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
declare i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
declare i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)

declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
llvm/test/CodeGen/AArch64/sve-intrinsic-opts-reinterpret.ll (new file)
@@ -0,0 +1,203 @@
; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s

define <vscale x 8 x i1> @reinterpret_test_h(<vscale x 8 x i1> %a) {
; OPT-LABEL: @reinterpret_test_h(
; OPT-NOT: convert
; OPT: ret <vscale x 8 x i1> %a
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
  ret <vscale x 8 x i1> %2
}

; Reinterprets are not redundant because the second reinterpret zeros the
; lanes that don't exist within its input.
define <vscale x 16 x i1> @reinterpret_test_h_rev(<vscale x 16 x i1> %a) {
; OPT-LABEL: @reinterpret_test_h_rev(
; OPT: %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
; OPT-NEXT: ret <vscale x 16 x i1> %2
  %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
  ret <vscale x 16 x i1> %2
}

define <vscale x 4 x i1> @reinterpret_test_w(<vscale x 4 x i1> %a) {
; OPT-LABEL: @reinterpret_test_w(
; OPT-NOT: convert
; OPT: ret <vscale x 4 x i1> %a
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
  %2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
  ret <vscale x 4 x i1> %2
}

; Reinterprets are not redundant because the second reinterpret zeros the
; lanes that don't exist within its input.
define <vscale x 16 x i1> @reinterpret_test_w_rev(<vscale x 16 x i1> %a) {
; OPT-LABEL: @reinterpret_test_w_rev(
; OPT: %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
; OPT-NEXT: ret <vscale x 16 x i1> %2
  %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
  ret <vscale x 16 x i1> %2
}

define <vscale x 2 x i1> @reinterpret_test_d(<vscale x 2 x i1> %a) {
; OPT-LABEL: @reinterpret_test_d(
; OPT-NOT: convert
; OPT: ret <vscale x 2 x i1> %a
  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
  %2 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %1)
  ret <vscale x 2 x i1> %2
}

; Reinterprets are not redundant because the second reinterpret zeros the
; lanes that don't exist within its input.
define <vscale x 16 x i1> @reinterpret_test_d_rev(<vscale x 16 x i1> %a) {
; OPT-LABEL: @reinterpret_test_d_rev(
; OPT: %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
; OPT-NEXT: ret <vscale x 16 x i1> %2
  %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
  %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
  ret <vscale x 16 x i1> %2
}

define <vscale x 2 x i1> @reinterpret_reductions(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
; OPT-LABEL: reinterpret_reductions
; OPT-NOT: convert
; OPT-NOT: phi <vscale x 16 x i1>
; OPT: phi <vscale x 2 x i1> [ %a, %br_phi_a ], [ %b, %br_phi_b ], [ %c, %br_phi_c ]
; OPT-NOT: convert
; OPT: ret

entry:
  switch i32 %cond, label %br_phi_c [
    i32 43, label %br_phi_a
    i32 45, label %br_phi_b
  ]

br_phi_a:
  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
  br label %join

br_phi_b:
  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b)
  br label %join

br_phi_c:
  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
  br label %join

join:
  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
  ret <vscale x 2 x i1> %pg1
}

; No transform as the reinterprets are converting from different types (nxv2i1 & nxv4i1).
; As the incoming values to the phi must all be the same type, we cannot remove the reinterprets.
define <vscale x 2 x i1> @reinterpret_reductions_1(i32 %cond, <vscale x 2 x i1> %a, <vscale x 4 x i1> %b, <vscale x 2 x i1> %c) {
; OPT-LABEL: reinterpret_reductions_1
; OPT: convert
; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
; OPT-NOT: phi <vscale x 2 x i1>
; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
; OPT: ret

entry:
  switch i32 %cond, label %br_phi_c [
    i32 43, label %br_phi_a
    i32 45, label %br_phi_b
  ]

br_phi_a:
  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
  br label %join

br_phi_b:
  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %b)
  br label %join

br_phi_c:
  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
  br label %join

join:
  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
  ret <vscale x 2 x i1> %pg1
}

; No transform. Similar to the test above, but here only two of the arguments need to
; be converted to svbool.
define <vscale x 2 x i1> @reinterpret_reductions_2(i32 %cond, <vscale x 2 x i1> %a, <vscale x 16 x i1> %b, <vscale x 2 x i1> %c) {
; OPT-LABEL: reinterpret_reductions_2
; OPT: convert
; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
; OPT-NOT: phi <vscale x 2 x i1>
; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
; OPT: ret

entry:
  switch i32 %cond, label %br_phi_c [
    i32 43, label %br_phi_a
    i32 45, label %br_phi_b
  ]

br_phi_a:
  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
  br label %join

br_phi_b:
  br label %join

br_phi_c:
  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
  br label %join

join:
  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
  ret <vscale x 2 x i1> %pg1
}

; Similar to reinterpret_reductions but the reinterprets remain because the
; original phi cannot be removed (i.e. prefer reinterprets over multiple phis).
define <vscale x 16 x i1> @reinterpret_reductions3(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
; OPT-LABEL: reinterpret_reductions3
; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
; OPT-NOT: phi <vscale x 2 x i1>
; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
; OPT-NEXT: ret <vscale x 16 x i1> %pg

entry:
  switch i32 %cond, label %br_phi_c [
    i32 43, label %br_phi_a
    i32 45, label %br_phi_b
  ]

br_phi_a:
  %a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
  br label %join

br_phi_b:
  %b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b)
  br label %join

br_phi_c:
  %c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
  br label %join

join:
  %pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
  %pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
  ret <vscale x 16 x i1> %pg
}

declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)