[AArch64][SVE] Add a pass for SVE intrinsic optimisations

Summary:
Creates the SVEIntrinsicOpts pass. In this patch, the pass tries
to remove unnecessary reinterpret intrinsics which convert to
and from svbool_t (llvm.aarch64.sve.convert.[to|from].svbool)

For example, the reinterprets below are redundant:

  %1 = call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
  %2 = call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)

The pass also looks for ptest intrinsics and phi instructions where
the operands are being needlessly converted to and from svbool_t.

Reviewers: sdesmalen, andwar, efriedma, cameron.mcinally, c-rhodes, rengolin

Reviewed By: efriedma

Subscribers: mgorny, tschuett, kristof.beyls, hiraditya, rkruppe, psnobl, danielkiss, cfe-commits, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D76078
This commit is contained in:
Kerry McLaughlin 2020-04-14 09:52:04 +01:00
parent 31c8e11896
commit 36c76de678
7 changed files with 564 additions and 0 deletions

View File

@ -52,6 +52,7 @@ FunctionPass *createAArch64BranchTargetsPass();
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
AArch64Subtarget &, AArch64RegisterBankInfo &);
@ -80,6 +81,7 @@ void initializeAArch64StorePairSuppressPass(PassRegistry&);
void initializeFalkorHWPFFixPass(PassRegistry&);
void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
void initializeLDTLSCleanupPass(PassRegistry&);
void initializeSVEIntrinsicOptsPass(PassRegistry&);
void initializeAArch64StackTaggingPass(PassRegistry&);
void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
} // end namespace llvm

View File

@ -146,6 +146,11 @@ static cl::opt<int> EnableGlobalISelAtO(
cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
cl::init(0));
static cl::opt<bool> EnableSVEIntrinsicOpts(
"aarch64-sve-intrinsic-opts", cl::Hidden,
cl::desc("Enable SVE intrinsic opts"),
cl::init(true));
static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
cl::init(true), cl::Hidden);
@ -182,6 +187,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
initializeFalkorHWPFFixPass(*PR);
initializeFalkorMarkStridedAccessesLegacyPass(*PR);
initializeLDTLSCleanupPass(*PR);
initializeSVEIntrinsicOptsPass(*PR);
initializeAArch64SpeculationHardeningPass(*PR);
initializeAArch64StackTaggingPass(*PR);
initializeAArch64StackTaggingPreRAPass(*PR);
@ -434,6 +440,10 @@ void AArch64PassConfig::addIRPasses() {
// ourselves.
addPass(createAtomicExpandPass());
// Expand any SVE vector library calls that we can't code generate directly.
if (EnableSVEIntrinsicOpts && TM->getOptLevel() == CodeGenOpt::Aggressive)
addPass(createSVEIntrinsicOptsPass());
// Cmpxchg instructions are often used with a subsequent comparison to
// determine whether it succeeded. We can exploit existing control-flow in
// ldrex/strex loops to simplify this, but it needs tidying up.

View File

@ -64,6 +64,7 @@ add_llvm_target(AArch64CodeGen
AArch64TargetMachine.cpp
AArch64TargetObjectFile.cpp
AArch64TargetTransformInfo.cpp
SVEIntrinsicOpts.cpp
AArch64SIMDInstrOpt.cpp
DEPENDS

View File

@ -0,0 +1,277 @@
//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// Performs general IR level optimizations on SVE intrinsics.
//
// The main goal of this pass is to remove unnecessary reinterpret
// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g:
//
// %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
// %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
//
// This pass also looks for ptest intrinsics & phi instructions where the
// operands are being needlessly converted to and from svbool_t.
//
//===----------------------------------------------------------------------===//
#include "Utils/AArch64BaseInfo.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
using namespace llvm::PatternMatch;
#define DEBUG_TYPE "sve-intrinsic-opts"
namespace llvm {
void initializeSVEIntrinsicOptsPass(PassRegistry &);
}
namespace {
struct SVEIntrinsicOpts : public ModulePass {
static char ID; // Pass identification, replacement for typeid
SVEIntrinsicOpts() : ModulePass(ID) {
initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
}
bool runOnModule(Module &M) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
private:
static IntrinsicInst *isReinterpretFromSVBool(Value *V);
static IntrinsicInst *isReinterpretToSVBool(Value *V);
static bool optimizeIntrinsic(Instruction *I);
bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
static bool optimizeConvertFromSVBool(IntrinsicInst *I);
static bool optimizePTest(IntrinsicInst *I);
static bool processPhiNode(IntrinsicInst *I);
};
} // end anonymous namespace
void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<DominatorTreeWrapperPass>();
AU.setPreservesCFG();
}
char SVEIntrinsicOpts::ID = 0;
static const char *name = "SVE intrinsics optimizations";
INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
namespace llvm {
ModulePass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); }
} // namespace llvm
/// Returns V if it's a cast from <n x 16 x i1> (aka svbool_t), nullptr
/// otherwise.
IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
if (!I)
return nullptr;
if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
return nullptr;
return I;
}
/// Returns V if it's a cast to <n x 16 x i1> (aka svbool_t), nullptr otherwise.
IntrinsicInst *SVEIntrinsicOpts::isReinterpretFromSVBool(Value *V) {
IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
if (!I)
return nullptr;
if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_from_svbool)
return nullptr;
return I;
}
/// The function will remove redundant reinterprets casting in the presence
/// of the control flow
bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
SmallVector<Instruction *, 32> Worklist;
auto RequiredType = X->getType();
auto *PN = dyn_cast<PHINode>(X->getArgOperand(0));
assert(PN && "Expected Phi Node!");
// Don't create a new Phi unless we can remove the old one.
if (!PN->hasOneUse())
return false;
for (Value *IncValPhi : PN->incoming_values()) {
auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
if (!Reinterpret ||
RequiredType != Reinterpret->getArgOperand(0)->getType())
return false;
}
// Create the new Phi
LLVMContext &Ctx = PN->getContext();
IRBuilder<> Builder(Ctx);
Builder.SetInsertPoint(PN);
PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
Worklist.push_back(PN);
for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
Worklist.push_back(Reinterpret);
}
// Cleanup Phi Node and reinterprets
X->replaceAllUsesWith(NPN);
X->eraseFromParent();
for (auto &I : Worklist)
if (I->use_empty())
I->eraseFromParent();
return true;
}
bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
if (Op1 && Op2 &&
Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
Type *Tys[] = {Op1->getArgOperand(0)->getType()};
Module *M = I->getParent()->getParent()->getParent();
auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
auto CI = CallInst::Create(Fn, Ops, I->getName(), I);
I->replaceAllUsesWith(CI);
I->eraseFromParent();
if (Op1->use_empty())
Op1->eraseFromParent();
if (Op2->use_empty())
Op2->eraseFromParent();
return true;
}
return false;
}
bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
assert(isReinterpretFromSVBool(I));
// If the reinterpret instruction operand is a PHI Node
if (isa<PHINode>(I->getArgOperand(0)))
return processPhiNode(I);
// If we have a reinterpret intrinsic I of type A which is converting from
// another reinterpret Y of type B, and the source type of Y is A, then we can
// elide away both reinterprets if there are no other users of Y.
auto *Y = isReinterpretToSVBool(I->getArgOperand(0));
if (!Y)
return false;
Value *SourceVal = Y->getArgOperand(0);
if (I->getType() != SourceVal->getType())
return false;
I->replaceAllUsesWith(SourceVal);
I->eraseFromParent();
if (Y->use_empty())
Y->eraseFromParent();
return true;
}
bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
if (!IntrI)
return false;
switch (IntrI->getIntrinsicID()) {
case Intrinsic::aarch64_sve_convert_from_svbool:
return optimizeConvertFromSVBool(IntrI);
case Intrinsic::aarch64_sve_ptest_any:
case Intrinsic::aarch64_sve_ptest_first:
case Intrinsic::aarch64_sve_ptest_last:
return optimizePTest(IntrI);
default:
return false;
}
return true;
}
bool SVEIntrinsicOpts::optimizeFunctions(
SmallSetVector<Function *, 4> &Functions) {
bool Changed = false;
for (auto *F : Functions) {
DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
// Traverse the DT with an rpo walk so we see defs before uses, allowing
// simplification to be done incrementally.
BasicBlock *Root = DT->getRoot();
ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
for (auto *BB : RPOT)
for (Instruction &I : make_early_inc_range(*BB))
Changed |= optimizeIntrinsic(&I);
}
return Changed;
}
bool SVEIntrinsicOpts::runOnModule(Module &M) {
bool Changed = false;
SmallSetVector<Function *, 4> Functions;
// Check for SVE intrinsic declarations first so that we only iterate over
// relevant functions. Where an appropriate declaration is found, store the
// function(s) where it is used so we can target these only.
for (auto &F : M.getFunctionList()) {
if (!F.isDeclaration())
continue;
switch (F.getIntrinsicID()) {
case Intrinsic::aarch64_sve_convert_from_svbool:
case Intrinsic::aarch64_sve_ptest_any:
case Intrinsic::aarch64_sve_ptest_first:
case Intrinsic::aarch64_sve_ptest_last:
for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
auto *Inst = dyn_cast<Instruction>(*I++);
Functions.insert(Inst->getFunction());
}
break;
default:
break;
}
}
if (!Functions.empty())
Changed |= optimizeFunctions(Functions);
return Changed;
}

View File

@ -18,6 +18,10 @@
; CHECK-NEXT: Pre-ISel Intrinsic Lowering
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Expand Atomic instructions
; CHECK-NEXT: SVE intrinsics optimizations
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: FunctionPass Manager
; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information

View File

@ -0,0 +1,67 @@
; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s
define i1 @ptest_any1(<vscale x 2 x i1> %a) {
; OPT-LABEL: ptest_any1
; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
; OPT-NOT: convert
; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv2i1(<vscale x 2 x i1> %mask, <vscale x 2 x i1> %a)
; OPT-NEXT: ret i1 %[[OUT]]
%mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 0)
%1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
%out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
ret i1 %out
}
; No transform because the ptest is using differently sized operands.
define i1 @ptest_any2(<vscale x 4 x i1> %a) {
; OPT-LABEL: ptest_any2
; OPT: %mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
; OPT-NEXT: %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
; OPT-NEXT: %out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
%mask = tail call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
%1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %mask)
%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
%out = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
ret i1 %out
}
define i1 @ptest_first(<vscale x 4 x i1> %a) {
; OPT-LABEL: ptest_first
; OPT: %mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
; OPT-NOT: convert
; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %mask, <vscale x 4 x i1> %a)
; OPT-NEXT: ret i1 %[[OUT]]
%mask = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 0)
%1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %mask)
%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
%out = call i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
ret i1 %out
}
define i1 @ptest_last(<vscale x 8 x i1> %a) {
; OPT-LABEL: ptest_last
; OPT: %mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
; OPT-NOT: convert
; OPT-NEXT: %[[OUT:.*]] = call i1 @llvm.aarch64.sve.ptest.last.nxv8i1(<vscale x 8 x i1> %mask, <vscale x 8 x i1> %a)
; OPT-NEXT: ret i1 %[[OUT]]
%mask = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 0)
%1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %mask)
%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%out = call i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1> %1, <vscale x 16 x i1> %2)
ret i1 %out
}
declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32)
declare <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32)
declare i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
declare i1 @llvm.aarch64.sve.ptest.first.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
declare i1 @llvm.aarch64.sve.ptest.last.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)

View File

@ -0,0 +1,203 @@
; RUN: opt -S -sve-intrinsic-opts -mtriple=aarch64-linux-gnu -mattr=+sve < %s | FileCheck --check-prefix OPT %s
define <vscale x 8 x i1> @reinterpret_test_h(<vscale x 8 x i1> %a) {
; OPT-LABEL: @reinterpret_test_h(
; OPT-NOT: convert
; OPT: ret <vscale x 8 x i1> %a
%1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %a)
%2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
ret <vscale x 8 x i1> %2
}
; Reinterprets are not redundant because the second reinterpret zeros the
; lanes that don't exist within its input.
define <vscale x 16 x i1> @reinterpret_test_h_rev(<vscale x 16 x i1> %a) {
; OPT-LABEL: @reinterpret_test_h_rev(
; OPT: %1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
; OPT-NEXT: ret <vscale x 16 x i1> %2
%1 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %a)
%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %1)
ret <vscale x 16 x i1> %2
}
define <vscale x 4 x i1> @reinterpret_test_w(<vscale x 4 x i1> %a) {
; OPT-LABEL: @reinterpret_test_w(
; OPT-NOT: convert
; OPT: ret <vscale x 4 x i1> %a
%1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
%2 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
ret <vscale x 4 x i1> %2
}
; Reinterprets are not redundant because the second reinterpret zeros the
; lanes that don't exist within its input.
define <vscale x 16 x i1> @reinterpret_test_w_rev(<vscale x 16 x i1> %a) {
; OPT-LABEL: @reinterpret_test_w_rev(
; OPT: %1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
; OPT-NEXT: ret <vscale x 16 x i1> %2
%1 = tail call <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %a)
%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %1)
ret <vscale x 16 x i1> %2
}
define <vscale x 2 x i1> @reinterpret_test_d(<vscale x 2 x i1> %a) {
; OPT-LABEL: @reinterpret_test_d(
; OPT-NOT: convert
; OPT: ret <vscale x 2 x i1> %a
%1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
%2 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %1)
ret <vscale x 2 x i1> %2
}
; Reinterprets are not redundant because the second reinterpret zeros the
; lanes that don't exist within its input.
define <vscale x 16 x i1> @reinterpret_test_d_rev(<vscale x 16 x i1> %a) {
; OPT-LABEL: @reinterpret_test_d_rev(
; OPT: %1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
; OPT-NEXT: %2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
; OPT-NEXT: ret <vscale x 16 x i1> %2
%1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %a)
%2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %1)
ret <vscale x 16 x i1> %2
}
define <vscale x 2 x i1> @reinterpret_reductions(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
; OPT-LABEL: reinterpret_reductions
; OPT-NOT: convert
; OPT-NOT: phi <vscale x 16 x i1>
; OPT: phi <vscale x 2 x i1> [ %a, %br_phi_a ], [ %b, %br_phi_b ], [ %c, %br_phi_c ]
; OPT-NOT: convert
; OPT: ret
entry:
switch i32 %cond, label %br_phi_c [
i32 43, label %br_phi_a
i32 45, label %br_phi_b
]
br_phi_a:
%a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
br label %join
br_phi_b:
%b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b)
br label %join
br_phi_c:
%c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
br label %join
join:
%pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
%pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
ret <vscale x 2 x i1> %pg1
}
; No transform as the reinterprets are converting from different types (nxv2i1 & nxv4i1)
; As the incoming values to the phi must all be the same type, we cannot remove the reinterprets.
define <vscale x 2 x i1> @reinterpret_reductions_1(i32 %cond, <vscale x 2 x i1> %a, <vscale x 4 x i1> %b, <vscale x 2 x i1> %c) {
; OPT-LABEL: reinterpret_reductions_1
; OPT: convert
; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
; OPT-NOT: phi <vscale x 2 x i1>
; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
; OPT: ret
entry:
switch i32 %cond, label %br_phi_c [
i32 43, label %br_phi_a
i32 45, label %br_phi_b
]
br_phi_a:
%a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
br label %join
br_phi_b:
%b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %b)
br label %join
br_phi_c:
%c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
br label %join
join:
%pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
%pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
ret <vscale x 2 x i1> %pg1
}
; No transform. Similar to the the test above, but here only two of the arguments need to
; be converted to svbool.
define <vscale x 2 x i1> @reinterpret_reductions_2(i32 %cond, <vscale x 2 x i1> %a, <vscale x 16 x i1> %b, <vscale x 2 x i1> %c) {
; OPT-LABEL: reinterpret_reductions_2
; OPT: convert
; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
; OPT-NOT: phi <vscale x 2 x i1>
; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
; OPT: ret
entry:
switch i32 %cond, label %br_phi_c [
i32 43, label %br_phi_a
i32 45, label %br_phi_b
]
br_phi_a:
%a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
br label %join
br_phi_b:
br label %join
br_phi_c:
%c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
br label %join
join:
%pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b, %br_phi_b ], [ %c1, %br_phi_c ]
%pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
ret <vscale x 2 x i1> %pg1
}
; Similar to reinterpret_reductions but the reinterprets remain because the
; original phi cannot be removed (i.e. prefer reinterprets over multiple phis).
define <vscale x 16 x i1> @reinterpret_reductions3(i32 %cond, <vscale x 2 x i1> %a, <vscale x 2 x i1> %b, <vscale x 2 x i1> %c) {
; OPT-LABEL: reinterpret_reductions3
; OPT: phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
; OPT-NOT: phi <vscale x 2 x i1>
; OPT: tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
; OPT-NEXT: ret <vscale x 16 x i1> %pg
entry:
switch i32 %cond, label %br_phi_c [
i32 43, label %br_phi_a
i32 45, label %br_phi_b
]
br_phi_a:
%a1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %a)
br label %join
br_phi_b:
%b1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %b)
br label %join
br_phi_c:
%c1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1> %c)
br label %join
join:
%pg = phi <vscale x 16 x i1> [ %a1, %br_phi_a ], [ %b1, %br_phi_b ], [ %c1, %br_phi_c ]
%pg1 = tail call <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1> %pg)
ret <vscale x 16 x i1> %pg
}
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1>)
declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv2i1(<vscale x 2 x i1>)
declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
declare <vscale x 4 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1>)
declare <vscale x 2 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv2i1(<vscale x 16 x i1>)