[Matrix] Add initial tiling for load/multiply/store chains.
This patch adds initial fusion for load/multiply/store chains of matrix operations.

The patch contains roughly two parts:

1. Code generation for a fused load/multiply/store chain (LowerMatrixMultiplyFused). First, we ensure that both loads of the multiply operands do not alias the store. If they do, we create new, non-aliasing copies of the operands. Note that this may introduce new basic blocks. Finally, we process the operation in TileSize x TileSize blocks: load tiles from the input operands, multiply them, and store the resulting tiles.

2. Identify fusion candidates & matrix instructions. As a first step, collect all instructions with shape info and all fusion candidates (currently @llvm.matrix.multiply calls). Next, try to fuse the candidates and collect the instructions eliminated by fusion. Finally, iterate over all matrix instructions, skip the ones eliminated by fusion, and lower the rest as usual.

Reviewers: anemet, Gerolf, hfinkel, andrew.w.kaylor, LuoYuanke

Reviewed By: anemet

Differential Revision: https://reviews.llvm.org/D75566
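As an illustration only (this sketch is not part of the patch, and all names in it are made up for this description), the fused lowering conceptually computes the following tiled, column-major product. The real code emits vector loads, fmuladd-based tile multiplies and vector stores per tile instead of scalar loops, preceded by the run-time alias checks described above:

// Standalone, simplified sketch (not LLVM code; all names are illustrative).
// A, B and C are column-major: A is R x M, B is M x Cols, C is R x Cols.
#include <algorithm>
#include <vector>

void tiledMultiply(const std::vector<double> &A, const std::vector<double> &B,
                   std::vector<double> &C, unsigned R, unsigned M,
                   unsigned Cols, unsigned TileSize) {
  for (unsigned J = 0; J < Cols; J += TileSize)
    for (unsigned I = 0; I < R; I += TileSize) {
      unsigned TileR = std::min(R - I, TileSize);
      unsigned TileC = std::min(Cols - J, TileSize);
      // Accumulator for one TileR x TileC block of the result; the lowering
      // materializes this as a zero matrix before the K loop.
      std::vector<double> Res(TileR * TileC, 0.0);
      for (unsigned K = 0; K < M; K += TileSize) {
        unsigned TileM = std::min(M - K, TileSize);
        // Load a TileR x TileM tile of A and a TileM x TileC tile of B and
        // multiply-accumulate them into Res.
        for (unsigned c = 0; c < TileC; ++c)
          for (unsigned r = 0; r < TileR; ++r)
            for (unsigned m = 0; m < TileM; ++m)
              Res[c * TileR + r] +=
                  A[(K + m) * R + (I + r)] * B[(J + c) * M + (K + m)];
      }
      // Store the finished tile back into C.
      for (unsigned c = 0; c < TileC; ++c)
        for (unsigned r = 0; r < TileR; ++r)
          C[(J + c) * R + (I + r)] = Res[c * TileR + r];
    }
}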
parent c2d03e4ef1
commit d1fed7081d

@@ -9,7 +9,11 @@
// Lower matrix intrinsics to vector operations.
//
// TODO:
//  * Implement multiply & add fusion
//  * Improve fusion:
//    * Support more cases, e.g. multiply-add, multiply-sub, operands/results
//      transposed.
//    * Improve cost-modeling, e.g. choose different number of rows/columns
//      for tiles, consider cost of copies on alias.
//
//===----------------------------------------------------------------------===//

@@ -17,6 +21,8 @@
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"

@@ -33,6 +39,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"

using namespace llvm;
using namespace PatternMatch;

@@ -44,6 +51,17 @@ static cl::opt<bool> EnableShapePropagation(
    cl::desc("Enable/disable shape propagation from matrix intrinsics to other "
             "instructions."));

static cl::opt<bool>
    FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,
               cl::desc("Enable/disable fusing matrix instructions."));
// TODO: Allow and use non-square tiles.
static cl::opt<unsigned> TileSize(
    "fuse-matrix-tile-size", cl::init(4), cl::Hidden,
    cl::desc(
        "Tile size for matrix instruction fusion using square-shaped tiles."));
static cl::opt<bool> ForceFusion(
    "force-fuse-matrix", cl::init(false), cl::Hidden,
    cl::desc("Force matrix instruction fusion even if not profitable."));
static cl::opt<bool> AllowContractEnabled(
    "matrix-allow-contract", cl::init(false), cl::Hidden,
    cl::desc("Allow the use of FMAs if available and profitable. This may "

@ -146,6 +164,9 @@ class LowerMatrixIntrinsics {
|
|||
Function &Func;
|
||||
const DataLayout &DL;
|
||||
const TargetTransformInfo &TTI;
|
||||
AliasAnalysis &AA;
|
||||
DominatorTree &DT;
|
||||
LoopInfo &LI;
|
||||
OptimizationRemarkEmitter &ORE;
|
||||
|
||||
/// Contains estimates of the number of operations (loads, stores, compute)
/// required to lower a matrix operation.
|
||||
|
@ -299,8 +320,10 @@ class LowerMatrixIntrinsics {
|
|||
|
||||
public:
|
||||
LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
|
||||
AliasAnalysis &AA, DominatorTree &DT, LoopInfo &LI,
|
||||
OptimizationRemarkEmitter &ORE)
|
||||
: Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), ORE(ORE) {}
|
||||
: Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), AA(AA), DT(DT),
|
||||
LI(LI), ORE(ORE) {}
|
||||
|
||||
unsigned getNumOps(Type *VT) {
|
||||
assert(isa<VectorType>(VT) && "Expected vector type");
|
||||
|
@ -586,24 +609,46 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
ReversePostOrderTraversal<Function *> RPOT(&Func);
|
||||
bool Changed = false;
|
||||
for (auto *BB : RPOT) {
|
||||
for (Instruction &Inst : make_early_inc_range(*BB)) {
|
||||
IRBuilder<> Builder(&Inst);
|
||||
SmallVector<CallInst *, 16> MaybeFusableInsts;
|
||||
SmallVector<Instruction *, 16> MatrixInsts;
|
||||
|
||||
if (CallInst *CInst = dyn_cast<CallInst>(&Inst))
|
||||
Changed |= VisitCallInst(CInst);
|
||||
|
||||
Value *Op1;
|
||||
Value *Op2;
|
||||
if (auto *BinOp = dyn_cast<BinaryOperator>(&Inst))
|
||||
Changed |= VisitBinaryOperator(BinOp);
|
||||
if (match(&Inst, m_Load(m_Value(Op1))))
|
||||
Changed |= VisitLoad(&Inst, Op1, Builder);
|
||||
else if (match(&Inst, m_Store(m_Value(Op1), m_Value(Op2))))
|
||||
Changed |= VisitStore(&Inst, Op1, Op2, Builder);
|
||||
// First, collect all instructions with shape information and candidates for
|
||||
// fusion (currently only matrix multiplies).
|
||||
ReversePostOrderTraversal<Function *> RPOT(&Func);
|
||||
for (auto *BB : RPOT)
|
||||
for (Instruction &I : *BB) {
|
||||
if (ShapeMap.find(&I) == ShapeMap.end())
|
||||
continue;
|
||||
if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))
|
||||
MaybeFusableInsts.push_back(cast<CallInst>(&I));
|
||||
MatrixInsts.push_back(&I);
|
||||
}
|
||||
|
||||
// Second, try to fuse candidates.
|
||||
SmallPtrSet<Instruction *, 16> FusedInsts;
|
||||
for (CallInst *CI : MaybeFusableInsts)
|
||||
LowerMatrixMultiplyFused(CI, FusedInsts);
|
||||
Changed = !FusedInsts.empty();
|
||||
|
||||
// Third, lower remaining instructions with shape information.
|
||||
for (Instruction *Inst : MatrixInsts) {
|
||||
if (FusedInsts.find(Inst) != FusedInsts.end())
|
||||
continue;
|
||||
|
||||
IRBuilder<> Builder(Inst);
|
||||
|
||||
if (CallInst *CInst = dyn_cast<CallInst>(Inst))
|
||||
Changed |= VisitCallInst(CInst);
|
||||
|
||||
Value *Op1;
|
||||
Value *Op2;
|
||||
if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
|
||||
Changed |= VisitBinaryOperator(BinOp);
|
||||
if (match(Inst, m_Load(m_Value(Op1))))
|
||||
Changed |= VisitLoad(Inst, Op1, Builder);
|
||||
else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
|
||||
Changed |= VisitStore(Inst, Op1, Op2, Builder);
|
||||
}
|
||||
|
||||
RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, Func);
|
||||
|
@ -699,7 +744,7 @@ public:
|
|||
Value *TilePtr =
|
||||
Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
|
||||
|
||||
return loadMatrix(TileTy, TilePtr, Builder.getInt32(ResultShape.NumRows),
|
||||
return loadMatrix(TileTy, TilePtr, Builder.getInt32(MatrixShape.NumRows),
|
||||
ResultShape, Builder);
|
||||
}
|
||||
|
||||
|
@ -743,7 +788,7 @@ public:
|
|||
Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
|
||||
|
||||
storeMatrix(TileTy, StoreVal, TilePtr,
|
||||
Builder.getInt32(StoreVal.getNumRows()), Builder);
|
||||
Builder.getInt32(MatrixShape.NumRows), Builder);
|
||||
}
|
||||
|
||||
/// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between
|
||||
|
@ -915,6 +960,212 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
/// Ensure that the memory in \p Load does not alias \p Store by potentially
/// copying it to a new location. Returns either the new, non-aliasing
/// location or the original one if no copy was needed.
|
||||
Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
|
||||
CallInst *MatMul) {
|
||||
MemoryLocation StoreLoc = MemoryLocation::get(Store);
|
||||
MemoryLocation LoadLoc = MemoryLocation::get(Load);
|
||||
|
||||
AliasResult LdAliased = AA.alias(LoadLoc, StoreLoc);
|
||||
|
||||
// If we can statically determine noalias we're good.
|
||||
if (!LdAliased)
|
||||
return Load->getPointerOperand();
|
||||
|
||||
// Create code to check if the memory locations of the Load and Store
|
||||
// overlap and if they do, copy Load's operand to a new buffer.
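// The two ranges [LoadBegin, LoadEnd) and [StoreBegin, StoreEnd) overlap
// iff LoadBegin < StoreEnd and StoreBegin < LoadEnd; those two conditions
// are emitted as the branches of the Check0 and Check1 blocks below.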
|
||||
|
||||
// First, create new blocks for the 2nd part of the check and the copy.
|
||||
BasicBlock *Check0 = MatMul->getParent();
|
||||
// FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a
|
||||
// DT. Manually collect dominator tree updates, to avoid unnecessary work,
|
||||
// as we adjust Check0 and Check1's branches.
|
||||
SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
|
||||
for (BasicBlock *Succ : successors(Check0))
|
||||
DTUpdates.push_back({DT.Delete, Check0, Succ});
|
||||
|
||||
BasicBlock *Check1 = SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI,
|
||||
nullptr, "alias_cont");
|
||||
BasicBlock *Copy =
|
||||
SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI, nullptr, "copy");
|
||||
BasicBlock *Fusion = SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI,
|
||||
nullptr, "no_alias");
|
||||
|
||||
// Check if the loaded memory location begins before the end of the store
|
||||
// location. If the condition holds, they might overlap, otherwise they are
|
||||
// guaranteed to not overlap.
|
||||
IRBuilder<> Builder(MatMul);
|
||||
Check0->getTerminator()->eraseFromParent();
|
||||
Builder.SetInsertPoint(Check0);
|
||||
Type *IntPtrTy = Builder.getIntPtrTy(Load->getModule()->getDataLayout());
|
||||
Value *StoreBegin = Builder.CreatePtrToInt(
|
||||
const_cast<Value *>(StoreLoc.Ptr), IntPtrTy, "store.begin");
|
||||
Value *StoreEnd = Builder.CreateAdd(
|
||||
StoreBegin, ConstantInt::get(IntPtrTy, StoreLoc.Size.getValue()),
|
||||
"store.end", true, true);
|
||||
Value *LoadBegin = Builder.CreatePtrToInt(const_cast<Value *>(LoadLoc.Ptr),
|
||||
IntPtrTy, "load.begin");
|
||||
Builder.CreateCondBr(Builder.CreateICmpULT(LoadBegin, StoreEnd), Check1,
|
||||
Fusion);
|
||||
|
||||
// Check if the store begins before the end of the load location. If the
|
||||
// condition holds, they alias, otherwise they are guaranteed to not
|
||||
// overlap.
|
||||
Check1->getTerminator()->eraseFromParent();
|
||||
Builder.SetInsertPoint(Check1, Check1->begin());
|
||||
Value *LoadEnd = Builder.CreateAdd(
|
||||
LoadBegin, ConstantInt::get(IntPtrTy, LoadLoc.Size.getValue()),
|
||||
"load.end", true, true);
|
||||
Builder.CreateCondBr(Builder.CreateICmpULT(StoreBegin, LoadEnd), Copy,
|
||||
Fusion);
|
||||
|
||||
// Copy load operand to new alloca.
|
||||
Builder.SetInsertPoint(Copy, Copy->begin());
|
||||
AllocaInst *NewLd =
|
||||
Builder.CreateAlloca(Load->getType(), Load->getPointerAddressSpace());
|
||||
Builder.CreateMemCpy(NewLd, MaybeAlign(NewLd->getAlignment()),
|
||||
Load->getPointerOperand(), Load->getAlign(),
|
||||
LoadLoc.Size.getValue());
|
||||
Builder.SetInsertPoint(Fusion, Fusion->begin());
|
||||
PHINode *PHI = Builder.CreatePHI(Load->getPointerOperandType(), 3);
|
||||
PHI->addIncoming(Load->getPointerOperand(), Check0);
|
||||
PHI->addIncoming(Load->getPointerOperand(), Check1);
|
||||
PHI->addIncoming(NewLd, Copy);
|
||||
|
||||
// Adjust DT.
|
||||
DTUpdates.push_back({DT.Insert, Check0, Check1});
|
||||
DTUpdates.push_back({DT.Insert, Check0, Fusion});
|
||||
DTUpdates.push_back({DT.Insert, Check1, Copy});
|
||||
DTUpdates.push_back({DT.Insert, Check1, Fusion});
|
||||
DT.applyUpdates(DTUpdates);
|
||||
return PHI;
|
||||
}
|
||||
|
||||
bool isFusionProfitable(CallInst *MatMul) {
|
||||
if (ForceFusion)
|
||||
return true;
|
||||
|
||||
ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
|
||||
ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
|
||||
|
||||
const unsigned R = LShape.NumRows;
|
||||
const unsigned C = RShape.NumColumns;
|
||||
const unsigned M = LShape.NumColumns;
|
||||
auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
|
||||
|
||||
const unsigned VF =
|
||||
std::max<unsigned>(TTI.getRegisterBitWidth(true) /
|
||||
EltType->getPrimitiveSizeInBits().getFixedSize(),
|
||||
1U);
|
||||
|
||||
// Cost model for tiling
|
||||
//
|
||||
// For tiling to be beneficial, we need reuse either along the R or
|
||||
// the C axis. We vectorize along the R axis so that means at least
|
||||
// 3 elements.
|
||||
// TODO: Also consider cost of copying if operands alias.
|
||||
if (R <= VF && C == 1)
|
||||
return false;
|
||||
// Then we need enough elements to exceed the number of vector
|
||||
// registers we have. Note that this is an oversimplification since
|
||||
// fusing also takes some extra loads which may exceed the number of
|
||||
// reloads necessary.
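// As a hypothetical illustration (numbers chosen for this comment only):
// with R = M = C = 16 double elements and 128-bit vector registers we get
// VF = 2, so Op0Regs = ((16 + 1) / 2) * 16 = 128 and Op1Regs = 128; their
// sum clearly exceeds the number of available vector registers, so tiling
// is considered profitable.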
|
||||
unsigned Op0Regs = (R + VF - 1) / VF * M;
|
||||
unsigned Op1Regs = (M + VF - 1) / VF * C;
|
||||
return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true);
|
||||
}
|
||||
|
||||
MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
|
||||
MatrixTy Res;
|
||||
Type *ColumnType = VectorType::get(EltType, R);
|
||||
for (unsigned I = 0; I < C; ++I)
|
||||
Res.addColumn(ConstantAggregateZero::get(ColumnType));
|
||||
return Res;
|
||||
}
|
||||
|
||||
void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
|
||||
StoreInst *Store,
|
||||
SmallPtrSetImpl<Instruction *> &FusedInsts) {
|
||||
if (!isFusionProfitable(MatMul))
|
||||
return;
|
||||
|
||||
ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
|
||||
ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
|
||||
|
||||
const unsigned R = LShape.NumRows;
|
||||
const unsigned C = RShape.NumColumns;
|
||||
const unsigned M = LShape.NumColumns;
|
||||
auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
|
||||
|
||||
Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
|
||||
Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
|
||||
Value *CPtr = Store->getPointerOperand();
|
||||
|
||||
bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
|
||||
MatMul->hasAllowContract());
|
||||
IRBuilder<> Builder(Store);
|
||||
for (unsigned J = 0; J < C; J += TileSize)
|
||||
for (unsigned I = 0; I < R; I += TileSize) {
|
||||
const unsigned TileR = std::min(R - I, unsigned(TileSize));
|
||||
const unsigned TileC = std::min(C - J, unsigned(TileSize));
|
||||
MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);
|
||||
|
||||
for (unsigned K = 0; K < M; K += TileSize) {
|
||||
const unsigned TileM = std::min(M - K, unsigned(TileSize));
|
||||
MatrixTy A =
|
||||
loadMatrix(APtr, LShape, I, K, {TileR, TileM}, EltType, Builder);
|
||||
MatrixTy B =
|
||||
loadMatrix(BPtr, RShape, K, J, {TileM, TileC}, EltType, Builder);
|
||||
emitMatrixMultiply(Res, A, B, AllowContract, Builder, true);
|
||||
}
|
||||
storeMatrix(Res, CPtr, {R, M}, I, J, EltType, Builder);
|
||||
}
|
||||
|
||||
// Mark eliminated instructions as fused and remove them.
|
||||
FusedInsts.insert(Store);
|
||||
FusedInsts.insert(MatMul);
|
||||
Store->eraseFromParent();
|
||||
MatMul->eraseFromParent();
|
||||
if (LoadOp0->hasNUses(0)) {
|
||||
FusedInsts.insert(LoadOp0);
|
||||
LoadOp0->eraseFromParent();
|
||||
}
|
||||
if (LoadOp1->hasNUses(0)) {
|
||||
FusedInsts.insert(LoadOp1);
|
||||
LoadOp1->eraseFromParent();
|
||||
}
|
||||
}
|
||||
|
||||
/// Try to lower matrix multiply chains by fusing operations.
|
||||
///
|
||||
/// Currently we only lower {ld, ld} -> matmul -> st chains.
|
||||
///
|
||||
/// No need to return a MatrixTy object for the result of the operation, since
|
||||
/// the single store user will be lowered as part of this. Instructions that
|
||||
/// are completely eliminated by fusion are added to \p FusedInsts.
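/// For example, a chain like the one in the tests added with this patch:
///   %a = load <6 x double>, <6 x double>* %A           ; LoadOp0
///   %b = load <6 x double>, <6 x double>* %B           ; LoadOp1
///   %c = call <9 x double> @llvm.matrix.multiply(...)  ; MatMul
///   store <9 x double> %c, <9 x double>* %C            ; single user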
|
||||
void LowerMatrixMultiplyFused(CallInst *MatMul,
|
||||
SmallPtrSetImpl<Instruction *> &FusedInsts) {
|
||||
if (!FuseMatrix || !MatMul->hasOneUse())
|
||||
return;
|
||||
|
||||
auto *LoadOp0 = dyn_cast<LoadInst>(MatMul->getOperand(0));
|
||||
auto *LoadOp1 = dyn_cast<LoadInst>(MatMul->getOperand(1));
|
||||
auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin());
|
||||
if (LoadOp0 && LoadOp1 && Store) {
|
||||
// The store address must dominate the MatMul instruction, otherwise
|
||||
// we create invalid IR.
|
||||
// FIXME: See if we can hoist the store address computation.
|
||||
auto *AddrI = dyn_cast<Instruction>(Store->getOperand(1));
|
||||
if (AddrI && (!DT.dominates(AddrI, MatMul)))
|
||||
return;
|
||||
|
||||
emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
/// Lowers llvm.matrix.multiply.
|
||||
void LowerMultiply(CallInst *MatMul) {
|
||||
IRBuilder<> Builder(MatMul);
|
||||
|
@ -1481,7 +1732,11 @@ PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
|
|||
FunctionAnalysisManager &AM) {
|
||||
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
|
||||
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
|
||||
LowerMatrixIntrinsics LMT(F, TTI, ORE);
|
||||
auto &AA = AM.getResult<AAManager>(F);
|
||||
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
|
||||
auto &LI = AM.getResult<LoopAnalysis>(F);
|
||||
|
||||
LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE);
|
||||
if (LMT.Visit()) {
|
||||
PreservedAnalyses PA;
|
||||
PA.preserveSet<CFGAnalyses>();
|
||||
|
@ -1504,7 +1759,10 @@ public:
|
|||
bool runOnFunction(Function &F) override {
|
||||
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
|
||||
auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
|
||||
LowerMatrixIntrinsics LMT(F, TTI, ORE);
|
||||
auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
|
||||
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
|
||||
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
|
||||
LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE);
|
||||
bool C = LMT.Visit();
|
||||
return C;
|
||||
}
|
||||
|
@ -1512,7 +1770,11 @@ public:
|
|||
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.addRequired<TargetTransformInfoWrapperPass>();
|
||||
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
|
||||
AU.setPreservesCFG();
|
||||
AU.addRequired<AAResultsWrapperPass>();
|
||||
AU.addRequired<DominatorTreeWrapperPass>();
|
||||
AU.addPreserved<DominatorTreeWrapperPass>();
|
||||
AU.addRequired<LoopInfoWrapperPass>();
|
||||
AU.addPreserved<LoopInfoWrapperPass>();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
@ -1522,6 +1784,9 @@ char LowerMatrixIntrinsicsLegacyPass::ID = 0;
|
|||
INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
|
||||
false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
|
||||
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
|
||||
INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
|
||||
false, false)
|
||||
|
||||
|
|
|
@@ -0,0 +1,303 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -lower-matrix-intrinsics -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -instcombine -verify-dom-info %s -S | FileCheck %s

; REQUIRES: aarch64-registered-target

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "aarch64-apple-ios"

define void @test(<6 x double> * %A, <6 x double> * %B, <9 x double>* %C, i1 %cond) {
; CHECK-LABEL: @test(
|
||||
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[COL_CAST196:%.*]] = bitcast <6 x double>* [[A:%.*]] to <3 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD197:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST196]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP198:%.*]] = getelementptr <6 x double>, <6 x double>* [[A]], i64 0, i64 3
|
||||
; CHECK-NEXT: [[COL_CAST199:%.*]] = bitcast double* [[COL_GEP198]] to <3 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD200:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST199]], align 8
|
||||
; CHECK-NEXT: [[COL_CAST201:%.*]] = bitcast <6 x double>* [[B:%.*]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD202:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST201]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP203:%.*]] = getelementptr <6 x double>, <6 x double>* [[B]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST204:%.*]] = bitcast double* [[COL_GEP203]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD205:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST204]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP206:%.*]] = getelementptr <6 x double>, <6 x double>* [[B]], i64 0, i64 4
|
||||
; CHECK-NEXT: [[COL_CAST207:%.*]] = bitcast double* [[COL_GEP206]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD208:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST207]], align 8
|
||||
; CHECK-NEXT: [[ST_B:%.*]] = ptrtoint <9 x double>* [[C:%.*]] to i64
|
||||
; CHECK-NEXT: [[ST_E:%.*]] = add nuw nsw i64 [[ST_B]], 72
|
||||
; CHECK-NEXT: [[LD_B:%.*]] = ptrtoint <6 x double>* [[A]] to i64
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[ST_E]], [[LD_B]]
|
||||
; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
|
||||
; CHECK: alias_cont:
|
||||
; CHECK-NEXT: [[LD_E:%.*]] = add nuw nsw i64 [[LD_B]], 48
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LD_E]], [[ST_B]]
|
||||
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
|
||||
; CHECK: copy:
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = alloca <6 x double>, align 64
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <6 x double>* [[TMP2]] to i8*
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <6 x double>* [[A]] to i8*
|
||||
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 64 dereferenceable(48) [[TMP3]], i8* nonnull align 16 dereferenceable(48) [[TMP4]], i64 48, i1 false)
|
||||
; CHECK-NEXT: br label [[NO_ALIAS]]
|
||||
; CHECK: no_alias:
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = phi <6 x double>* [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
|
||||
; CHECK-NEXT: [[ST_B1:%.*]] = ptrtoint <9 x double>* [[C]] to i64
|
||||
; CHECK-NEXT: [[ST_E2:%.*]] = add nuw nsw i64 [[ST_B1]], 72
|
||||
; CHECK-NEXT: [[LD_B6:%.*]] = ptrtoint <6 x double>* [[B]] to i64
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[ST_E2]], [[LD_B6]]
|
||||
; CHECK-NEXT: br i1 [[TMP6]], label [[ALIAS_CONT3:%.*]], label [[NO_ALIAS5:%.*]]
|
||||
; CHECK: alias_cont1:
|
||||
; CHECK-NEXT: [[LD_E7:%.*]] = add nuw nsw i64 [[LD_B6]], 48
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[LD_E7]], [[ST_B1]]
|
||||
; CHECK-NEXT: br i1 [[TMP7]], label [[COPY4:%.*]], label [[NO_ALIAS5]]
|
||||
; CHECK: copy2:
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = alloca <6 x double>, align 64
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = bitcast <6 x double>* [[TMP8]] to i8*
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = bitcast <6 x double>* [[B]] to i8*
|
||||
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 64 dereferenceable(48) [[TMP9]], i8* nonnull align 16 dereferenceable(48) [[TMP10]], i64 48, i1 false)
|
||||
; CHECK-NEXT: br label [[NO_ALIAS5]]
|
||||
; CHECK: no_alias3:
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = phi <6 x double>* [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT3]] ], [ [[TMP8]], [[COPY4]] ]
|
||||
; CHECK-NEXT: [[COL_CAST8:%.*]] = bitcast <6 x double>* [[TMP5]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST8]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP5]], i64 0, i64 3
|
||||
; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST9]], align 8
|
||||
; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast <6 x double>* [[TMP11]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST12]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP14:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST15:%.*]] = bitcast double* [[COL_GEP14]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD16:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST15]], align 8
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD13]], <2 x double> undef, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[COL_LOAD13]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD10]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP12]])
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> undef, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT22]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD10]], <2 x double> [[SPLAT_SPLAT25]], <2 x double> [[TMP14]])
|
||||
; CHECK-NEXT: [[COL_CAST27:%.*]] = bitcast <9 x double>* [[C]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP13]], <2 x double>* [[COL_CAST27]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP28:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 3
|
||||
; CHECK-NEXT: [[COL_CAST29:%.*]] = bitcast double* [[COL_GEP28]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP15]], <2 x double>* [[COL_CAST29]], align 8
|
||||
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP5]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST31:%.*]] = bitcast double* [[TMP16]] to <1 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD32:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST31]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP33:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP5]], i64 0, i64 5
|
||||
; CHECK-NEXT: [[COL_CAST34:%.*]] = bitcast double* [[COL_GEP33]] to <1 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD35:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST34]], align 8
|
||||
; CHECK-NEXT: [[COL_CAST37:%.*]] = bitcast <6 x double>* [[TMP11]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD38:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST37]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP39:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST40:%.*]] = bitcast double* [[COL_GEP39]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD41:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST40]], align 8
|
||||
; CHECK-NEXT: [[SPLAT_SPLATINSERT43:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> undef, <1 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP17:%.*]] = fmul <1 x double> [[COL_LOAD32]], [[SPLAT_SPLATINSERT43]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLATINSERT46:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> undef, <1 x i32> <i32 1>
|
||||
; CHECK-NEXT: [[TMP18:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[SPLAT_SPLATINSERT46]], <1 x double> [[TMP17]])
|
||||
; CHECK-NEXT: [[SPLAT_SPLATINSERT49:%.*]] = shufflevector <2 x double> [[COL_LOAD41]], <2 x double> undef, <1 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x double> [[COL_LOAD32]], [[SPLAT_SPLATINSERT49]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLATINSERT52:%.*]] = shufflevector <2 x double> [[COL_LOAD41]], <2 x double> undef, <1 x i32> <i32 1>
|
||||
; CHECK-NEXT: [[TMP20:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[SPLAT_SPLATINSERT52]], <1 x double> [[TMP19]])
|
||||
; CHECK-NEXT: [[TMP21:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST55:%.*]] = bitcast double* [[TMP21]] to <1 x double>*
|
||||
; CHECK-NEXT: store <1 x double> [[TMP18]], <1 x double>* [[COL_CAST55]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP56:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 5
|
||||
; CHECK-NEXT: [[COL_CAST57:%.*]] = bitcast double* [[COL_GEP56]] to <1 x double>*
|
||||
; CHECK-NEXT: store <1 x double> [[TMP20]], <1 x double>* [[COL_CAST57]], align 8
|
||||
; CHECK-NEXT: [[COL_CAST59:%.*]] = bitcast <6 x double>* [[TMP5]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST59]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP61:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP5]], i64 0, i64 3
|
||||
; CHECK-NEXT: [[COL_CAST62:%.*]] = bitcast double* [[COL_GEP61]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST62]], align 8
|
||||
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 4
|
||||
; CHECK-NEXT: [[COL_CAST65:%.*]] = bitcast double* [[TMP22]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD66:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST65]], align 8
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD66]], <2 x double> undef, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[COL_LOAD60]], [[SPLAT_SPLAT69]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD66]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
|
||||
; CHECK-NEXT: [[TMP24:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD63]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP23]])
|
||||
; CHECK-NEXT: [[TMP25:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 6
|
||||
; CHECK-NEXT: [[COL_CAST74:%.*]] = bitcast double* [[TMP25]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP24]], <2 x double>* [[COL_CAST74]], align 8
|
||||
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP5]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST76:%.*]] = bitcast double* [[TMP26]] to <1 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD77:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST76]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP78:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP5]], i64 0, i64 5
|
||||
; CHECK-NEXT: [[COL_CAST79:%.*]] = bitcast double* [[COL_GEP78]] to <1 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD80:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST79]], align 8
|
||||
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP11]], i64 0, i64 4
|
||||
; CHECK-NEXT: [[COL_CAST82:%.*]] = bitcast double* [[TMP27]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST82]], align 8
|
||||
; CHECK-NEXT: [[SPLAT_SPLATINSERT85:%.*]] = shufflevector <2 x double> [[COL_LOAD83]], <2 x double> undef, <1 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP28:%.*]] = fmul <1 x double> [[COL_LOAD77]], [[SPLAT_SPLATINSERT85]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLATINSERT88:%.*]] = shufflevector <2 x double> [[COL_LOAD83]], <2 x double> undef, <1 x i32> <i32 1>
|
||||
; CHECK-NEXT: [[TMP29:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD80]], <1 x double> [[SPLAT_SPLATINSERT88]], <1 x double> [[TMP28]])
|
||||
; CHECK-NEXT: [[TMP30:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 8
|
||||
; CHECK-NEXT: [[COL_CAST91:%.*]] = bitcast double* [[TMP30]] to <1 x double>*
|
||||
; CHECK-NEXT: store <1 x double> [[TMP29]], <1 x double>* [[COL_CAST91]], align 8
|
||||
; CHECK-NEXT: br i1 [[COND:%.*]], label [[TRUE:%.*]], label [[FALSE:%.*]]
|
||||
; CHECK: true:
|
||||
; CHECK-NEXT: [[TMP31:%.*]] = fadd <3 x double> [[COL_LOAD197]], [[COL_LOAD197]]
|
||||
; CHECK-NEXT: [[TMP32:%.*]] = fadd <3 x double> [[COL_LOAD200]], [[COL_LOAD200]]
|
||||
; CHECK-NEXT: [[COL_CAST214:%.*]] = bitcast <6 x double>* [[A]] to <3 x double>*
|
||||
; CHECK-NEXT: store <3 x double> [[TMP31]], <3 x double>* [[COL_CAST214]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP215:%.*]] = getelementptr <6 x double>, <6 x double>* [[A]], i64 0, i64 3
|
||||
; CHECK-NEXT: [[COL_CAST216:%.*]] = bitcast double* [[COL_GEP215]] to <3 x double>*
|
||||
; CHECK-NEXT: store <3 x double> [[TMP32]], <3 x double>* [[COL_CAST216]], align 8
|
||||
; CHECK-NEXT: br label [[END:%.*]]
|
||||
; CHECK: false:
|
||||
; CHECK-NEXT: [[TMP33:%.*]] = fadd <2 x double> [[COL_LOAD202]], [[COL_LOAD202]]
|
||||
; CHECK-NEXT: [[TMP34:%.*]] = fadd <2 x double> [[COL_LOAD205]], [[COL_LOAD205]]
|
||||
; CHECK-NEXT: [[TMP35:%.*]] = fadd <2 x double> [[COL_LOAD208]], [[COL_LOAD208]]
|
||||
; CHECK-NEXT: [[COL_CAST209:%.*]] = bitcast <6 x double>* [[B]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP33]], <2 x double>* [[COL_CAST209]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP210:%.*]] = getelementptr <6 x double>, <6 x double>* [[B]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST211:%.*]] = bitcast double* [[COL_GEP210]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP34]], <2 x double>* [[COL_CAST211]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP212:%.*]] = getelementptr <6 x double>, <6 x double>* [[B]], i64 0, i64 4
|
||||
; CHECK-NEXT: [[COL_CAST213:%.*]] = bitcast double* [[COL_GEP212]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP35]], <2 x double>* [[COL_CAST213]], align 8
|
||||
; CHECK-NEXT: br label [[END]]
|
||||
; CHECK: end:
|
||||
; CHECK-NEXT: [[ST_B92:%.*]] = ptrtoint <9 x double>* [[C]] to i64
|
||||
; CHECK-NEXT: [[ST_E93:%.*]] = add nuw nsw i64 [[ST_B92]], 72
|
||||
; CHECK-NEXT: [[LD_B97:%.*]] = ptrtoint <6 x double>* [[A]] to i64
|
||||
; CHECK-NEXT: [[TMP36:%.*]] = icmp ugt i64 [[ST_E93]], [[LD_B97]]
|
||||
; CHECK-NEXT: br i1 [[TMP36]], label [[ALIAS_CONT94:%.*]], label [[NO_ALIAS96:%.*]]
|
||||
; CHECK: alias_cont92:
|
||||
; CHECK-NEXT: [[LD_E98:%.*]] = add nuw nsw i64 [[LD_B97]], 48
|
||||
; CHECK-NEXT: [[TMP37:%.*]] = icmp ugt i64 [[LD_E98]], [[ST_B92]]
|
||||
; CHECK-NEXT: br i1 [[TMP37]], label [[COPY95:%.*]], label [[NO_ALIAS96]]
|
||||
; CHECK: copy93:
|
||||
; CHECK-NEXT: [[TMP38:%.*]] = alloca <6 x double>, align 64
|
||||
; CHECK-NEXT: [[TMP39:%.*]] = bitcast <6 x double>* [[TMP38]] to i8*
|
||||
; CHECK-NEXT: [[TMP40:%.*]] = bitcast <6 x double>* [[A]] to i8*
|
||||
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 64 dereferenceable(48) [[TMP39]], i8* nonnull align 16 dereferenceable(48) [[TMP40]], i64 48, i1 false)
|
||||
; CHECK-NEXT: br label [[NO_ALIAS96]]
|
||||
; CHECK: no_alias94:
|
||||
; CHECK-NEXT: [[TMP41:%.*]] = phi <6 x double>* [ [[A]], [[END]] ], [ [[A]], [[ALIAS_CONT94]] ], [ [[TMP38]], [[COPY95]] ]
|
||||
; CHECK-NEXT: [[ST_B99:%.*]] = ptrtoint <9 x double>* [[C]] to i64
|
||||
; CHECK-NEXT: [[ST_E100:%.*]] = add nuw nsw i64 [[ST_B99]], 72
|
||||
; CHECK-NEXT: [[LD_B104:%.*]] = ptrtoint <6 x double>* [[B]] to i64
|
||||
; CHECK-NEXT: [[TMP42:%.*]] = icmp ugt i64 [[ST_E100]], [[LD_B104]]
|
||||
; CHECK-NEXT: br i1 [[TMP42]], label [[ALIAS_CONT101:%.*]], label [[NO_ALIAS103:%.*]]
|
||||
; CHECK: alias_cont99:
|
||||
; CHECK-NEXT: [[LD_E105:%.*]] = add nuw nsw i64 [[LD_B104]], 48
|
||||
; CHECK-NEXT: [[TMP43:%.*]] = icmp ugt i64 [[LD_E105]], [[ST_B99]]
|
||||
; CHECK-NEXT: br i1 [[TMP43]], label [[COPY102:%.*]], label [[NO_ALIAS103]]
|
||||
; CHECK: copy100:
|
||||
; CHECK-NEXT: [[TMP44:%.*]] = alloca <6 x double>, align 64
|
||||
; CHECK-NEXT: [[TMP45:%.*]] = bitcast <6 x double>* [[TMP44]] to i8*
|
||||
; CHECK-NEXT: [[TMP46:%.*]] = bitcast <6 x double>* [[B]] to i8*
|
||||
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 64 dereferenceable(48) [[TMP45]], i8* nonnull align 16 dereferenceable(48) [[TMP46]], i64 48, i1 false)
|
||||
; CHECK-NEXT: br label [[NO_ALIAS103]]
|
||||
; CHECK: no_alias101:
|
||||
; CHECK-NEXT: [[TMP47:%.*]] = phi <6 x double>* [ [[B]], [[NO_ALIAS96]] ], [ [[B]], [[ALIAS_CONT101]] ], [ [[TMP44]], [[COPY102]] ]
|
||||
; CHECK-NEXT: [[COL_CAST107:%.*]] = bitcast <6 x double>* [[TMP41]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD108:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST107]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP109:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP41]], i64 0, i64 3
|
||||
; CHECK-NEXT: [[COL_CAST110:%.*]] = bitcast double* [[COL_GEP109]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD111:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST110]], align 8
|
||||
; CHECK-NEXT: [[COL_CAST113:%.*]] = bitcast <6 x double>* [[TMP47]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD114:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST113]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP115:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP47]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST116:%.*]] = bitcast double* [[COL_GEP115]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD117:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST116]], align 8
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT120:%.*]] = shufflevector <2 x double> [[COL_LOAD114]], <2 x double> undef, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP48:%.*]] = fmul <2 x double> [[COL_LOAD108]], [[SPLAT_SPLAT120]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT123:%.*]] = shufflevector <2 x double> [[COL_LOAD114]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
|
||||
; CHECK-NEXT: [[TMP49:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD111]], <2 x double> [[SPLAT_SPLAT123]], <2 x double> [[TMP48]])
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT126:%.*]] = shufflevector <2 x double> [[COL_LOAD117]], <2 x double> undef, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP50:%.*]] = fmul <2 x double> [[COL_LOAD108]], [[SPLAT_SPLAT126]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT129:%.*]] = shufflevector <2 x double> [[COL_LOAD117]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
|
||||
; CHECK-NEXT: [[TMP51:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD111]], <2 x double> [[SPLAT_SPLAT129]], <2 x double> [[TMP50]])
|
||||
; CHECK-NEXT: [[COL_CAST131:%.*]] = bitcast <9 x double>* [[C]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP49]], <2 x double>* [[COL_CAST131]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP132:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 3
|
||||
; CHECK-NEXT: [[COL_CAST133:%.*]] = bitcast double* [[COL_GEP132]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP51]], <2 x double>* [[COL_CAST133]], align 8
|
||||
; CHECK-NEXT: [[TMP52:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP41]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST135:%.*]] = bitcast double* [[TMP52]] to <1 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD136:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST135]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP137:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP41]], i64 0, i64 5
|
||||
; CHECK-NEXT: [[COL_CAST138:%.*]] = bitcast double* [[COL_GEP137]] to <1 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD139:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST138]], align 8
|
||||
; CHECK-NEXT: [[COL_CAST141:%.*]] = bitcast <6 x double>* [[TMP47]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD142:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST141]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP143:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP47]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST144:%.*]] = bitcast double* [[COL_GEP143]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD145:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST144]], align 8
|
||||
; CHECK-NEXT: [[SPLAT_SPLATINSERT147:%.*]] = shufflevector <2 x double> [[COL_LOAD142]], <2 x double> undef, <1 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP53:%.*]] = fmul <1 x double> [[COL_LOAD136]], [[SPLAT_SPLATINSERT147]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLATINSERT150:%.*]] = shufflevector <2 x double> [[COL_LOAD142]], <2 x double> undef, <1 x i32> <i32 1>
|
||||
; CHECK-NEXT: [[TMP54:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD139]], <1 x double> [[SPLAT_SPLATINSERT150]], <1 x double> [[TMP53]])
|
||||
; CHECK-NEXT: [[SPLAT_SPLATINSERT153:%.*]] = shufflevector <2 x double> [[COL_LOAD145]], <2 x double> undef, <1 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP55:%.*]] = fmul <1 x double> [[COL_LOAD136]], [[SPLAT_SPLATINSERT153]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLATINSERT156:%.*]] = shufflevector <2 x double> [[COL_LOAD145]], <2 x double> undef, <1 x i32> <i32 1>
|
||||
; CHECK-NEXT: [[TMP56:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD139]], <1 x double> [[SPLAT_SPLATINSERT156]], <1 x double> [[TMP55]])
|
||||
; CHECK-NEXT: [[TMP57:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST159:%.*]] = bitcast double* [[TMP57]] to <1 x double>*
|
||||
; CHECK-NEXT: store <1 x double> [[TMP54]], <1 x double>* [[COL_CAST159]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP160:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 5
|
||||
; CHECK-NEXT: [[COL_CAST161:%.*]] = bitcast double* [[COL_GEP160]] to <1 x double>*
|
||||
; CHECK-NEXT: store <1 x double> [[TMP56]], <1 x double>* [[COL_CAST161]], align 8
|
||||
; CHECK-NEXT: [[COL_CAST163:%.*]] = bitcast <6 x double>* [[TMP41]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD164:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST163]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP165:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP41]], i64 0, i64 3
|
||||
; CHECK-NEXT: [[COL_CAST166:%.*]] = bitcast double* [[COL_GEP165]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD167:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST166]], align 8
|
||||
; CHECK-NEXT: [[TMP58:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP47]], i64 0, i64 4
|
||||
; CHECK-NEXT: [[COL_CAST169:%.*]] = bitcast double* [[TMP58]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD170:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST169]], align 8
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT173:%.*]] = shufflevector <2 x double> [[COL_LOAD170]], <2 x double> undef, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP59:%.*]] = fmul <2 x double> [[COL_LOAD164]], [[SPLAT_SPLAT173]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT176:%.*]] = shufflevector <2 x double> [[COL_LOAD170]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
|
||||
; CHECK-NEXT: [[TMP60:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD167]], <2 x double> [[SPLAT_SPLAT176]], <2 x double> [[TMP59]])
|
||||
; CHECK-NEXT: [[TMP61:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 6
|
||||
; CHECK-NEXT: [[COL_CAST178:%.*]] = bitcast double* [[TMP61]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP60]], <2 x double>* [[COL_CAST178]], align 8
|
||||
; CHECK-NEXT: [[TMP62:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP41]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST180:%.*]] = bitcast double* [[TMP62]] to <1 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD181:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST180]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP182:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP41]], i64 0, i64 5
|
||||
; CHECK-NEXT: [[COL_CAST183:%.*]] = bitcast double* [[COL_GEP182]] to <1 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD184:%.*]] = load <1 x double>, <1 x double>* [[COL_CAST183]], align 8
|
||||
; CHECK-NEXT: [[TMP63:%.*]] = getelementptr <6 x double>, <6 x double>* [[TMP47]], i64 0, i64 4
|
||||
; CHECK-NEXT: [[COL_CAST186:%.*]] = bitcast double* [[TMP63]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD187:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST186]], align 8
|
||||
; CHECK-NEXT: [[SPLAT_SPLATINSERT189:%.*]] = shufflevector <2 x double> [[COL_LOAD187]], <2 x double> undef, <1 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP64:%.*]] = fmul <1 x double> [[COL_LOAD181]], [[SPLAT_SPLATINSERT189]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLATINSERT192:%.*]] = shufflevector <2 x double> [[COL_LOAD187]], <2 x double> undef, <1 x i32> <i32 1>
|
||||
; CHECK-NEXT: [[TMP65:%.*]] = call <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD184]], <1 x double> [[SPLAT_SPLATINSERT192]], <1 x double> [[TMP64]])
|
||||
; CHECK-NEXT: [[TMP66:%.*]] = getelementptr <9 x double>, <9 x double>* [[C]], i64 0, i64 8
|
||||
; CHECK-NEXT: [[COL_CAST195:%.*]] = bitcast double* [[TMP66]] to <1 x double>*
|
||||
; CHECK-NEXT: store <1 x double> [[TMP65]], <1 x double>* [[COL_CAST195]], align 8
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
entry:
  %a = load <6 x double>, <6 x double>* %A, align 16
  %b = load <6 x double>, <6 x double>* %B, align 16
  %c = call <9 x double> @llvm.matrix.multiply(<6 x double> %a, <6 x double> %b, i32 3, i32 2, i32 3)
  store <9 x double> %c, <9 x double>* %C, align 16

  br i1 %cond, label %true, label %false

true:
  %a.add = fadd <6 x double> %a, %a
  store <6 x double> %a.add, <6 x double>* %A
  br label %end

false:
  %b.add = fadd <6 x double> %b, %b
  store <6 x double> %b.add, <6 x double>* %B
  br label %end

end:
  %a.2 = load <6 x double>, <6 x double>* %A, align 16
  %b.2 = load <6 x double>, <6 x double>* %B, align 16
  %c.2 = call <9 x double> @llvm.matrix.multiply(<6 x double> %a.2, <6 x double> %b.2, i32 3, i32 2, i32 3)
  store <9 x double> %c.2, <9 x double>* %C, align 16
  ret void
}

declare <9 x double> @llvm.matrix.multiply(<6 x double>, <6 x double>, i32, i32, i32)

@@ -0,0 +1,273 @@
; RUN: opt -lower-matrix-intrinsics -fuse-matrix-tile-size=2 -matrix-allow-contract -force-fuse-matrix -instcombine -verify-dom-info %s -S | FileCheck %s

; REQUIRES: aarch64-registered-target

target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
target triple = "aarch64-apple-ios"

define void @multiply(<16 x double> * %A, <16 x double> * %B, <16 x double>* %C) {
; CHECK-LABEL: @multiply(
; CHECK-NEXT: entry:
|
||||
; CHECK-NEXT: [[ST_B:%.*]] = ptrtoint <16 x double>* [[C:%.*]] to i64
|
||||
; CHECK-NEXT: [[ST_E:%.*]] = add nuw nsw i64 [[ST_B]], 128
|
||||
; CHECK-NEXT: [[LD_B:%.*]] = ptrtoint <16 x double>* [[A:%.*]] to i64
|
||||
; CHECK-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[ST_E]], [[LD_B]]
|
||||
; CHECK-NEXT: br i1 [[TMP0]], label [[ALIAS_CONT:%.*]], label [[NO_ALIAS:%.*]]
|
||||
; CHECK: alias_cont:
|
||||
; CHECK-NEXT: [[LD_E:%.*]] = add nuw nsw i64 [[LD_B]], 128
|
||||
; CHECK-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[LD_E]], [[ST_B]]
|
||||
; CHECK-NEXT: br i1 [[TMP1]], label [[COPY:%.*]], label [[NO_ALIAS]]
|
||||
; CHECK: copy:
|
||||
; CHECK-NEXT: [[TMP2:%.*]] = alloca <16 x double>, align 128
|
||||
; CHECK-NEXT: [[TMP3:%.*]] = bitcast <16 x double>* [[TMP2]] to i8*
|
||||
; CHECK-NEXT: [[TMP4:%.*]] = bitcast <16 x double>* [[A]] to i8*
|
||||
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 128 dereferenceable(128) [[TMP3]], i8* nonnull align 16 dereferenceable(128) [[TMP4]], i64 128, i1 false)
|
||||
; CHECK-NEXT: br label [[NO_ALIAS]]
|
||||
; CHECK: no_alias:
|
||||
; CHECK-NEXT: [[TMP5:%.*]] = phi <16 x double>* [ [[A]], [[ENTRY:%.*]] ], [ [[A]], [[ALIAS_CONT]] ], [ [[TMP2]], [[COPY]] ]
|
||||
; CHECK-NEXT: [[ST_B1:%.*]] = ptrtoint <16 x double>* [[C]] to i64
|
||||
; CHECK-NEXT: [[ST_E2:%.*]] = add nuw nsw i64 [[ST_B1]], 128
|
||||
; CHECK-NEXT: [[LD_B6:%.*]] = ptrtoint <16 x double>* [[B:%.*]] to i64
|
||||
; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[ST_E2]], [[LD_B6]]
|
||||
; CHECK-NEXT: br i1 [[TMP6]], label [[ALIAS_CONT3:%.*]], label [[NO_ALIAS5:%.*]]
|
||||
; CHECK: alias_cont1:
|
||||
; CHECK-NEXT: [[LD_E7:%.*]] = add nuw nsw i64 [[LD_B6]], 128
|
||||
; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i64 [[LD_E7]], [[ST_B1]]
|
||||
; CHECK-NEXT: br i1 [[TMP7]], label [[COPY4:%.*]], label [[NO_ALIAS5]]
|
||||
; CHECK: copy2:
|
||||
; CHECK-NEXT: [[TMP8:%.*]] = alloca <16 x double>, align 128
|
||||
; CHECK-NEXT: [[TMP9:%.*]] = bitcast <16 x double>* [[TMP8]] to i8*
|
||||
; CHECK-NEXT: [[TMP10:%.*]] = bitcast <16 x double>* [[B]] to i8*
|
||||
; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 128 dereferenceable(128) [[TMP9]], i8* nonnull align 16 dereferenceable(128) [[TMP10]], i64 128, i1 false)
|
||||
; CHECK-NEXT: br label [[NO_ALIAS5]]
|
||||
|
||||
; CHECK: no_alias3:
|
||||
; CHECK-NEXT: [[TMP11:%.*]] = phi <16 x double>* [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT3]] ], [ [[TMP8]], [[COPY4]] ]
|
||||
|
||||
;; np.dot(a[0:2, 0:2], b[0:2, 0:2])
|
||||
|
||||
; CHECK-NEXT: [[COL_CAST8:%.*]] = bitcast <16 x double>* [[TMP5]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST8]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 4
|
||||
; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST9]], align 8
|
||||
; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast <16 x double>* [[TMP11]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST12]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP14:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 4
|
||||
; CHECK-NEXT: [[COL_CAST15:%.*]] = bitcast double* [[COL_GEP14]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD16:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST15]], align 8
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD13]], <2 x double> undef, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP12:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT19:%.*]] = shufflevector <2 x double> [[COL_LOAD13]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
|
||||
; CHECK-NEXT: [[TMP13:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD10]], <2 x double> [[SPLAT_SPLAT19]], <2 x double> [[TMP12]])
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT22:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> undef, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP14:%.*]] = fmul <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT22]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT25:%.*]] = shufflevector <2 x double> [[COL_LOAD16]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
|
||||
; CHECK-NEXT: [[TMP15:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD10]], <2 x double> [[SPLAT_SPLAT25]], <2 x double> [[TMP14]])
|
||||
|
||||
;; + np.dot(a[0:2, 2:4], b[2:4, 0:2])
|
||||
|
||||
; CHECK-NEXT: [[TMP16:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 8
|
||||
; CHECK-NEXT: [[COL_CAST27:%.*]] = bitcast double* [[TMP16]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD28:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST27]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP29:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 12
|
||||
; CHECK-NEXT: [[COL_CAST30:%.*]] = bitcast double* [[COL_GEP29]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD31:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST30]], align 8
|
||||
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST33:%.*]] = bitcast double* [[TMP17]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD34:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST33]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP35:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 6
|
||||
; CHECK-NEXT: [[COL_CAST36:%.*]] = bitcast double* [[COL_GEP35]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD37:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST36]], align 8
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT41:%.*]] = shufflevector <2 x double> [[COL_LOAD34]], <2 x double> undef, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP18:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD28]], <2 x double> [[SPLAT_SPLAT41]], <2 x double> [[TMP13]])
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT44:%.*]] = shufflevector <2 x double> [[COL_LOAD34]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
|
||||
; CHECK-NEXT: [[TMP19:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD31]], <2 x double> [[SPLAT_SPLAT44]], <2 x double> [[TMP18]])
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT48:%.*]] = shufflevector <2 x double> [[COL_LOAD37]], <2 x double> undef, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP20:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD28]], <2 x double> [[SPLAT_SPLAT48]], <2 x double> [[TMP15]])
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT51:%.*]] = shufflevector <2 x double> [[COL_LOAD37]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
|
||||
; CHECK-NEXT: [[TMP21:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD31]], <2 x double> [[SPLAT_SPLAT51]], <2 x double> [[TMP20]])
|
||||
|
||||
;; -> c[0:2, 0:2]
|
||||
|
||||
; CHECK-NEXT: [[COL_CAST53:%.*]] = bitcast <16 x double>* [[C]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP19]], <2 x double>* [[COL_CAST53]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP54:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 4
|
||||
; CHECK-NEXT: [[COL_CAST55:%.*]] = bitcast double* [[COL_GEP54]] to <2 x double>*
|
||||
; CHECK-NEXT: store <2 x double> [[TMP21]], <2 x double>* [[COL_CAST55]], align 8
|
||||
|
||||
;; np.dot(a[2:4, 0:2], b[0:2, 0:2])
|
||||
|
||||
; CHECK-NEXT: [[TMP22:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST57:%.*]] = bitcast double* [[TMP22]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD58:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST57]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP59:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 6
|
||||
; CHECK-NEXT: [[COL_CAST60:%.*]] = bitcast double* [[COL_GEP59]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD61:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST60]], align 8
|
||||
; CHECK-NEXT: [[COL_CAST63:%.*]] = bitcast <16 x double>* [[TMP11]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD64:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST63]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP65:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 4
|
||||
; CHECK-NEXT: [[COL_CAST66:%.*]] = bitcast double* [[COL_GEP65]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD67:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST66]], align 8
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT70:%.*]] = shufflevector <2 x double> [[COL_LOAD64]], <2 x double> undef, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP23:%.*]] = fmul <2 x double> [[COL_LOAD58]], [[SPLAT_SPLAT70]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT73:%.*]] = shufflevector <2 x double> [[COL_LOAD64]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
|
||||
; CHECK-NEXT: [[TMP24:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD61]], <2 x double> [[SPLAT_SPLAT73]], <2 x double> [[TMP23]])
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT76:%.*]] = shufflevector <2 x double> [[COL_LOAD67]], <2 x double> undef, <2 x i32> zeroinitializer
|
||||
; CHECK-NEXT: [[TMP25:%.*]] = fmul <2 x double> [[COL_LOAD58]], [[SPLAT_SPLAT76]]
|
||||
; CHECK-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD67]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
|
||||
; CHECK-NEXT: [[TMP26:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD61]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP25]])
|
||||
|
||||
;; + np.dot(a[2:4, 2:4], b[2:4, 0:2])
|
||||
|
||||
; CHECK-NEXT: [[TMP27:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 10
|
||||
; CHECK-NEXT: [[COL_CAST81:%.*]] = bitcast double* [[TMP27]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD82:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST81]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP83:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 14
|
||||
; CHECK-NEXT: [[COL_CAST84:%.*]] = bitcast double* [[COL_GEP83]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD85:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST84]], align 8
|
||||
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 2
|
||||
; CHECK-NEXT: [[COL_CAST87:%.*]] = bitcast double* [[TMP28]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD88:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST87]], align 8
|
||||
; CHECK-NEXT: [[COL_GEP89:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 6
|
||||
; CHECK-NEXT: [[COL_CAST90:%.*]] = bitcast double* [[COL_GEP89]] to <2 x double>*
|
||||
; CHECK-NEXT: [[COL_LOAD91:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST90]], align 8
; CHECK-NEXT: [[SPLAT_SPLAT95:%.*]] = shufflevector <2 x double> [[COL_LOAD88]], <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP29:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD82]], <2 x double> [[SPLAT_SPLAT95]], <2 x double> [[TMP24]])
; CHECK-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD88]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP30:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD85]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP29]])
; CHECK-NEXT: [[SPLAT_SPLAT102:%.*]] = shufflevector <2 x double> [[COL_LOAD91]], <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP31:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD82]], <2 x double> [[SPLAT_SPLAT102]], <2 x double> [[TMP26]])
; CHECK-NEXT: [[SPLAT_SPLAT105:%.*]] = shufflevector <2 x double> [[COL_LOAD91]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP32:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD85]], <2 x double> [[SPLAT_SPLAT105]], <2 x double> [[TMP31]])
;; -> c[2:4, 0:2]
; CHECK-NEXT: [[TMP33:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 2
; CHECK-NEXT: [[COL_CAST107:%.*]] = bitcast double* [[TMP33]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP30]], <2 x double>* [[COL_CAST107]], align 8
; CHECK-NEXT: [[COL_GEP108:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 6
; CHECK-NEXT: [[COL_CAST109:%.*]] = bitcast double* [[COL_GEP108]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP32]], <2 x double>* [[COL_CAST109]], align 8
;; np.dot(a[0:2, 0:2], b[0:2, 2:4])
; CHECK-NEXT: [[COL_CAST111:%.*]] = bitcast <16 x double>* [[TMP5]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD112:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST111]], align 8
; CHECK-NEXT: [[COL_GEP113:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 4
; CHECK-NEXT: [[COL_CAST114:%.*]] = bitcast double* [[COL_GEP113]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD115:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST114]], align 8
; CHECK-NEXT: [[TMP34:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 8
; CHECK-NEXT: [[COL_CAST117:%.*]] = bitcast double* [[TMP34]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD118:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST117]], align 8
; CHECK-NEXT: [[COL_GEP119:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 12
; CHECK-NEXT: [[COL_CAST120:%.*]] = bitcast double* [[COL_GEP119]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD121:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST120]], align 8
; CHECK-NEXT: [[SPLAT_SPLAT124:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP35:%.*]] = fmul <2 x double> [[COL_LOAD112]], [[SPLAT_SPLAT124]]
; CHECK-NEXT: [[SPLAT_SPLAT127:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP36:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD115]], <2 x double> [[SPLAT_SPLAT127]], <2 x double> [[TMP35]])
; CHECK-NEXT: [[SPLAT_SPLAT130:%.*]] = shufflevector <2 x double> [[COL_LOAD121]], <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP37:%.*]] = fmul <2 x double> [[COL_LOAD112]], [[SPLAT_SPLAT130]]
; CHECK-NEXT: [[SPLAT_SPLAT133:%.*]] = shufflevector <2 x double> [[COL_LOAD121]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP38:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD115]], <2 x double> [[SPLAT_SPLAT133]], <2 x double> [[TMP37]])
;; + np.dot(a[0:2, 2:4], b[2:4, 2:4])
; CHECK-NEXT: [[TMP39:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 8
; CHECK-NEXT: [[COL_CAST135:%.*]] = bitcast double* [[TMP39]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD136:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST135]], align 8
; CHECK-NEXT: [[COL_GEP137:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 12
; CHECK-NEXT: [[COL_CAST138:%.*]] = bitcast double* [[COL_GEP137]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD139:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST138]], align 8
; CHECK-NEXT: [[TMP40:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 10
; CHECK-NEXT: [[COL_CAST141:%.*]] = bitcast double* [[TMP40]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD142:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST141]], align 8
; CHECK-NEXT: [[COL_GEP143:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 14
; CHECK-NEXT: [[COL_CAST144:%.*]] = bitcast double* [[COL_GEP143]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD145:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST144]], align 8
; CHECK-NEXT: [[SPLAT_SPLAT149:%.*]] = shufflevector <2 x double> [[COL_LOAD142]], <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP41:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD136]], <2 x double> [[SPLAT_SPLAT149]], <2 x double> [[TMP36]])
; CHECK-NEXT: [[SPLAT_SPLAT152:%.*]] = shufflevector <2 x double> [[COL_LOAD142]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP42:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD139]], <2 x double> [[SPLAT_SPLAT152]], <2 x double> [[TMP41]])
; CHECK-NEXT: [[SPLAT_SPLAT156:%.*]] = shufflevector <2 x double> [[COL_LOAD145]], <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP43:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD136]], <2 x double> [[SPLAT_SPLAT156]], <2 x double> [[TMP38]])
; CHECK-NEXT: [[SPLAT_SPLAT159:%.*]] = shufflevector <2 x double> [[COL_LOAD145]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP44:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD139]], <2 x double> [[SPLAT_SPLAT159]], <2 x double> [[TMP43]])
;; -> c[0:2, 2:4]
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 8
; CHECK-NEXT: [[COL_CAST161:%.*]] = bitcast double* [[TMP45]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP42]], <2 x double>* [[COL_CAST161]], align 8
; CHECK-NEXT: [[COL_GEP162:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 12
; CHECK-NEXT: [[COL_CAST163:%.*]] = bitcast double* [[COL_GEP162]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP44]], <2 x double>* [[COL_CAST163]], align 8
;; np.dot(a[2:4, 0:2], b[0:2, 2:4])
; CHECK-NEXT: [[TMP46:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 2
; CHECK-NEXT: [[COL_CAST165:%.*]] = bitcast double* [[TMP46]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD166:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST165]], align 8
; CHECK-NEXT: [[COL_GEP167:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 6
; CHECK-NEXT: [[COL_CAST168:%.*]] = bitcast double* [[COL_GEP167]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD169:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST168]], align 8
; CHECK-NEXT: [[TMP47:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 8
; CHECK-NEXT: [[COL_CAST171:%.*]] = bitcast double* [[TMP47]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD172:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST171]], align 8
; CHECK-NEXT: [[COL_GEP173:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 12
; CHECK-NEXT: [[COL_CAST174:%.*]] = bitcast double* [[COL_GEP173]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD175:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST174]], align 8
; CHECK-NEXT: [[SPLAT_SPLAT178:%.*]] = shufflevector <2 x double> [[COL_LOAD172]], <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP48:%.*]] = fmul <2 x double> [[COL_LOAD166]], [[SPLAT_SPLAT178]]
; CHECK-NEXT: [[SPLAT_SPLAT181:%.*]] = shufflevector <2 x double> [[COL_LOAD172]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP49:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD169]], <2 x double> [[SPLAT_SPLAT181]], <2 x double> [[TMP48]])
; CHECK-NEXT: [[SPLAT_SPLAT184:%.*]] = shufflevector <2 x double> [[COL_LOAD175]], <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP50:%.*]] = fmul <2 x double> [[COL_LOAD166]], [[SPLAT_SPLAT184]]
; CHECK-NEXT: [[SPLAT_SPLAT187:%.*]] = shufflevector <2 x double> [[COL_LOAD175]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP51:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD169]], <2 x double> [[SPLAT_SPLAT187]], <2 x double> [[TMP50]])
;; + np.dot(a[2:4, 2:4], b[2:4, 2:4])
; CHECK-NEXT: [[TMP52:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 10
; CHECK-NEXT: [[COL_CAST189:%.*]] = bitcast double* [[TMP52]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD190:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST189]], align 8
; CHECK-NEXT: [[COL_GEP191:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP5]], i64 0, i64 14
; CHECK-NEXT: [[COL_CAST192:%.*]] = bitcast double* [[COL_GEP191]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD193:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST192]], align 8
; CHECK-NEXT: [[TMP53:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 10
; CHECK-NEXT: [[COL_CAST195:%.*]] = bitcast double* [[TMP53]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD196:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST195]], align 8
; CHECK-NEXT: [[COL_GEP197:%.*]] = getelementptr <16 x double>, <16 x double>* [[TMP11]], i64 0, i64 14
; CHECK-NEXT: [[COL_CAST198:%.*]] = bitcast double* [[COL_GEP197]] to <2 x double>*
; CHECK-NEXT: [[COL_LOAD199:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST198]], align 8
; CHECK-NEXT: [[SPLAT_SPLAT203:%.*]] = shufflevector <2 x double> [[COL_LOAD196]], <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP54:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD190]], <2 x double> [[SPLAT_SPLAT203]], <2 x double> [[TMP49]])
; CHECK-NEXT: [[SPLAT_SPLAT206:%.*]] = shufflevector <2 x double> [[COL_LOAD196]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP55:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD193]], <2 x double> [[SPLAT_SPLAT206]], <2 x double> [[TMP54]])
; CHECK-NEXT: [[SPLAT_SPLAT210:%.*]] = shufflevector <2 x double> [[COL_LOAD199]], <2 x double> undef, <2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP56:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD190]], <2 x double> [[SPLAT_SPLAT210]], <2 x double> [[TMP51]])
; CHECK-NEXT: [[SPLAT_SPLAT213:%.*]] = shufflevector <2 x double> [[COL_LOAD199]], <2 x double> undef, <2 x i32> <i32 1, i32 1>
; CHECK-NEXT: [[TMP57:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD193]], <2 x double> [[SPLAT_SPLAT213]], <2 x double> [[TMP56]])
;; -> c[2:4, 2:4]
; CHECK-NEXT: [[TMP58:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 10
; CHECK-NEXT: [[COL_CAST215:%.*]] = bitcast double* [[TMP58]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP55]], <2 x double>* [[COL_CAST215]], align 8
; CHECK-NEXT: [[COL_GEP216:%.*]] = getelementptr <16 x double>, <16 x double>* [[C]], i64 0, i64 14
; CHECK-NEXT: [[COL_CAST217:%.*]] = bitcast double* [[COL_GEP216]] to <2 x double>*
; CHECK-NEXT: store <2 x double> [[TMP57]], <2 x double>* [[COL_CAST217]], align 8
; CHECK-NEXT: ret void
;
entry:
%a = load <16 x double>, <16 x double>* %A, align 16
%b = load <16 x double>, <16 x double>* %B, align 16
%c = call <16 x double> @llvm.matrix.multiply(<16 x double> %a, <16 x double> %b, i32 4, i32 4, i32 4)
store <16 x double> %c, <16 x double>* %C, align 16
ret void
}
declare <16 x double> @llvm.matrix.multiply(<16 x double>, <16 x double>, i32, i32, i32)
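The ";;" annotations above spell out the tiling the fused lowering produces for this 4x4 multiply: each 2x2 tile of c is the sum of two 2x2 sub-products of a and b. As a rough illustration only (not part of the test; the NumPy form and variable names are mine), the same decomposition looks like this:

import numpy as np

# Illustrative 4x4 operands; the pass itself works on column-major
# <16 x double> vectors, but the tile arithmetic is identical.
a = np.random.rand(4, 4)
b = np.random.rand(4, 4)
c = np.zeros((4, 4))

# 2x2 tiles: c[i:i+2, j:j+2] accumulates np.dot() of the matching tiles of
# a and b, mirroring the ";; np.dot(...)" / ";; -> c[...]" comments above.
for i in (0, 2):          # tile row offset into c (and a)
    for j in (0, 2):      # tile column offset into c (and b)
        acc = np.zeros((2, 2))
        for k in (0, 2):  # shared (reduction) dimension
            acc += np.dot(a[i:i+2, k:k+2], b[k:k+2, j:j+2])
        c[i:i+2, j:j+2] = acc

assert np.allclose(c, a @ b)

Each (i, j) iteration corresponds to one of the four ";; -> c[...]" store groups checked above; the inner k loop corresponds to the two np.dot terms that are accumulated before the tile is stored.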