forked from OSchip/llvm-project
[Matrix] Add optimization remarks for matrix expression.
Generate remarks for matrix operations in a function. To generate remarks for matrix expressions, the following approach is used: 1. Collect leaves of matrix expressions (done in RemarkGenerator::getExpressionLeaves). Leaves are lowered matrix instructions without other matrix users (like stores). 2. For each leaf, create a remark containing a linearized version of the matrix expression. The following improvements will be submitted as follow-ups: * Summarize number of vector instructions generated for each expression. * Account for shared sub-expressions. * Propagate matrix remarks up the inlining chain. The information provided by the matrix remarks helps users to spot cases where matrix expressions got split up, e.g. due to inlining not happening. The remarks allow users to address those issues, ensuring best performance. Reviewers: anemet, Gerolf, thegameg, hfinkel, andrew.w.kaylor, LuoYuanke Reviewed By: anemet Differential Revision: https://reviews.llvm.org/D72453
This commit is contained in:
parent
6874dfce3a
commit
949294f396
|
@ -10,7 +10,8 @@
|
|||
//
|
||||
// TODO:
|
||||
// * Implement multiply & add fusion
|
||||
// * Add remark, summarizing the available matrix optimization opportunities.
|
||||
// * Add remark, summarizing the available matrix optimization opportunities
|
||||
// (WIP).
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
@ -18,7 +19,9 @@
|
|||
#include "llvm/ADT/GraphTraits.h"
|
||||
#include "llvm/ADT/PostOrderIterator.h"
|
||||
#include "llvm/ADT/SmallVector.h"
|
||||
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
|
||||
#include "llvm/Analysis/TargetTransformInfo.h"
|
||||
#include "llvm/Analysis/ValueTracking.h"
|
||||
#include "llvm/Analysis/VectorUtils.h"
|
||||
#include "llvm/IR/CFG.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
|
@ -136,6 +139,7 @@ class LowerMatrixIntrinsics {
|
|||
Function &Func;
|
||||
const DataLayout &DL;
|
||||
const TargetTransformInfo &TTI;
|
||||
OptimizationRemarkEmitter &ORE;
|
||||
|
||||
/// Wrapper class representing a matrix as a set of column vectors.
|
||||
/// All column vectors must have the same vector type.
|
||||
|
@ -213,11 +217,12 @@ class LowerMatrixIntrinsics {
|
|||
SmallVector<Instruction *, 16> ToRemove;
|
||||
|
||||
/// Map from instructions to their produced column matrix.
|
||||
DenseMap<Value *, ColumnMatrixTy> Inst2ColumnMatrix;
|
||||
MapVector<Value *, ColumnMatrixTy> Inst2ColumnMatrix;
|
||||
|
||||
public:
|
||||
LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI)
|
||||
: Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI) {}
|
||||
LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
|
||||
OptimizationRemarkEmitter &ORE)
|
||||
: Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), ORE(ORE) {}
|
||||
|
||||
/// Return the set of column vectors that a matrix value is lowered to.
|
||||
///
|
||||
|
@ -509,6 +514,9 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, DL);
|
||||
RemarkGen.emitRemarks();
|
||||
|
||||
for (Instruction *Inst : reverse(ToRemove))
|
||||
Inst->eraseFromParent();
|
||||
|
||||
|
@ -599,6 +607,7 @@ public:
|
|||
Shape.NumRows, VType->getElementType(), Builder);
|
||||
createColumnStore(C.value(), GEP, VType->getElementType(), Builder);
|
||||
}
|
||||
Inst2ColumnMatrix[Inst] = ColumnMatrixTy();
|
||||
|
||||
ToRemove.push_back(Inst);
|
||||
}
|
||||
|
@ -844,13 +853,301 @@ public:
|
|||
finalizeLowering(Inst, Result, Builder);
|
||||
return true;
|
||||
}
|
||||
|
||||
/// Helper to linearize a matrix expression tree into a string. Currently
|
||||
/// matrix expressions are linarized by starting at an expression leaf and
|
||||
/// linearizing bottom up.
|
||||
struct ExprLinearizer {
|
||||
unsigned LengthToBreak = 100;
|
||||
std::string Str;
|
||||
raw_string_ostream Stream;
|
||||
unsigned LineLength = 0;
|
||||
const DataLayout &DL;
|
||||
|
||||
/// Mapping from instructions to column matrixes. It is used to identify
|
||||
/// matrix instructions.
|
||||
const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix;
|
||||
|
||||
/// Used to keep track of sub-expressions that get reused while linearizing
|
||||
/// the expression. Re-used sub-expressions are marked as (reused).
|
||||
SmallPtrSet<Value *, 8> ReusedExprs;
|
||||
|
||||
ExprLinearizer(const DataLayout &DL,
|
||||
const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix)
|
||||
: Str(), Stream(Str), DL(DL), Inst2ColumnMatrix(Inst2ColumnMatrix) {}
|
||||
|
||||
void indent(unsigned N) {
|
||||
LineLength += N;
|
||||
for (unsigned i = 0; i < N; i++)
|
||||
Stream << " ";
|
||||
}
|
||||
|
||||
void lineBreak() {
|
||||
Stream << "\n";
|
||||
LineLength = 0;
|
||||
}
|
||||
|
||||
void maybeIndent(unsigned Indent) {
|
||||
if (LineLength >= LengthToBreak)
|
||||
lineBreak();
|
||||
|
||||
if (LineLength == 0)
|
||||
indent(Indent);
|
||||
}
|
||||
|
||||
void write(const std::string &S) {
|
||||
LineLength += S.size();
|
||||
Stream << S;
|
||||
}
|
||||
|
||||
Value *getUnderlyingObjectThroughLoads(Value *V) {
|
||||
if (Value *Ptr = getPointerOperand(V))
|
||||
return getUnderlyingObjectThroughLoads(Ptr);
|
||||
else if (V->getType()->isPointerTy())
|
||||
return GetUnderlyingObject(V, DL);
|
||||
return V;
|
||||
}
|
||||
|
||||
/// Returns true if \p V is a matrix value.
|
||||
bool isMatrix(Value *V) const {
|
||||
return Inst2ColumnMatrix.find(V) != Inst2ColumnMatrix.end();
|
||||
}
|
||||
|
||||
/// If \p V is a matrix value, print its shape as as NumRows x NumColumns to
|
||||
/// \p SS.
|
||||
void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {
|
||||
auto M = Inst2ColumnMatrix.find(V);
|
||||
if (M == Inst2ColumnMatrix.end())
|
||||
SS << "unknown";
|
||||
else {
|
||||
SS << M->second.getNumRows();
|
||||
SS << "x";
|
||||
SS << M->second.getNumColumns();
|
||||
}
|
||||
}
|
||||
|
||||
/// Write the called function name. Handles calls to llvm.matrix.*
|
||||
/// specially: we write the name, followed by the dimensions of the input
|
||||
/// matrixes, followed by the scalar type name.
|
||||
void writeFnName(CallInst *CI) {
|
||||
if (!CI->getCalledFunction())
|
||||
write("<no called fn>");
|
||||
else {
|
||||
StringRef Name = CI->getCalledFunction()->getName();
|
||||
if (!Name.startswith("llvm.matrix")) {
|
||||
write(Name);
|
||||
return;
|
||||
}
|
||||
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
|
||||
write(StringRef(Intrinsic::getName(II->getIntrinsicID(), {}))
|
||||
.drop_front(StringRef("llvm.matrix.").size()));
|
||||
write(".");
|
||||
std::string Tmp = "";
|
||||
raw_string_ostream SS(Tmp);
|
||||
|
||||
switch (II->getIntrinsicID()) {
|
||||
case Intrinsic::matrix_multiply:
|
||||
prettyPrintMatrixType(II->getOperand(0), SS);
|
||||
SS << ".";
|
||||
prettyPrintMatrixType(II->getOperand(1), SS);
|
||||
SS << "." << *II->getType()->getScalarType();
|
||||
break;
|
||||
case Intrinsic::matrix_transpose:
|
||||
prettyPrintMatrixType(II->getOperand(0), SS);
|
||||
SS << "." << *II->getType()->getScalarType();
|
||||
break;
|
||||
case Intrinsic::matrix_columnwise_load:
|
||||
prettyPrintMatrixType(II, SS);
|
||||
SS << "." << *II->getType()->getScalarType();
|
||||
break;
|
||||
case Intrinsic::matrix_columnwise_store:
|
||||
prettyPrintMatrixType(II->getOperand(0), SS);
|
||||
SS << "." << *II->getOperand(0)->getType()->getScalarType();
|
||||
break;
|
||||
default:
|
||||
llvm_unreachable("Unhandled case");
|
||||
}
|
||||
SS.flush();
|
||||
write(Tmp);
|
||||
}
|
||||
}
|
||||
|
||||
unsigned getNumShapeArgs(CallInst *CI) const {
|
||||
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
|
||||
switch (II->getIntrinsicID()) {
|
||||
case Intrinsic::matrix_multiply:
|
||||
return 3;
|
||||
case Intrinsic::matrix_transpose:
|
||||
case Intrinsic::matrix_columnwise_load:
|
||||
case Intrinsic::matrix_columnwise_store:
|
||||
return 2;
|
||||
default:
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/// Special printing for values: for pointers, we print if they refer to an
|
||||
/// (function) external address or a stack address, for other values we
|
||||
/// either print the constant or "scalar"/"matrix" for other values.
|
||||
void write(Value *V) {
|
||||
V = getUnderlyingObjectThroughLoads(V);
|
||||
if (V->getType()->isPointerTy()) {
|
||||
if (isa<AllocaInst>(V)) {
|
||||
Stream << "stack addr";
|
||||
LineLength += StringRef("stack addr").size();
|
||||
} else {
|
||||
Stream << "addr";
|
||||
LineLength += StringRef("addr").size();
|
||||
}
|
||||
if (!V->getName().empty()) {
|
||||
Stream << " %" << V->getName() << "";
|
||||
LineLength += V->getName().size() + 2;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
std::string Tmp;
|
||||
raw_string_ostream TmpStream(Tmp);
|
||||
|
||||
if (auto *CI = dyn_cast<ConstantInt>(V))
|
||||
TmpStream << CI->getValue();
|
||||
else if (isa<Constant>(V))
|
||||
TmpStream << "constant";
|
||||
else {
|
||||
if (isMatrix(V))
|
||||
TmpStream << "matrix";
|
||||
else
|
||||
TmpStream << "scalar";
|
||||
}
|
||||
TmpStream.flush();
|
||||
Tmp = StringRef(Tmp).trim();
|
||||
LineLength += Tmp.size();
|
||||
Stream << Tmp;
|
||||
}
|
||||
|
||||
/// Linearize expression \p Expr starting at an indentation of \p Indent.
|
||||
/// Expressions that are re-used multiple times are prefixed with (reused)
|
||||
/// at the re-used root instruction.
|
||||
void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused) {
|
||||
auto *I = cast<Instruction>(Expr);
|
||||
maybeIndent(Indent);
|
||||
SmallVector<Value *, 8> Ops;
|
||||
|
||||
bool Reused = !ReusedExprs.insert(Expr).second;
|
||||
if (Reused && !ParentReused)
|
||||
write("(reused) ");
|
||||
|
||||
if (auto *CI = dyn_cast<CallInst>(I)) {
|
||||
writeFnName(CI);
|
||||
|
||||
Ops.append(CallSite(CI).arg_begin(),
|
||||
CallSite(CI).arg_end() - getNumShapeArgs(CI));
|
||||
} else if (isa<BitCastInst>(Expr)) {
|
||||
// Special case bitcasts, which are used to materialize matrixes from
|
||||
// non-matrix ops.
|
||||
write("matrix");
|
||||
return;
|
||||
} else {
|
||||
Ops.append(I->value_op_begin(), I->value_op_end());
|
||||
write(std::string(I->getOpcodeName()));
|
||||
}
|
||||
|
||||
write(std::string("("));
|
||||
|
||||
unsigned NumOpsToBreak = 1;
|
||||
if (match(Expr, m_Intrinsic<Intrinsic::matrix_columnwise_load>()))
|
||||
NumOpsToBreak = 2;
|
||||
|
||||
for (Value *Op : Ops) {
|
||||
if (Ops.size() > NumOpsToBreak)
|
||||
lineBreak();
|
||||
|
||||
maybeIndent(Indent + 1);
|
||||
if (isMatrix(Op))
|
||||
linearizeExpr(Op, Indent + 1, Reused);
|
||||
else
|
||||
write(Op);
|
||||
if (Op != Ops.back())
|
||||
write(", ");
|
||||
}
|
||||
|
||||
write(")");
|
||||
}
|
||||
|
||||
const std::string &getResult() {
|
||||
Stream.flush();
|
||||
return Str;
|
||||
}
|
||||
};
|
||||
|
||||
/// Generate remarks for matrix operations in a function. To generate remarks
|
||||
/// for matrix expressions, the following approach is used:
|
||||
/// 1. Collect leafs of matrix expressions (done in
|
||||
/// RemarkGenerator::getExpressionLeaves). Leaves are lowered matrix
|
||||
/// instructions without other matrix users (like stores).
|
||||
///
|
||||
/// 2. For each leaf, create a remark containing a linearizied version of the
|
||||
/// matrix expression.
|
||||
///
|
||||
/// TODO:
|
||||
/// * Summarize number of vector instructions generated for each expression.
|
||||
/// * Account for shared sub-expressions.
|
||||
/// * Propagate matrix remarks up the inlining chain.
|
||||
struct RemarkGenerator {
|
||||
const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix;
|
||||
OptimizationRemarkEmitter &ORE;
|
||||
const DataLayout &DL;
|
||||
|
||||
RemarkGenerator(const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix,
|
||||
OptimizationRemarkEmitter &ORE, const DataLayout &DL)
|
||||
: Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), DL(DL) {}
|
||||
|
||||
/// Return all leafs of matrix expressions. Those are instructions in
|
||||
/// Inst2ColumnMatrix returing void. Currently that should only include
|
||||
/// stores.
|
||||
SmallVector<Value *, 4> getExpressionLeaves() {
|
||||
SmallVector<Value *, 4> Leaves;
|
||||
for (auto &KV : Inst2ColumnMatrix)
|
||||
if (KV.first->getType()->isVoidTy())
|
||||
Leaves.push_back(KV.first);
|
||||
|
||||
return Leaves;
|
||||
}
|
||||
|
||||
void emitRemarks() {
|
||||
if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
|
||||
return;
|
||||
|
||||
// Find leafs of matrix expressions.
|
||||
auto Leaves = getExpressionLeaves();
|
||||
|
||||
// Generate remarks for each leaf.
|
||||
for (auto *L : Leaves) {
|
||||
OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered",
|
||||
cast<Instruction>(L)->getDebugLoc(),
|
||||
cast<Instruction>(L)->getParent());
|
||||
Rem << "Lowered matrix expression ";
|
||||
Rem << ("\n" + linearize(L, DL));
|
||||
ORE.emit(Rem);
|
||||
}
|
||||
}
|
||||
|
||||
std::string linearize(Value *L, const DataLayout &DL) {
|
||||
ExprLinearizer Lin(DL, Inst2ColumnMatrix);
|
||||
Lin.linearizeExpr(L, 0, false);
|
||||
return Lin.getResult();
|
||||
}
|
||||
};
|
||||
};
|
||||
} // namespace
|
||||
|
||||
PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
|
||||
FunctionAnalysisManager &AM) {
|
||||
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
|
||||
LowerMatrixIntrinsics LMT(F, TTI);
|
||||
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
|
||||
LowerMatrixIntrinsics LMT(F, TTI, ORE);
|
||||
if (LMT.Visit()) {
|
||||
PreservedAnalyses PA;
|
||||
PA.preserveSet<CFGAnalyses>();
|
||||
|
@ -871,14 +1168,16 @@ public:
|
|||
}
|
||||
|
||||
bool runOnFunction(Function &F) override {
|
||||
auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
|
||||
LowerMatrixIntrinsics LMT(F, *TTI);
|
||||
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
|
||||
auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
|
||||
LowerMatrixIntrinsics LMT(F, TTI, ORE);
|
||||
bool C = LMT.Visit();
|
||||
return C;
|
||||
}
|
||||
|
||||
/// Declare the analyses this legacy pass depends on: target transform info
/// and the optimization remark emitter (both are fetched in runOnFunction).
/// The pass is also marked as preserving the CFG.
void getAnalysisUsage(AnalysisUsage &AU) const override {
|
||||
AU.addRequired<TargetTransformInfoWrapperPass>();
|
||||
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
|
||||
AU.setPreservesCFG();
|
||||
}
|
||||
};
|
||||
|
@ -888,6 +1187,7 @@ static const char pass_name[] = "Lower the matrix intrinsics";
|
|||
char LowerMatrixIntrinsicsLegacyPass::ID = 0;
|
||||
INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
|
||||
false, false)
|
||||
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
|
||||
INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
|
||||
false, false)
|
||||
|
||||
|
|
|
@ -0,0 +1,195 @@
|
|||
; RUN: opt -lower-matrix-intrinsics -pass-remarks=lower-matrix-intrinsics < %s 2>&1 | FileCheck %s
|
||||
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "aarch64-apple-ios"
|
||||
|
||||
; CHECK-LABEL: remark: test.h:40:20: Lowered matrix expression
|
||||
; CHECK-NEXT: store(
|
||||
; CHECK-NEXT: transpose.2x6.double(load(addr %A)),
|
||||
; CHECK-NEXT: addr %B)
|
||||
define void @transpose(<12 x double>* %A, <12 x double>* %B) !dbg !23 {
|
||||
%load = load <12 x double>, <12 x double>* %A, !dbg !24
|
||||
%t = call <12 x double> @llvm.matrix.transpose.v12f64.v12f64(<12 x double> %load, i32 2, i32 6), !dbg !24
|
||||
store <12 x double> %t, <12 x double>* %B, !dbg !24
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <12 x double> @llvm.matrix.transpose.v12f64.v12f64(<12 x double>, i32, i32)
|
||||
|
||||
|
||||
; CHECK-LABEL: remark: test.h:50:20: Lowered matrix expression
|
||||
; CHECK-NEXT: store(
|
||||
; CHECK-NEXT: multiply.2x6.6x2.double(
|
||||
; CHECK-NEXT: load(addr %A),
|
||||
; CHECK-NEXT: load(addr %B)),
|
||||
; CHECK-NEXT: addr %C)
|
||||
define void @multiply(<12 x double>* %A, <12 x double>* %B, <4 x double>* %C) !dbg !25 {
|
||||
%A.matrix = load <12 x double>, <12 x double>* %A, !dbg !26
|
||||
%B.matrix = load <12 x double>, <12 x double>* %B, !dbg !26
|
||||
%t = call <4 x double> @llvm.matrix.multiply(<12 x double> %A.matrix, <12 x double> %B.matrix, i32 2, i32 6, i32 2), !dbg !26
|
||||
store <4 x double> %t, <4 x double>* %C, !dbg !26
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <4 x double> @llvm.matrix.multiply(<12 x double>, <12 x double>, i32, i32, i32)
|
||||
|
||||
; CHECK-LABEL: remark: test.h:60:20: Lowered matrix expression
|
||||
; CHECK-NEXT: store(
|
||||
; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5),
|
||||
; CHECK-NEXT: addr %B)
|
||||
define void @columnwise.load(<9 x double>* %A, <9 x double>* %B) !dbg !27 {
|
||||
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !28
|
||||
store <9 x double> %A.matrix, <9 x double>* %B, !dbg !28
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <9 x double> @llvm.matrix.columnwise.load(<9 x double>*, i32, i32, i32)
|
||||
|
||||
; CHECK-LABEL: remark: test.h:70:20: Lowered matrix expression
|
||||
; CHECK-NEXT: columnwise.store.3x3.double(
|
||||
; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5),
|
||||
; CHECK-NEXT: addr %B,
|
||||
; CHECK-NEXT: 10)
|
||||
define void @columnwise.store(<9 x double>* %A, <9 x double>* %B) !dbg !29 {
|
||||
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !30
|
||||
call void @llvm.matrix.columnwise.store(<9 x double> %A.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !30
|
||||
ret void
|
||||
}
|
||||
|
||||
declare void @llvm.matrix.columnwise.store(<9 x double>, <9 x double>*, i32, i32, i32)
|
||||
|
||||
; CHECK-LABEL: remark: test.h:80:20: Lowered matrix expression
|
||||
; CHECK-NEXT: columnwise.store.3x3.double(
|
||||
; CHECK-NEXT: fmul(
|
||||
; CHECK-NEXT: fadd(
|
||||
; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5)
|
||||
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
|
||||
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
|
||||
; CHECK-NEXT: addr %B,
|
||||
; CHECK-NEXT: 10)
|
||||
|
||||
define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 {
|
||||
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !32
|
||||
%R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
|
||||
%R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
|
||||
call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !32
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: remark: test.h:90:20: Lowered matrix expression
|
||||
; CHECK-NEXT: columnwise.store.3x3.double(
|
||||
; CHECK-NEXT: fmul(
|
||||
; CHECK-NEXT: fadd(
|
||||
; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5)
|
||||
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
|
||||
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
|
||||
; CHECK-NEXT: addr %B,
|
||||
; CHECK-NEXT: 10)
|
||||
; CHECK-NEXT: remark: test.h:90:20: Lowered matrix expression
|
||||
; CHECK-NEXT: store(
|
||||
; CHECK-NEXT: multiply.2x6.6x2.double(
|
||||
; CHECK-NEXT: load(addr %C),
|
||||
; CHECK-NEXT: load(addr %D)),
|
||||
; CHECK-NEXT: addr %E)
|
||||
|
||||
define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x double>* %C, <12 x double>* %D, <4 x double>* %E) !dbg !33 {
|
||||
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !34
|
||||
%R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
|
||||
%R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
|
||||
call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !34
|
||||
|
||||
%C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34
|
||||
%D.matrix = load <12 x double>, <12 x double>* %D, !dbg !34
|
||||
%Mult.matrix = call <4 x double> @llvm.matrix.multiply(<12 x double> %C.matrix, <12 x double> %D.matrix, i32 2, i32 6, i32 2), !dbg !34
|
||||
store <4 x double> %Mult.matrix, <4 x double>* %E, !dbg !34
|
||||
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: remark: test.h:100:20: Lowered matrix expression
|
||||
; CHECK-NEXT: columnwise.store.3x3.double(
|
||||
; CHECK-NEXT: fmul(
|
||||
; CHECK-NEXT: fadd(
|
||||
; CHECK-NEXT: columnwise.load.3x3.double(addr %A, 5)
|
||||
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
|
||||
; CHECK-NEXT: (reused) columnwise.load.3x3.double(addr %A, 5)),
|
||||
; CHECK-NEXT: stack addr %B,
|
||||
; CHECK-NEXT: 10)
|
||||
define void @stackaddresses(<9 x double>* %A) !dbg !35 {
|
||||
%B = alloca <9 x double>
|
||||
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !36
|
||||
%R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
|
||||
%R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
|
||||
call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !36
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: remark: test.h:30:20: Lowered matrix expression
|
||||
; CHECK-NEXT: store(
|
||||
; CHECK-NEXT: transpose.5x3.double(load(addr %A)),
|
||||
; CHECK-NEXT: stack addr %s1)
|
||||
%S1 = type {<15 x double>*}
|
||||
define void @get_underlying_object(%S1* %A) !dbg !21 {
|
||||
entry:
|
||||
%s1 = alloca <15 x double>, !dbg !22
|
||||
%a1 = getelementptr %S1, %S1* %A, i32 0, i32 0, !dbg !22
|
||||
%a2 = load <15 x double>*, <15 x double>** %a1, !dbg !22
|
||||
%av = load <15 x double>, <15 x double>* %a2, !dbg !22
|
||||
|
||||
%s2 = bitcast <15 x double>* %s1 to i64*, !dbg !22
|
||||
%s3 = bitcast i64* %s2 to <15 x double>*, !dbg !22
|
||||
|
||||
%t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3)
|
||||
|
||||
store <15 x double> %t, <15 x double>* %s3, !dbg !22
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double>, i32, i32)
|
||||
|
||||
!llvm.dbg.cu = !{!0}
|
||||
!llvm.module.flags = !{!3, !4}
|
||||
|
||||
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
|
||||
!1 = !DIFile(filename: "test.h", directory: "/test")
|
||||
!2 = !{}
|
||||
!3 = !{i32 2, !"Dwarf Version", i32 4}
|
||||
!4 = !{i32 2, !"Debug Info Version", i32 3}
|
||||
|
||||
!6 = !DISubroutineType(types: !7)
|
||||
!7 = !{null, !8, !8, !11}
|
||||
!8 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
|
||||
!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32)
|
||||
!10 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
|
||||
!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
|
||||
!12 = !{!13}
|
||||
!13 = !DILocalVariable(name: "a", arg: 1, scope: !5, file: !1, line: 1, type: !8)
|
||||
!14 = !DILocation(line: 1, column: 27, scope: !5)
|
||||
|
||||
!5 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
!19 = !DILocation(line: 10, column: 20, scope: !5)
|
||||
!20 = !DILocation(line: 10, column: 10, scope: !5)
|
||||
|
||||
!21 = distinct !DISubprogram(name: "fn2", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
!22 = !DILocation(line: 30, column: 20, scope: !21)
|
||||
|
||||
!23 = distinct !DISubprogram(name: "fn3", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
!24 = !DILocation(line: 40, column: 20, scope: !23)
|
||||
|
||||
!25 = distinct !DISubprogram(name: "fn4", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
!26 = !DILocation(line: 50, column: 20, scope: !25)
|
||||
|
||||
!27 = distinct !DISubprogram(name: "fn5", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
!28 = !DILocation(line: 60, column: 20, scope: !27)
|
||||
|
||||
!29 = distinct !DISubprogram(name: "fn6", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
!30 = !DILocation(line: 70, column: 20, scope: !29)
|
||||
|
||||
!31 = distinct !DISubprogram(name: "fn7", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
!32 = !DILocation(line: 80, column: 20, scope: !31)
|
||||
|
||||
!33 = distinct !DISubprogram(name: "fn8", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
!34 = !DILocation(line: 90, column: 20, scope: !33)
|
||||
|
||||
!35 = distinct !DISubprogram(name: "fn9", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
!36 = !DILocation(line: 100, column: 20, scope: !35)
|
Loading…
Reference in New Issue