forked from OSchip/llvm-project
[Matrix] Add remark propagation along the inlined-at chain.
This patch adds support for propagating matrix expressions along the inlined-at chain and emitting remarks at the traversed function scopes. To motivate this new behavior, consider the example below. Without the remark 'up-leveling', we would only get remarks in load.h and store.h, but we cannot generate a remark describing the full expression in toplevel.cpp, which is the place where the user has the best chance of spotting/fixing potential problems. With this patch, we generate a remark for the load in load.h, one for the store in store.h and one for the complete expression in toplevel.cpp. For a bigger example, please see remarks-inlining.ll. load.h: template <typename Ty, unsigned R, unsigned C> Matrix<Ty, R, C> load(Ty *Ptr) { Matrix<Ty, R, C> Result; Result.value = *reinterpret_cast <typename Matrix<Ty, R, C>::matrix_t *>(Ptr); return Result; } store.h: template <typename Ty, unsigned R, unsigned C> void store(Matrix<Ty, R, C> M1, Ty *Ptr) { *reinterpret_cast<typename decltype(M1)::matrix_t *>(Ptr) = M1.value; } toplevel.cpp: void test(double *A, double *B, double *C) { store(add(load<double, 3, 5>(A), load<double, 3, 5>(B)), C); } For a given function, we traverse the inlined-at chain for each matrix instruction (= instructions with shape information). We collect the matrix instructions in each DISubprogram we visit. This produces a mapping of DISubprogram -> (List of matrix instructions visible in the subprogram). We then generate remarks using the list of instructions for each subprogram in the inlined-at chain. Note that the list of instructions for a subprogram includes the instructions from its own subprograms recursively. For example, using the example above, for the subprogram 'test' this includes inline functions 'load' and 'store'. This allows surfacing the remarks at a level useful to users. Please note that the current approach may create a lot of extra remarks. Additional heuristics to cut-off the traversal can be implemented in the future. 
For example, it might make sense to stop 'up-leveling' once all matrix instructions are at the same debug location. Reviewers: anemet, Gerolf, thegameg, hfinkel, andrew.w.kaylor, LuoYuanke Reviewed By: anemet Differential Revision: https://reviews.llvm.org/D73600
This commit is contained in:
parent
0d7c8c07d2
commit
bc6c8c4bbb
|
@ -10,8 +10,6 @@
|
|||
//
|
||||
// TODO:
|
||||
// * Implement multiply & add fusion
|
||||
// * Add remark, summarizing the available matrix optimization opportunities
|
||||
// (WIP).
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
|
@ -25,6 +23,7 @@
|
|||
#include "llvm/Analysis/VectorUtils.h"
|
||||
#include "llvm/IR/CFG.h"
|
||||
#include "llvm/IR/DataLayout.h"
|
||||
#include "llvm/IR/DebugInfoMetadata.h"
|
||||
#include "llvm/IR/Function.h"
|
||||
#include "llvm/IR/IRBuilder.h"
|
||||
#include "llvm/IR/Instructions.h"
|
||||
|
@ -50,6 +49,14 @@ static cl::opt<bool> AllowContractEnabled(
|
|||
cl::desc("Allow the use of FMAs if available and profitable. This may "
|
||||
"result in different results, due to less rounding error."));
|
||||
|
||||
/// Helper function mapping \p Scope to a DISubprogram: returns \p Scope itself
|
||||
/// if it is a DISubprogram, otherwise the subprogram attached to the local scope.
|
||||
static DISubprogram *getSubprogram(DIScope *Scope) {
|
||||
// The scope already is a subprogram; return it directly.
if (auto *Subprogram = dyn_cast<DISubprogram>(Scope))
|
||||
return Subprogram;
|
||||
// Otherwise \p Scope is treated as a DILocalScope (via cast<>) and queried
// for the subprogram it belongs to.
return cast<DILocalScope>(Scope)->getSubprogram();
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
|
||||
|
@ -574,7 +581,7 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, DL);
|
||||
RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, Func);
|
||||
RemarkGen.emitRemarks();
|
||||
|
||||
for (Instruction *Inst : reverse(ToRemove))
|
||||
|
@ -950,6 +957,9 @@ public:
|
|||
/// part of.
|
||||
const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;
|
||||
|
||||
/// Set of matrix expressions in the scope of a given DISubprogram.
|
||||
const SmallSetVector<Value *, 32> &ExprsInSubprogram;
|
||||
|
||||
/// Leaf node of the expression to linearize.
|
||||
Value *Leaf;
|
||||
|
||||
|
@ -960,9 +970,10 @@ public:
|
|||
ExprLinearizer(const DataLayout &DL,
|
||||
const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix,
|
||||
const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
|
||||
const SmallSetVector<Value *, 32> &ExprsInSubprogram,
|
||||
Value *Leaf)
|
||||
: Str(), Stream(Str), DL(DL), Inst2ColumnMatrix(Inst2ColumnMatrix),
|
||||
Shared(Shared), Leaf(Leaf) {}
|
||||
Shared(Shared), ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}
|
||||
|
||||
void indent(unsigned N) {
|
||||
LineLength += N;
|
||||
|
@ -996,10 +1007,8 @@ public:
|
|||
return V;
|
||||
}
|
||||
|
||||
/// Returns true if \p V is a matrix value.
|
||||
bool isMatrix(Value *V) const {
|
||||
return Inst2ColumnMatrix.find(V) != Inst2ColumnMatrix.end();
|
||||
}
|
||||
/// Returns true if \p V is a matrix value in the given subprogram.
|
||||
bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); }
|
||||
|
||||
/// If \p V is a matrix value, print its shape as NumRows x NumColumns to
|
||||
/// \p SS.
|
||||
|
@ -1191,60 +1200,69 @@ public:
|
|||
|
||||
/// Generate remarks for matrix operations in a function. To generate remarks
|
||||
/// for matrix expressions, the following approach is used:
|
||||
/// 1. Collect leafs of matrix expressions (done in
|
||||
/// RemarkGenerator::getExpressionLeaves). Leaves are lowered matrix
|
||||
/// instructions without other matrix users (like stores).
|
||||
///
|
||||
/// 2. For each leaf, create a remark containing a linearizied version of the
|
||||
/// matrix expression.
|
||||
///
|
||||
/// TODO:
|
||||
/// * Summarize number of vector instructions generated for each expression.
|
||||
/// * Propagate matrix remarks up the inlining chain.
|
||||
/// 1. Use the inlined-at debug information to group matrix operations to the
|
||||
/// DISubprograms they are contained in.
|
||||
/// 2. Collect leaves of matrix expressions (done in
|
||||
/// RemarkGenerator::getExpressionLeaves) for each subprogram - expression
|
||||
/// mapping. Leaves are lowered matrix instructions without other matrix
|
||||
/// users (like stores) in the current subprogram.
|
||||
/// 3. For each leaf, create a remark containing a linearized version of the
|
||||
/// matrix expression. The expression is linearized by a recursive
|
||||
/// bottom-up traversal of the matrix operands, starting at a leaf. Note
|
||||
/// that multiple leaves can share sub-expressions. Shared subexpressions
|
||||
/// are explicitly marked as shared().
|
||||
struct RemarkGenerator {
|
||||
const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix;
|
||||
OptimizationRemarkEmitter &ORE;
|
||||
Function &Func;
|
||||
const DataLayout &DL;
|
||||
|
||||
RemarkGenerator(const MapVector<Value *, ColumnMatrixTy> &Inst2ColumnMatrix,
|
||||
OptimizationRemarkEmitter &ORE, const DataLayout &DL)
|
||||
: Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), DL(DL) {}
|
||||
OptimizationRemarkEmitter &ORE, Function &Func)
|
||||
: Inst2ColumnMatrix(Inst2ColumnMatrix), ORE(ORE), Func(Func),
|
||||
DL(Func.getParent()->getDataLayout()) {}
|
||||
|
||||
/// Return all leafs of matrix expressions. Those are instructions in
|
||||
/// Inst2ColumnMatrix returing void. Currently that should only include
|
||||
/// stores.
|
||||
SmallVector<Value *, 4> getExpressionLeaves() {
|
||||
/// Return all leaves of the expressions in \p ExprsInSubprogram. Those are
|
||||
/// instructions in Inst2ColumnMatrix returning void or without any users in
|
||||
/// \p ExprsInSubprogram. Currently that should only include stores.
|
||||
SmallVector<Value *, 4>
|
||||
getExpressionLeaves(const SmallSetVector<Value *, 32> &ExprsInSubprogram) {
|
||||
SmallVector<Value *, 4> Leaves;
|
||||
for (auto &KV : Inst2ColumnMatrix)
|
||||
if (KV.first->getType()->isVoidTy())
|
||||
Leaves.push_back(KV.first);
|
||||
|
||||
for (auto *Expr : ExprsInSubprogram)
|
||||
if (Expr->getType()->isVoidTy() ||
|
||||
!any_of(Expr->users(), [&ExprsInSubprogram](User *U) {
|
||||
return ExprsInSubprogram.count(U);
|
||||
}))
|
||||
Leaves.push_back(Expr);
|
||||
return Leaves;
|
||||
}
|
||||
|
||||
/// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf
|
||||
/// to all visited expressions in \p Shared.
|
||||
/// to all visited expressions in \p Shared. Limit the matrix operations to
|
||||
/// the ones in \p ExprsInSubprogram.
|
||||
void collectSharedInfo(Value *Leaf, Value *V,
|
||||
const SmallSetVector<Value *, 32> &ExprsInSubprogram,
|
||||
DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
|
||||
|
||||
if (Inst2ColumnMatrix.find(V) == Inst2ColumnMatrix.end())
|
||||
if (!ExprsInSubprogram.count(V))
|
||||
return;
|
||||
|
||||
auto I = Shared.insert({V, {}});
|
||||
I.first->second.insert(Leaf);
|
||||
|
||||
for (Value *Op : cast<Instruction>(V)->operand_values())
|
||||
collectSharedInfo(Leaf, Op, Shared);
|
||||
collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);
|
||||
return;
|
||||
}
|
||||
|
||||
/// Calculate the number of exclusive and shared op counts for expression
|
||||
/// starting at \p V. Expressions used multiple times are counted once.
|
||||
/// Limit the matrix operations to the ones in \p ExprsInSubprogram.
|
||||
std::pair<OpInfoTy, OpInfoTy>
|
||||
sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
|
||||
DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
|
||||
auto CM = Inst2ColumnMatrix.find(Root);
|
||||
if (CM == Inst2ColumnMatrix.end())
|
||||
const SmallSetVector<Value *, 32> &ExprsInSubprogram,
|
||||
DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
|
||||
if (!ExprsInSubprogram.count(Root))
|
||||
return {};
|
||||
|
||||
// Already counted this expression. Stop.
|
||||
|
@ -1255,13 +1273,14 @@ public:
|
|||
OpInfoTy Count;
|
||||
|
||||
auto I = Shared.find(Root);
|
||||
auto CM = Inst2ColumnMatrix.find(Root);
|
||||
if (I->second.size() == 1)
|
||||
Count = CM->second.getOpInfo();
|
||||
else
|
||||
SharedCount = CM->second.getOpInfo();
|
||||
|
||||
for (Value *Op : cast<Instruction>(Root)->operand_values()) {
|
||||
auto C = sumOpInfos(Op, ReusedExprs, Shared);
|
||||
auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared);
|
||||
Count += C.first;
|
||||
SharedCount += C.second;
|
||||
}
|
||||
|
@ -1272,49 +1291,83 @@ public:
|
|||
if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
|
||||
return;
|
||||
|
||||
// Find leafs of matrix expressions.
|
||||
auto Leaves = getExpressionLeaves();
|
||||
|
||||
DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
|
||||
|
||||
for (Value *Leaf : Leaves)
|
||||
collectSharedInfo(Leaf, Leaf, Shared);
|
||||
|
||||
// Generate remarks for each leaf.
|
||||
for (auto *L : Leaves) {
|
||||
SmallPtrSet<Value *, 8> ReusedExprs;
|
||||
OpInfoTy Counts, SharedCounts;
|
||||
std::tie(Counts, SharedCounts) = sumOpInfos(L, ReusedExprs, Shared);
|
||||
|
||||
OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered",
|
||||
cast<Instruction>(L)->getDebugLoc(),
|
||||
cast<Instruction>(L)->getParent());
|
||||
|
||||
Rem << "Lowered with ";
|
||||
Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
|
||||
<< ore::NV("NumLoads", Counts.NumLoads) << " loads, "
|
||||
<< ore::NV("NumComputeOps", Counts.NumComputeOps) << " compute ops";
|
||||
|
||||
if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
|
||||
SharedCounts.NumComputeOps > 0) {
|
||||
Rem << ",\nadditionally "
|
||||
<< ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
|
||||
<< ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
|
||||
<< ore::NV("NumFPOps", SharedCounts.NumComputeOps)
|
||||
<< " compute ops"
|
||||
<< " are shared with other expressions";
|
||||
// Map matrix operations to their containing subprograms, by traversing
|
||||
// the inlinedAt chain. If the function does not have a DISubprogram, we
|
||||
// only map them to the containing function.
|
||||
MapVector<DISubprogram *, SmallVector<Value *, 8>> Subprog2Exprs;
|
||||
for (auto &KV : Inst2ColumnMatrix) {
|
||||
if (Func.getSubprogram()) {
|
||||
auto *I = cast<Instruction>(KV.first);
|
||||
DILocation *Context = I->getDebugLoc();
|
||||
while (Context) {
|
||||
auto I =
|
||||
Subprog2Exprs.insert({getSubprogram(Context->getScope()), {}});
|
||||
I.first->second.push_back(KV.first);
|
||||
Context = DebugLoc(Context).getInlinedAt();
|
||||
}
|
||||
} else {
|
||||
auto I = Subprog2Exprs.insert({nullptr, {}});
|
||||
I.first->second.push_back(KV.first);
|
||||
}
|
||||
}
|
||||
for (auto &KV : Subprog2Exprs) {
|
||||
SmallSetVector<Value *, 32> ExprsInSubprogram(KV.second.begin(),
|
||||
KV.second.end());
|
||||
auto Leaves = getExpressionLeaves(ExprsInSubprogram);
|
||||
|
||||
Rem << ("\n" + linearize(L, Shared, DL));
|
||||
ORE.emit(Rem);
|
||||
DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
|
||||
for (Value *Leaf : Leaves)
|
||||
collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared);
|
||||
|
||||
// Generate remarks for each leaf.
|
||||
for (auto *L : Leaves) {
|
||||
|
||||
DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();
|
||||
DILocation *Context = cast<Instruction>(L)->getDebugLoc();
|
||||
while (Context) {
|
||||
if (getSubprogram(Context->getScope()) == KV.first) {
|
||||
Loc = Context;
|
||||
break;
|
||||
}
|
||||
Context = DebugLoc(Context).getInlinedAt();
|
||||
}
|
||||
|
||||
SmallPtrSet<Value *, 8> ReusedExprs;
|
||||
OpInfoTy Counts, SharedCounts;
|
||||
std::tie(Counts, SharedCounts) =
|
||||
sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared);
|
||||
|
||||
OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,
|
||||
cast<Instruction>(L)->getParent());
|
||||
|
||||
Rem << "Lowered with ";
|
||||
Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
|
||||
<< ore::NV("NumLoads", Counts.NumLoads) << " loads, "
|
||||
<< ore::NV("NumComputeOps", Counts.NumComputeOps)
|
||||
<< " compute ops";
|
||||
|
||||
if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
|
||||
SharedCounts.NumComputeOps > 0) {
|
||||
Rem << ",\nadditionally "
|
||||
<< ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
|
||||
<< ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
|
||||
<< ore::NV("NumFPOps", SharedCounts.NumComputeOps)
|
||||
<< " compute ops"
|
||||
<< " are shared with other expressions";
|
||||
}
|
||||
|
||||
Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL));
|
||||
ORE.emit(Rem);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::string
|
||||
linearize(Value *L,
|
||||
const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
|
||||
const SmallSetVector<Value *, 32> &ExprsInSubprogram,
|
||||
const DataLayout &DL) {
|
||||
ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, L);
|
||||
ExprLinearizer Lin(DL, Inst2ColumnMatrix, Shared, ExprsInSubprogram, L);
|
||||
Lin.linearizeExpr(L, 0, false, false);
|
||||
return Lin.getResult();
|
||||
}
|
||||
|
|
|
@ -0,0 +1,166 @@
|
|||
; REQUIRES: aarch64-registered-target
|
||||
|
||||
; This test needs to be target specific due to the cost estimate in the output.
|
||||
|
||||
; RUN: opt -lower-matrix-intrinsics -pass-remarks=lower-matrix-intrinsics -mtriple=arm64-apple-iphoneos -S < %s 2>&1 | FileCheck %s
|
||||
|
||||
; Test the propagation of matrix expressions along the inlined-at chain. The IR
|
||||
; in the test roughly corresponds to the C++ code below, with the IR containing
|
||||
; references to a few more functions.
|
||||
|
||||
; matrix.h
|
||||
; template <typename Ty, unsigned R, unsigned C>
|
||||
; struct Matrix {
|
||||
; using matrix_t = Ty __attribute__((matrix_type(R, C)));
|
||||
;
|
||||
; matrix_t value;
|
||||
; };
|
||||
;
|
||||
; add.h:
|
||||
; template <typename Ty, unsigned R, unsigned C>
|
||||
; Matrix<Ty, R, C> add(Matrix<Ty, R, C> M1, Matrix<Ty, R, C> M2) {
|
||||
; Matrix<Ty, R, C> Result;
|
||||
; Result.value = __builtin_matrix_add(M1.value, M2.value);
|
||||
; return Result;
|
||||
; }
|
||||
;
|
||||
; load.h:
|
||||
; template <typename Ty, unsigned R, unsigned C>
|
||||
; Matrix<Ty, R, C> load(Ty *Ptr) {
|
||||
; Matrix<Ty, R, C> Result;
|
||||
; Result.value = *reinterpret_cast <typename Matrix<Ty, R, C>::matrix_t *>(Ptr);
|
||||
; return Result;
|
||||
; }
|
||||
;
|
||||
; store.h:
|
||||
; template <typename Ty, unsigned R, unsigned C>
|
||||
; void store(Matrix<Ty, R, C> M1, Ty *Ptr) {
|
||||
; *reinterpret_cast<typename decltype(M1)::matrix_t *>(Ptr) = M1.value;
|
||||
; }
|
||||
;
|
||||
; toplevel.cpp
|
||||
; void test(double *A, double *B, double *C) {
|
||||
; store(add(load<double, 3, 5>(A), load<double, 3, 5>(B)), C);
|
||||
; }
|
||||
;
|
||||
|
||||
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
|
||||
target triple = "aarch64-apple-ios"
|
||||
|
||||
; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops
|
||||
; CHECK-NEXT: load(addr %A)
|
||||
|
||||
; CHECK-LABEL: remark: load.h:41:43: Lowered with 0 stores, 10 loads, 0 compute ops
|
||||
; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)
|
||||
|
||||
; CHECK-LABEL: remark: load.h:41:11: Lowered with 0 stores, 1 loads, 0 compute ops
|
||||
; CHECK-NEXT: load(addr %D)
|
||||
|
||||
; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops
|
||||
; CHECK-NEXT: load(addr %A)
|
||||
|
||||
; CHECK-LABEL: remark: assign.h:32:43: Lowered with 0 stores, 10 loads, 0 compute ops
|
||||
; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)
|
||||
|
||||
; CHECK-LABEL: remark: toplevel.c:410:0: Lowered with 10 stores, 20 loads, 10 compute ops
|
||||
; CHECK-NEXT: store(
|
||||
; CHECK-NEXT: fadd(
|
||||
; CHECK-NEXT: load(addr %A),
|
||||
; CHECK-NEXT: columnwise.load.3x5.double(addr %B, 5)),
|
||||
; CHECK-NEXT: addr %C)
|
||||
|
||||
; CHECK-LABEL: remark: toplevel.c:510:0: Lowered with 1 stores, 1 loads, 8 compute ops
|
||||
; CHECK-NEXT: store(
|
||||
; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(load(addr %D))),
|
||||
; CHECK-NEXT: addr %D)
|
||||
|
||||
; CHECK-LABEL: remark: add.h:66:11: Lowered with 0 stores, 0 loads, 10 compute ops
|
||||
; CHECK-NEXT: fadd(
|
||||
; CHECK-NEXT: addr %A,
|
||||
; CHECK-NEXT: scalar)
|
||||
|
||||
; CHECK-LABEL: remark: store.h:10:11: Lowered with 10 stores, 0 loads, 0 compute ops
|
||||
; CHECK-NEXT: store(
|
||||
; CHECK-NEXT: scalar,
|
||||
; CHECK-NEXT: addr %C)
|
||||
|
||||
; CHECK-LABEL: remark: store.h:66:11: Lowered with 1 stores, 0 loads, 0 compute ops
|
||||
; CHECK-NEXT: store(
|
||||
; CHECK-NEXT: scalar,
|
||||
; CHECK-NEXT: addr %D)
|
||||
|
||||
; CHECK-LABEL: remark: transpose.h:13:11: Lowered with 0 stores, 0 loads, 8 compute ops
|
||||
; CHECK-NEXT: transpose.1x2.float(transpose.2x1.float(addr %D))
|
||||
|
||||
define void @toplevel(<15 x double>* %A, <15 x double>* %B, <15 x double>* %C, <2 x float>* %D) !dbg !16 {
|
||||
entry:
|
||||
%a = load <15 x double>, <15 x double> *%A, align 16, !dbg !3791
|
||||
%b = call <15 x double> @llvm.matrix.columnwise.load(<15 x double>* %B, i32 5, i32 3, i32 5), !dbg !3793
|
||||
%c = fadd <15 x double> %a, %b, !dbg !100
|
||||
store <15 x double> %c, <15 x double> *%C, align 16, !dbg !102
|
||||
|
||||
%load = load <2 x float>, <2 x float>* %D, !dbg !104
|
||||
%t1 = call <2 x float> @llvm.matrix.transpose(<2 x float> %load, i32 2, i32 1), !dbg !106
|
||||
%t2 = call <2 x float> @llvm.matrix.transpose(<2 x float> %t1, i32 1, i32 2), !dbg !106
|
||||
store <2 x float> %t2, <2 x float>* %D, !dbg !108
|
||||
ret void
|
||||
}
|
||||
|
||||
declare <15 x double> @llvm.matrix.columnwise.load(<15 x double>*, i32, i32, i32)
|
||||
declare <2 x float> @llvm.matrix.transpose(<2 x float>, i32, i32)
|
||||
|
||||
!llvm.dbg.cu = !{!0}
|
||||
!llvm.module.flags = !{!3, !4}
|
||||
|
||||
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
|
||||
!1 = !DIFile(filename: "load.h", directory: "/test")
|
||||
!2 = !{}
|
||||
!3 = !{i32 2, !"Dwarf Version", i32 4}
|
||||
!4 = !{i32 2, !"Debug Info Version", i32 3}
|
||||
!5 = distinct !DISubprogram(name: "load_fn", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
!17 = !DIFile(filename: "toplevel.c", directory: "/test")
|
||||
!16 = distinct !DISubprogram(name: "toplevel", scope: !1, file: !17, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
!18 = !DIFile(filename: "assign.h", directory: "/test")
|
||||
!19 = distinct !DISubprogram(name: "assign", scope: !1, file: !18, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
|
||||
!20 = !DIFile(filename: "add.h", directory: "/test")
|
||||
!21 = distinct !DISubprogram(name: "add_fn", scope: !1, file: !20, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
|
||||
!22 = !DIFile(filename: "store.h", directory: "/test")
|
||||
!23 = distinct !DISubprogram(name: "store_fn", scope: !1, file: !22, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
|
||||
!24 = !DIFile(filename: "transpose.h", directory: "/test")
|
||||
!25 = distinct !DISubprogram(name: "transpose", scope: !1, file: !24, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !12)
|
||||
|
||||
|
||||
!6 = !DISubroutineType(types: !7)
|
||||
!7 = !{null, !8, !8, !11}
|
||||
!8 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !9)
|
||||
!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 32, align: 32)
|
||||
!10 = !DIBasicType(name: "float", size: 32, align: 32, encoding: DW_ATE_float)
|
||||
!11 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
|
||||
!12 = !{!13}
|
||||
!13 = !DILocalVariable(name: "a", arg: 1, scope: !5, file: !1, line: 1, type: !8)
|
||||
!14 = !DILocation(line: 1, column: 27, scope: !5)
|
||||
|
||||
!3791 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3795)
|
||||
!3792 = !DILocation(line: 405, column: 3, scope: !16)
|
||||
!3793 = !DILocation(line: 41, column: 43, scope: !5, inlinedAt: !3796)
|
||||
!3794 = !DILocation(line: 406, column: 11, scope: !16)
|
||||
!3795 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3792)
|
||||
!3796 = !DILocation(line: 32, column: 43, scope: !19, inlinedAt: !3794)
|
||||
|
||||
!100 = !DILocation(line: 66, column: 11, scope: !21, inlinedAt: !101)
|
||||
!101 = !DILocation(line: 410, column: 11, scope: !16)
|
||||
|
||||
!102 = !DILocation(line: 10, column: 11, scope: !23, inlinedAt: !103)
|
||||
!103 = !DILocation(line: 410, column: 0, scope: !16)
|
||||
|
||||
!104 = !DILocation(line: 41, column: 11, scope: !5, inlinedAt: !101)
|
||||
!105 = !DILocation(line: 500, column: 11, scope: !16)
|
||||
|
||||
!106 = !DILocation(line: 13, column: 11, scope: !25, inlinedAt: !101)
|
||||
!107 = !DILocation(line: 510, column: 11, scope: !16)
|
||||
|
||||
!108 = !DILocation(line: 66, column: 11, scope: !23, inlinedAt: !109)
|
||||
!109 = !DILocation(line: 510, column: 0, scope: !16)
|
|
@ -71,8 +71,8 @@ declare void @llvm.matrix.columnwise.store(<9 x double>, <9 x double>*, i32, i32
|
|||
|
||||
define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 {
|
||||
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !32
|
||||
%R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
|
||||
%R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
|
||||
%R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !32
|
||||
%R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !32
|
||||
call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !32
|
||||
ret void
|
||||
}
|
||||
|
@ -95,8 +95,8 @@ define void @binaryops(<9 x double>* %A, <9 x double>* %B) !dbg !31 {
|
|||
|
||||
define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x double>* %C, <12 x double>* %D, <4 x double>* %E) !dbg !33 {
|
||||
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !34
|
||||
%R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
|
||||
%R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
|
||||
%R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !34
|
||||
%R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !34
|
||||
call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !34
|
||||
|
||||
%C.matrix = load <12 x double>, <12 x double>* %C, !dbg !34
|
||||
|
@ -119,8 +119,8 @@ define void @multiple_expressions(<9 x double>* %A, <9 x double>* %B, <12 x doub
|
|||
define void @stackaddresses(<9 x double>* %A) !dbg !35 {
|
||||
%B = alloca <9 x double>
|
||||
%A.matrix = call <9 x double> @llvm.matrix.columnwise.load(<9 x double>* %A, i32 5, i32 3, i32 3), !dbg !36
|
||||
%R1.matrix = fadd <9 x double> %A.matrix, %A.matrix
|
||||
%R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix
|
||||
%R1.matrix = fadd <9 x double> %A.matrix, %A.matrix, !dbg !36
|
||||
%R2.matrix = fmul <9 x double> %R1.matrix, %A.matrix, !dbg !36
|
||||
call void @llvm.matrix.columnwise.store(<9 x double> %R2.matrix, <9 x double>* %B, i32 10, i32 3, i32 3), !dbg !36
|
||||
ret void
|
||||
}
|
||||
|
@ -140,7 +140,7 @@ entry:
|
|||
%s2 = bitcast <15 x double>* %s1 to i64*, !dbg !22
|
||||
%s3 = bitcast i64* %s2 to <15 x double>*, !dbg !22
|
||||
|
||||
%t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3)
|
||||
%t = call <15 x double> @llvm.matrix.transpose.v15f64.v15f64(<15 x double> %av, i32 5, i32 3), !dbg !22
|
||||
|
||||
store <15 x double> %t, <15 x double>* %s3, !dbg !22
|
||||
ret void
|
||||
|
|
Loading…
Reference in New Issue