forked from OSchip/llvm-project
[mlir][NFC] Move SubTensorOp and SubTensorInsertOp to TensorDialect
The main goal of this commit is to remove the dependency of Standard dialect on the Tensor dialect. * Rename ops: SubTensorOp --> ExtractSliceOp, SubTensorInsertOp --> InsertSliceOp * Some helper functions are (already) duplicated between the Tensor dialect and the MemRef dialect. To keep this commit smaller, this will be cleaned up in a separate commit. * Additional dialect dependencies: Shape --> Tensor, Tensor --> Standard * Remove dialect dependencies: Standard --> Tensor * Move canonicalization test cases to correct dialect (Tensor/MemRef). Differential Revision: https://reviews.llvm.org/D104499
This commit is contained in:
parent
64b2676ca8
commit
83bf801f5f
|
@ -367,7 +367,8 @@ def ConvertShapeToStandard : Pass<"convert-shape-to-std", "ModuleOp"> {
|
|||
let dependentDialects = [
|
||||
"memref::MemRefDialect",
|
||||
"StandardOpsDialect",
|
||||
"scf::SCFDialect"
|
||||
"scf::SCFDialect",
|
||||
"tensor::TensorDialect"
|
||||
];
|
||||
}
|
||||
|
||||
|
@ -504,7 +505,7 @@ def TosaToSCF : Pass<"tosa-to-scf"> {
|
|||
|
||||
def TosaToStandard : Pass<"tosa-to-standard"> {
|
||||
let summary = "Lower TOSA to the Standard dialect";
|
||||
let dependentDialects = ["StandardOpsDialect"];
|
||||
let dependentDialects = ["StandardOpsDialect", "tensor::TensorDialect"];
|
||||
let description = [{
|
||||
Pass that converts TOSA operations to the equivalent operations using the
|
||||
operations in the Standard dialect.
|
||||
|
|
|
@ -579,12 +579,11 @@ def Linalg_TiledLoopOp : Linalg_Op<"tiled_loop", [
|
|||
|
||||
Tensor-based version:
|
||||
|
||||
The body region of the loop contains `subtensor` operations applied to
|
||||
The body region of the loop contains `extract_slice` operations applied to
|
||||
every tensor argument of TiledLoopOp.
|
||||
|
||||
The body region must contain exactly one block that terminates with
|
||||
`linalg.yield` with the operands resulting from `subtensor_insert`
|
||||
operations.
|
||||
`linalg.yield` with the operands resulting from `insert_slice` operations.
|
||||
|
||||
Example:
|
||||
|
||||
|
@ -594,16 +593,16 @@ def Linalg_TiledLoopOp : Linalg_Op<"tiled_loop", [
|
|||
outs(%out : tensor<24x64xi8>)
|
||||
iterators("parallel")
|
||||
distribution("block_x") {
|
||||
%lhs_sub = subtensor %lhs[%i, 0] [%c4, %c64] [1, 1]
|
||||
%lhs_sub = tensor.extract_slice %lhs[%i, 0] [%c4, %c64] [1, 1]
|
||||
: tensor<24x64xi8> to tensor<?x?xi8>
|
||||
%rhs_sub = subtensor %rhs[%i, 0] [%c4, %c64] [1, 1]
|
||||
%rhs_sub = tensor.extract_slice %rhs[%i, 0] [%c4, %c64] [1, 1]
|
||||
: tensor<24x64xi8> to tensor<?x?xi8>
|
||||
%out_sub = subtensor %out[%i, 0] [%c4, %c64] [1, 1]
|
||||
%out_sub = tensor.extract_slice %out[%i, 0] [%c4, %c64] [1, 1]
|
||||
: tensor<24x64xi8> to tensor<?x?xi8>
|
||||
|
||||
%result_sub = linalg.generic ...
|
||||
|
||||
%result = subtensor_insert %result_sub into %out[%i, 0][%c4, %c64][1, 1]
|
||||
%result = tensor.insert_slice %result_sub into %out[%i, 0][%c4, %c64][1, 1]
|
||||
: tensor<?x?xi8> into tensor<24x64xi8>
|
||||
linalg.yield %result : tensor<24x64xi8>
|
||||
}
|
||||
|
|
|
@ -47,7 +47,7 @@ void hoistRedundantVectorTransfersOnTensor(FuncOp func);
|
|||
/// If hoistPaddingOnTensors is called with `nLoops` = 2 on the following IR.
|
||||
/// ```
|
||||
/// scf.for (%i, %j, %k)
|
||||
/// %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
|
||||
/// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32>
|
||||
/// %0 = linalg.pad_tensor %st0 low[0, 0] high[...] {
|
||||
/// ^bb0( ... ):
|
||||
/// linalg.yield %pad
|
||||
|
@ -61,16 +61,17 @@ void hoistRedundantVectorTransfersOnTensor(FuncOp func);
|
|||
/// scf.for (%i) {
|
||||
/// %packed_init = linalg.init_tensor range(%j) : tensor<?x4x8xf32>
|
||||
/// %packed = scf.for (%k) iter_args(%p : %packed_init) {
|
||||
/// %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
|
||||
/// %st0 = tensor.extract_slice f(%i, %k) : ... to tensor<?x?xf32>
|
||||
/// %0 = linalg.pad_tensor %st0 low[0, 0] high[...] {
|
||||
/// ^bb0( ... ):
|
||||
/// linalg.yield %pad
|
||||
/// } : tensor<?x?xf32> to tensor<4x8xf32>
|
||||
/// %1 = subtensor_insert %0 ... : tensor<4x8xf32> to tensor<?x4x8xf32>
|
||||
/// %1 = tensor.insert_slice %0 ...
|
||||
/// : tensor<4x8xf32> to tensor<?x4x8xf32>
|
||||
/// scf.yield %1: tensor<?x4x8xf32>
|
||||
/// } -> tensor<?x4x8xf32>
|
||||
/// scf.for (%j, %k) {
|
||||
/// %st0 = subtensor %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
|
||||
/// %st0 = tensor.extract_slice %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
|
||||
/// tensor<?x4x8xf32> to tensor<4x8xf32>
|
||||
/// compute(%st0)
|
||||
/// }
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
#include "mlir/Dialect/Linalg/Utils/Utils.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/Dialect/SCF/Utils.h"
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Vector/VectorOps.h"
|
||||
#include "mlir/IR/Identifier.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
|
@ -1077,12 +1078,12 @@ LogicalResult applyStagedPatterns(
|
|||
const FrozenRewritePatternSet &stage2Patterns,
|
||||
function_ref<LogicalResult(Operation *)> stage3Lambda = nullptr);
|
||||
|
||||
/// Rewrite subtensor(pad_tensor(x)) into pad_tensor(subtensor(x)).
|
||||
struct SubTensorOfPadTensorSwapPattern
|
||||
: public OpRewritePattern<SubTensorOp> {
|
||||
using OpRewritePattern<SubTensorOp>::OpRewritePattern;
|
||||
/// Rewrite extract_slice(pad_tensor(x)) into pad_tensor(extract_slice(x)).
|
||||
struct ExtractSliceOfPadTensorSwapPattern
|
||||
: public OpRewritePattern<tensor::ExtractSliceOp> {
|
||||
using OpRewritePattern<tensor::ExtractSliceOp>::OpRewritePattern;
|
||||
|
||||
LogicalResult matchAndRewrite(SubTensorOp subTensorOp,
|
||||
LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
|
||||
PatternRewriter &rewriter) const override;
|
||||
};
|
||||
|
||||
|
|
|
@ -78,7 +78,7 @@ bool isProducerLastWriteOfView(const LinalgDependenceGraph &graph,
|
|||
bool isFusableInto(const LinalgDependenceGraph &graph, LinalgOp consumer,
|
||||
Value consumedView, LinalgOp producer);
|
||||
|
||||
/// Creates subtensor/subview ops for all `tiledOperands` of the given
|
||||
/// Creates extract_slice/subview ops for all `tiledOperands` of the given
|
||||
/// `linalgOp` with `builder`, assuming `linalgOp` is being fused into a loop
|
||||
/// nest for tiling with the given induction variables `ivs` and tile sizes
|
||||
/// `tileSizes`. `sizeBounds` are the iteration space bounds for *all* the
|
||||
|
@ -118,15 +118,17 @@ Optional<FusionInfo> fuseProducerOfBuffer(OpBuilder &b,
|
|||
const LinalgDependenceGraph &graph);
|
||||
/// Tensor counterpart of `fuseProducerOfBuffer`.
|
||||
/// This implements the fusion part of the "tileAndFuse on tensors"
|
||||
/// transformation and thus requires the `consumerOpOperand` to be a `subtensor`
|
||||
/// op (generally obtained by applying the tiling transformation).
|
||||
/// transformation and thus requires the `consumerOpOperand` to be a
|
||||
/// `extract_slice` op (generally obtained by applying the tiling
|
||||
/// transformation).
|
||||
Optional<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
|
||||
OpOperand &consumerOpOperand);
|
||||
/// Tensor counterpart of `fuseProducerOfBuffer`.
|
||||
/// This implements the fusion part of the "tileAndFuse on tensors"
|
||||
/// transformation and thus requires the `consumerOpOperand` to be a `subtensor`
|
||||
/// op (generally obtained by applying the tiling transformation).
|
||||
/// Assumes `producerOfTensor` is a Linalg op that produces `consumerOpOperand`.
|
||||
/// transformation and thus requires the `consumerOpOperand` to be a
|
||||
/// `extract_slice` op (generally obtained by applying the tiling
|
||||
/// transformation). Assumes `producerOfTensor` is a Linalg op that produces
|
||||
/// `consumerOpOperand`.
|
||||
Optional<FusionInfo> fuseProducerOfTensor(OpBuilder &b,
|
||||
OpResult producerOpResult,
|
||||
OpOperand &consumerOpOperand);
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#ifndef MLIR_SHAPE_IR_SHAPE_H
|
||||
#define MLIR_SHAPE_IR_SHAPE_H
|
||||
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/IR/BuiltinOps.h"
|
||||
#include "mlir/IR/Dialect.h"
|
||||
#include "mlir/IR/OpDefinition.h"
|
||||
|
|
|
@ -35,6 +35,7 @@ def ShapeDialect : Dialect {
|
|||
}];
|
||||
|
||||
let cppNamespace = "::mlir::shape";
|
||||
let dependentDialects = ["tensor::TensorDialect"];
|
||||
|
||||
let hasConstantMaterializer = 1;
|
||||
let hasOperationAttrVerify = 1;
|
||||
|
|
|
@ -23,7 +23,6 @@
|
|||
#include "mlir/Interfaces/ControlFlowInterfaces.h"
|
||||
#include "mlir/Interfaces/SideEffectInterfaces.h"
|
||||
#include "mlir/Interfaces/VectorInterfaces.h"
|
||||
#include "mlir/Interfaces/ViewLikeInterface.h"
|
||||
|
||||
// Pull in all enum type definitions and utility function declarations.
|
||||
#include "mlir/Dialect/StandardOps/IR/OpsEnums.h.inc"
|
||||
|
@ -34,12 +33,6 @@ class Builder;
|
|||
class FuncOp;
|
||||
class OpBuilder;
|
||||
class PatternRewriter;
|
||||
|
||||
/// Return the list of Range (i.e. offset, size, stride). Each Range
|
||||
/// entry contains either the dynamic value or a ConstantIndexOp constructed
|
||||
/// with `b` at location `loc`.
|
||||
SmallVector<Range, 8> getOrCreateRanges(OffsetSizeAndStrideOpInterface op,
|
||||
OpBuilder &b, Location loc);
|
||||
} // namespace mlir
|
||||
|
||||
#define GET_OP_CLASSES
|
||||
|
|
|
@ -21,7 +21,6 @@ include "mlir/Interfaces/CastInterfaces.td"
|
|||
include "mlir/Interfaces/ControlFlowInterfaces.td"
|
||||
include "mlir/Interfaces/SideEffectInterfaces.td"
|
||||
include "mlir/Interfaces/VectorInterfaces.td"
|
||||
include "mlir/Interfaces/ViewLikeInterface.td"
|
||||
|
||||
def StandardOps_Dialect : Dialect {
|
||||
let name = "std";
|
||||
|
@ -1754,245 +1753,6 @@ def SubIOp : IntBinaryOp<"subi"> {
|
|||
let hasCanonicalizer = 1;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SubTensorOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def SubTensorOp : BaseOpWithOffsetSizesAndStrides<
|
||||
StandardOps_Dialect, "subtensor", [NoSideEffect, AttrSizedOperandSegments,
|
||||
OffsetSizeAndStrideOpInterface]> {
|
||||
let summary = "subtensor operation";
|
||||
let description = [{
|
||||
The "subtensor" operation extract a tensor from another tensor as
|
||||
specified by the operation's offsets, sizes and strides arguments.
|
||||
|
||||
The subtensor operation supports the following arguments:
|
||||
|
||||
* source: the "base" tensor from which to extract a subtensor.
|
||||
* offsets: tensor-rank number of offsets into the "base" tensor from which
|
||||
to extract the subtensor.
|
||||
* sizes: tensor-rank number of sizes which specify the sizes of the result
|
||||
tensor type.
|
||||
* strides: tensor-rank number of strides specifying subsampling in each
|
||||
dimension.
|
||||
|
||||
The representation based on offsets, sizes and strides support a
|
||||
partially-static specification via attributes specified through the
|
||||
`static_offsets`, `static_sizes` and `static_strides` arguments. A special
|
||||
sentinel value ShapedType::kDynamicSize and
|
||||
ShapedType::kDynamicStrideOrOffset encodes that the corresponding entry has
|
||||
a dynamic value.
|
||||
|
||||
After buffer-allocation, the "subtensor" op is expected to lower into a
|
||||
"subview" op.
|
||||
|
||||
A subtensor operation may additionally reduce the rank of the resulting
|
||||
tensor by removing dimensions that are statically known to be of size 1.
|
||||
|
||||
Example:
|
||||
|
||||
```
|
||||
// Rank-reducing subtensor.
|
||||
%1 = subtensor %0[0, 0, 0][1, 16, 4][1, 1, 1] :
|
||||
tensor<8x16x4xf32> to tensor<16x4xf32>
|
||||
%3 = subtensor %2[3, 4, 2][1, 6, 3][1, 1, 1] :
|
||||
tensor<8x16x4xf32> to tensor<6x3xf32>
|
||||
```
|
||||
}];
|
||||
|
||||
let arguments = (ins
|
||||
AnyRankedTensor:$source,
|
||||
Variadic<Index>:$offsets,
|
||||
Variadic<Index>:$sizes,
|
||||
Variadic<Index>:$strides,
|
||||
I64ArrayAttr:$static_offsets,
|
||||
I64ArrayAttr:$static_sizes,
|
||||
I64ArrayAttr:$static_strides
|
||||
);
|
||||
let results = (outs AnyRankedTensor:$result);
|
||||
|
||||
let assemblyFormat = [{
|
||||
$source ``
|
||||
custom<OperandsOrIntegersOffsetsOrStridesList>($offsets, $static_offsets)
|
||||
custom<OperandsOrIntegersSizesList>($sizes, $static_sizes)
|
||||
custom<OperandsOrIntegersOffsetsOrStridesList>($strides, $static_strides)
|
||||
attr-dict `:` type($source) `to` type($result)
|
||||
}];
|
||||
|
||||
let builders = [
|
||||
// Build a SubTensorOp with mixed static and dynamic entries and inferred
|
||||
// result type.
|
||||
OpBuilder<(ins "Value":$source, "ArrayRef<OpFoldResult>":$offsets,
|
||||
"ArrayRef<OpFoldResult>":$sizes, "ArrayRef<OpFoldResult>":$strides,
|
||||
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
|
||||
// Build a SubTensorOp with mixed static and dynamic entries and custom
|
||||
// result type. If the type passed is nullptr, it is inferred.
|
||||
OpBuilder<(ins "RankedTensorType":$resultType, "Value":$source,
|
||||
"ArrayRef<OpFoldResult>":$offsets, "ArrayRef<OpFoldResult>":$sizes,
|
||||
"ArrayRef<OpFoldResult>":$strides,
|
||||
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
|
||||
// Build a SubTensorOp with dynamic entries and custom result type. If the
|
||||
// type passed is nullptr, it is inferred.
|
||||
OpBuilder<(ins "Value":$source, "ValueRange":$offsets,
|
||||
"ValueRange":$sizes, "ValueRange":$strides,
|
||||
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
|
||||
// Build a SubTensorOp with dynamic entries and inferred result type.
|
||||
OpBuilder<(ins "RankedTensorType":$resultType, "Value":$source,
|
||||
"ValueRange":$offsets, "ValueRange":$sizes, "ValueRange":$strides,
|
||||
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
|
||||
];
|
||||
|
||||
let extraClassDeclaration = extraBaseClassDeclaration # [{
|
||||
/// Returns the type of the base tensor operand.
|
||||
RankedTensorType getSourceType() {
|
||||
return source().getType().cast<RankedTensorType>();
|
||||
}
|
||||
|
||||
/// The result of a subtensor is always a tensor.
|
||||
RankedTensorType getType() {
|
||||
return getResult().getType().cast<RankedTensorType>();
|
||||
}
|
||||
|
||||
/// A subtensor result type can be fully inferred from the source type and
|
||||
/// the static representation of offsets, sizes and strides. Special
|
||||
/// sentinels encode the dynamic case.
|
||||
static Type inferResultType(RankedTensorType sourceRankedTensorType,
|
||||
ArrayRef<int64_t> staticOffsets,
|
||||
ArrayRef<int64_t> staticSizes,
|
||||
ArrayRef<int64_t> staticStrides);
|
||||
static Type inferResultType(RankedTensorType sourceRankedTensorType,
|
||||
ArrayRef<OpFoldResult> staticOffsets,
|
||||
ArrayRef<OpFoldResult> staticSizes,
|
||||
ArrayRef<OpFoldResult> staticStrides);
|
||||
static Type inferRankReducedResultType(unsigned resultRank,
|
||||
RankedTensorType sourceRankedTensorType,
|
||||
ArrayRef<int64_t> staticOffsets,
|
||||
ArrayRef<int64_t> staticSizes,
|
||||
ArrayRef<int64_t> staticStrides);
|
||||
static Type inferRankReducedResultType(unsigned resultRank,
|
||||
RankedTensorType sourceRankedTensorType,
|
||||
ArrayRef<OpFoldResult> staticOffsets,
|
||||
ArrayRef<OpFoldResult> staticSizes,
|
||||
ArrayRef<OpFoldResult> staticStrides);
|
||||
|
||||
/// Return the expected rank of each of the`static_offsets`, `static_sizes`
|
||||
/// and `static_strides` attributes.
|
||||
std::array<unsigned, 3> getArrayAttrMaxRanks() {
|
||||
unsigned rank = getSourceType().getRank();
|
||||
return {rank, rank, rank};
|
||||
}
|
||||
|
||||
/// Return the number of leading operands before the `offsets`, `sizes` and
|
||||
/// and `strides` operands.
|
||||
static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
|
||||
}];
|
||||
|
||||
let hasCanonicalizer = 1;
|
||||
let hasFolder = 1;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SubTensorInsertOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def SubTensorInsertOp : BaseOpWithOffsetSizesAndStrides<
|
||||
StandardOps_Dialect, "subtensor_insert",
|
||||
[NoSideEffect, AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface,
|
||||
TypesMatchWith<"expected result type to match dest type",
|
||||
"dest", "result", "$_self">]> {
|
||||
let summary = "subtensor_insert operation";
|
||||
let description = [{
|
||||
The "subtensor_insert" operation insert a tensor `source` into another
|
||||
tensor `dest` as specified by the operation's offsets, sizes and strides
|
||||
arguments.
|
||||
|
||||
It returns a copy of `dest` with the proper subtensor updated with the value
|
||||
of `source`.
|
||||
|
||||
The subtensor_insert operation has the encodes the following information:
|
||||
|
||||
* source: the tensor that is inserted.
|
||||
* dest: the tensor into which the source tensor is inserted.
|
||||
* offsets: tensor-rank number of offsets into the "base" tensor from which
|
||||
to extract the subtensor.
|
||||
* sizes: tensor-rank number of sizes which specify the sizes of the result
|
||||
tensor type.
|
||||
* strides: tensor-rank number of strides that specify subsampling in each
|
||||
dimension.
|
||||
|
||||
The representation based on offsets, sizes and strides support a
|
||||
partially-static specification via attributes specified through the
|
||||
`static_offsets`, `static_sizes` and `static_strides` arguments. A special
|
||||
sentinel value ShapedType::kDynamicSize and
|
||||
ShapedType::kDynamicStrideOrOffset encodes that the corresponding entry has
|
||||
a dynamic value.
|
||||
|
||||
After buffer-allocation, the "subtensor_insert" op is expected to become
|
||||
an in-place buffer update.
|
||||
}];
|
||||
|
||||
let arguments = (ins
|
||||
AnyRankedTensor:$source,
|
||||
AnyRankedTensor:$dest,
|
||||
Variadic<Index>:$offsets,
|
||||
Variadic<Index>:$sizes,
|
||||
Variadic<Index>:$strides,
|
||||
I64ArrayAttr:$static_offsets,
|
||||
I64ArrayAttr:$static_sizes,
|
||||
I64ArrayAttr:$static_strides
|
||||
);
|
||||
let results = (outs AnyRankedTensor:$result);
|
||||
|
||||
let assemblyFormat = [{
|
||||
$source `into` $dest ``
|
||||
custom<OperandsOrIntegersOffsetsOrStridesList>($offsets, $static_offsets)
|
||||
custom<OperandsOrIntegersSizesList>($sizes, $static_sizes)
|
||||
custom<OperandsOrIntegersOffsetsOrStridesList>($strides, $static_strides)
|
||||
attr-dict `:` type($source) `into` type($dest)
|
||||
}];
|
||||
|
||||
let verifier = ?;
|
||||
|
||||
let builders = [
|
||||
// Build a SubTensorInsertOp with mixed static and dynamic entries.
|
||||
OpBuilder<(ins "Value":$source, "Value":$dest,
|
||||
"ArrayRef<OpFoldResult>":$offsets, "ArrayRef<OpFoldResult>":$sizes,
|
||||
"ArrayRef<OpFoldResult>":$strides,
|
||||
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
|
||||
// Build a SubTensorInsertOp with dynamic entries.
|
||||
OpBuilder<(ins "Value":$source, "Value":$dest,
|
||||
"ValueRange":$offsets, "ValueRange":$sizes, "ValueRange":$strides,
|
||||
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
|
||||
];
|
||||
|
||||
let extraClassDeclaration = extraBaseClassDeclaration # [{
|
||||
/// Returns the type of the base tensor operand.
|
||||
RankedTensorType getSourceType() {
|
||||
return source().getType().cast<RankedTensorType>();
|
||||
}
|
||||
|
||||
/// The result of a subtensor_insert is always a tensor.
|
||||
RankedTensorType getType() {
|
||||
return getResult().getType().cast<RankedTensorType>();
|
||||
}
|
||||
|
||||
/// Return the expected rank of each of the`static_offsets`, `static_sizes`
|
||||
/// and `static_strides` attributes.
|
||||
std::array<unsigned, 3> getArrayAttrMaxRanks() {
|
||||
unsigned rank = getType().getRank();
|
||||
return {rank, rank, rank};
|
||||
}
|
||||
|
||||
/// Return the number of leading operands before the `offsets`, `sizes` and
|
||||
/// and `strides` operands.
|
||||
static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 2; }
|
||||
}];
|
||||
|
||||
let hasCanonicalizer = 1;
|
||||
let hasFolder = 1;
|
||||
}
|
||||
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SwitchOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -16,6 +16,21 @@
|
|||
#include "mlir/Interfaces/CastInterfaces.h"
|
||||
#include "mlir/Interfaces/ControlFlowInterfaces.h"
|
||||
#include "mlir/Interfaces/SideEffectInterfaces.h"
|
||||
#include "mlir/Interfaces/ViewLikeInterface.h"
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Tensor Dialect Helpers
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
namespace mlir {
|
||||
|
||||
/// Return the list of Range (i.e. offset, size, stride). Each Range
|
||||
/// entry contains either the dynamic value or a ConstantIndexOp constructed
|
||||
/// with `b` at location `loc`.
|
||||
SmallVector<Range, 8> getOrCreateRanges(OffsetSizeAndStrideOpInterface op,
|
||||
OpBuilder &b, Location loc);
|
||||
|
||||
} // namespace mlir
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Tensor Dialect
|
||||
|
@ -41,8 +56,8 @@ namespace tensor {
|
|||
/// source tensor. This is useful to fold a tensor.cast into a consuming op and
|
||||
/// implement canonicalization patterns for ops in different dialects that may
|
||||
/// consume the results of tensor.cast operations. Such foldable tensor.cast
|
||||
/// operations are typically inserted as `subtensor` ops and are canonicalized,
|
||||
/// to preserve the type compatibility of their uses.
|
||||
/// operations are typically inserted as `extract_slice` ops and are
|
||||
/// canonicalized, to preserve the type compatibility of their uses.
|
||||
///
|
||||
/// Returns true when all conditions are met:
|
||||
/// 1. source and result are ranked tensors with same element type and rank.
|
||||
|
@ -64,7 +79,6 @@ bool canFoldIntoConsumerOp(CastOp castOp);
|
|||
/// Performs folding of any operand of `op` if it comes from a tensor::CastOp
|
||||
/// that can be folded.
|
||||
LogicalResult foldTensorCast(Operation *op);
|
||||
|
||||
} // namespace tensor
|
||||
} // namespace mlir
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@ include "mlir/Dialect/Tensor/IR/TensorBase.td"
|
|||
include "mlir/Interfaces/CastInterfaces.td"
|
||||
include "mlir/Interfaces/ControlFlowInterfaces.td"
|
||||
include "mlir/Interfaces/SideEffectInterfaces.td"
|
||||
include "mlir/Interfaces/ViewLikeInterface.td"
|
||||
|
||||
class Tensor_Op<string mnemonic, list<OpTrait> traits = []>
|
||||
: Op<Tensor_Dialect, mnemonic, traits> {
|
||||
|
@ -99,6 +100,144 @@ def Tensor_ExtractOp : Tensor_Op<"extract",
|
|||
let hasFolder = 1;
|
||||
}
|
||||
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// ExtractSliceOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def Tensor_ExtractSliceOp : BaseOpWithOffsetSizesAndStrides<
|
||||
Tensor_Dialect, "extract_slice", [NoSideEffect, AttrSizedOperandSegments,
|
||||
OffsetSizeAndStrideOpInterface]> {
|
||||
let summary = "extract slice operation";
|
||||
let description = [{
|
||||
The "extract_slice" operation extracts a tensor from another tensor as
|
||||
specified by the operation's offsets, sizes and strides arguments.
|
||||
|
||||
The extract_slice operation supports the following arguments:
|
||||
|
||||
* source: the "base" tensor from which to extract a slice.
|
||||
* offsets: tensor-rank number of offsets into the "base" tensor from which
|
||||
to extract the slice.
|
||||
* sizes: tensor-rank number of sizes which specify the sizes of the result
|
||||
tensor type.
|
||||
* strides: tensor-rank number of strides specifying subsampling in each
|
||||
dimension.
|
||||
|
||||
The representation based on offsets, sizes and strides supports a
|
||||
partially-static specification via attributes specified through the
|
||||
`static_offsets`, `static_sizes` and `static_strides` arguments. A special
|
||||
sentinel value ShapedType::kDynamicSize and
|
||||
ShapedType::kDynamicStrideOrOffset encodes that the corresponding entry has
|
||||
a dynamic value.
|
||||
|
||||
After buffer-allocation, the "extract_slice" op is expected to lower into a
|
||||
"subview" op.
|
||||
|
||||
An extract_slice operation may additionally reduce the rank of the resulting
|
||||
tensor by removing dimensions that are statically known to be of size 1.
|
||||
|
||||
Example:
|
||||
|
||||
```
|
||||
// Rank-reducing extract_slice.
|
||||
%1 = tensor.extract_slice %0[0, 0, 0][1, 16, 4][1, 1, 1] :
|
||||
tensor<8x16x4xf32> to tensor<16x4xf32>
|
||||
%3 = tensor.extract_slice %2[3, 4, 2][1, 6, 3][1, 1, 1] :
|
||||
tensor<8x16x4xf32> to tensor<6x3xf32>
|
||||
```
|
||||
}];
|
||||
|
||||
let arguments = (ins
|
||||
AnyRankedTensor:$source,
|
||||
Variadic<Index>:$offsets,
|
||||
Variadic<Index>:$sizes,
|
||||
Variadic<Index>:$strides,
|
||||
I64ArrayAttr:$static_offsets,
|
||||
I64ArrayAttr:$static_sizes,
|
||||
I64ArrayAttr:$static_strides
|
||||
);
|
||||
let results = (outs AnyRankedTensor:$result);
|
||||
|
||||
let assemblyFormat = [{
|
||||
$source ``
|
||||
custom<OperandsOrIntegersOffsetsOrStridesList>($offsets, $static_offsets)
|
||||
custom<OperandsOrIntegersSizesList>($sizes, $static_sizes)
|
||||
custom<OperandsOrIntegersOffsetsOrStridesList>($strides, $static_strides)
|
||||
attr-dict `:` type($source) `to` type($result)
|
||||
}];
|
||||
|
||||
let builders = [
|
||||
// Build an ExtractSliceOp with mixed static and dynamic entries and
|
||||
// inferred result type.
|
||||
OpBuilder<(ins "Value":$source, "ArrayRef<OpFoldResult>":$offsets,
|
||||
"ArrayRef<OpFoldResult>":$sizes, "ArrayRef<OpFoldResult>":$strides,
|
||||
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
|
||||
// Build an ExtractSliceOp with mixed static and dynamic entries and custom
|
||||
// result type. If the type passed is nullptr, it is inferred.
|
||||
OpBuilder<(ins "RankedTensorType":$resultType, "Value":$source,
|
||||
"ArrayRef<OpFoldResult>":$offsets, "ArrayRef<OpFoldResult>":$sizes,
|
||||
"ArrayRef<OpFoldResult>":$strides,
|
||||
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
|
||||
// Build an ExtractSliceOp with dynamic entries and inferred result type.
|
||||
OpBuilder<(ins "Value":$source, "ValueRange":$offsets,
|
||||
"ValueRange":$sizes, "ValueRange":$strides,
|
||||
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
|
||||
// Build an ExtractSliceOp with dynamic entries and custom result type. If
// the type passed is nullptr, it is inferred.
|
||||
OpBuilder<(ins "RankedTensorType":$resultType, "Value":$source,
|
||||
"ValueRange":$offsets, "ValueRange":$sizes, "ValueRange":$strides,
|
||||
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
|
||||
];
|
||||
|
||||
let extraClassDeclaration = extraBaseClassDeclaration # [{
|
||||
/// Returns the type of the base tensor operand.
|
||||
RankedTensorType getSourceType() {
|
||||
return source().getType().cast<RankedTensorType>();
|
||||
}
|
||||
|
||||
/// The result of an extract_slice is always a tensor.
|
||||
RankedTensorType getType() {
|
||||
return getResult().getType().cast<RankedTensorType>();
|
||||
}
|
||||
|
||||
/// An extract_slice result type can be fully inferred from the source type
|
||||
/// and the static representation of offsets, sizes and strides. Special
|
||||
/// sentinels encode the dynamic case.
|
||||
static Type inferResultType(RankedTensorType sourceRankedTensorType,
|
||||
ArrayRef<int64_t> staticOffsets,
|
||||
ArrayRef<int64_t> staticSizes,
|
||||
ArrayRef<int64_t> staticStrides);
|
||||
static Type inferResultType(RankedTensorType sourceRankedTensorType,
|
||||
ArrayRef<OpFoldResult> staticOffsets,
|
||||
ArrayRef<OpFoldResult> staticSizes,
|
||||
ArrayRef<OpFoldResult> staticStrides);
|
||||
static Type inferRankReducedResultType(unsigned resultRank,
|
||||
RankedTensorType sourceRankedTensorType,
|
||||
ArrayRef<int64_t> staticOffsets,
|
||||
ArrayRef<int64_t> staticSizes,
|
||||
ArrayRef<int64_t> staticStrides);
|
||||
static Type inferRankReducedResultType(unsigned resultRank,
|
||||
RankedTensorType sourceRankedTensorType,
|
||||
ArrayRef<OpFoldResult> staticOffsets,
|
||||
ArrayRef<OpFoldResult> staticSizes,
|
||||
ArrayRef<OpFoldResult> staticStrides);
|
||||
|
||||
/// Return the expected rank of each of the `static_offsets`, `static_sizes`
|
||||
/// and `static_strides` attributes.
|
||||
std::array<unsigned, 3> getArrayAttrMaxRanks() {
|
||||
unsigned rank = getSourceType().getRank();
|
||||
return {rank, rank, rank};
|
||||
}
|
||||
|
||||
/// Return the number of leading operands before the `offsets`, `sizes` and
|
||||
/// `strides` operands.
|
||||
static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; }
|
||||
}];
|
||||
|
||||
let hasCanonicalizer = 1;
|
||||
let hasFolder = 1;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// FromElementsOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
@ -200,7 +339,7 @@ def Tensor_InsertOp : Tensor_Op<"insert",
|
|||
The `tensor.insert` op writes a scalar into a tensor `dest` as specified by
|
||||
the operation's indices.
|
||||
|
||||
It returns a copy of `dest` with the proper subtensor updated with the value
|
||||
It returns a copy of `dest` with the proper slice updated with the value
|
||||
of `scalar`.
|
||||
|
||||
The arity of indices must match the rank of the tensor `dest` (i.e., if a
|
||||
|
@ -234,6 +373,107 @@ def Tensor_InsertOp : Tensor_Op<"insert",
|
|||
let hasFolder = 1;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// InsertSliceOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
def Tensor_InsertSliceOp : BaseOpWithOffsetSizesAndStrides<
|
||||
Tensor_Dialect, "insert_slice",
|
||||
[NoSideEffect, AttrSizedOperandSegments, OffsetSizeAndStrideOpInterface,
|
||||
TypesMatchWith<"expected result type to match dest type",
|
||||
"dest", "result", "$_self">]> {
|
||||
let summary = "insert_slice operation";
|
||||
let description = [{
|
||||
The "insert_slice" operation inserts a tensor `source` into another
|
||||
tensor `dest` as specified by the operation's offsets, sizes and strides
|
||||
arguments.
|
||||
|
||||
It returns a copy of `dest` with the proper slice updated with the value
|
||||
of `source`.
|
||||
|
||||
The insert_slice operation supports the following arguments:
|
||||
|
||||
* source: the tensor that is inserted.
|
||||
* dest: the tensor into which the source tensor is inserted.
|
||||
* offsets: tensor-rank number of offsets into the `dest` tensor into which
|
||||
the slice is inserted.
|
||||
* sizes: tensor-rank number of sizes which specify the sizes of the result
|
||||
tensor type.
|
||||
* strides: tensor-rank number of strides that specify subsampling in each
|
||||
dimension.
|
||||
|
||||
The representation based on offsets, sizes and strides supports a
|
||||
partially-static specification via attributes specified through the
|
||||
`static_offsets`, `static_sizes` and `static_strides` arguments. A special
|
||||
sentinel value ShapedType::kDynamicSize and
|
||||
ShapedType::kDynamicStrideOrOffset encodes that the corresponding entry has
|
||||
a dynamic value.
|
||||
|
||||
After buffer-allocation, the "insert_slice" op is expected to become an
|
||||
in-place buffer update.
|
||||
}];
|
||||
|
||||
let arguments = (ins
|
||||
AnyRankedTensor:$source,
|
||||
AnyRankedTensor:$dest,
|
||||
Variadic<Index>:$offsets,
|
||||
Variadic<Index>:$sizes,
|
||||
Variadic<Index>:$strides,
|
||||
I64ArrayAttr:$static_offsets,
|
||||
I64ArrayAttr:$static_sizes,
|
||||
I64ArrayAttr:$static_strides
|
||||
);
|
||||
let results = (outs AnyRankedTensor:$result);
|
||||
|
||||
let assemblyFormat = [{
|
||||
$source `into` $dest ``
|
||||
custom<OperandsOrIntegersOffsetsOrStridesList>($offsets, $static_offsets)
|
||||
custom<OperandsOrIntegersSizesList>($sizes, $static_sizes)
|
||||
custom<OperandsOrIntegersOffsetsOrStridesList>($strides, $static_strides)
|
||||
attr-dict `:` type($source) `into` type($dest)
|
||||
}];
|
||||
|
||||
let verifier = ?;
|
||||
|
||||
let builders = [
|
||||
// Build an InsertSliceOp with mixed static and dynamic entries.
|
||||
OpBuilder<(ins "Value":$source, "Value":$dest,
|
||||
"ArrayRef<OpFoldResult>":$offsets, "ArrayRef<OpFoldResult>":$sizes,
|
||||
"ArrayRef<OpFoldResult>":$strides,
|
||||
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
|
||||
// Build an InsertSliceOp with dynamic entries.
|
||||
OpBuilder<(ins "Value":$source, "Value":$dest,
|
||||
"ValueRange":$offsets, "ValueRange":$sizes, "ValueRange":$strides,
|
||||
CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
|
||||
];
|
||||
|
||||
let extraClassDeclaration = extraBaseClassDeclaration # [{
|
||||
/// Returns the type of the base tensor operand.
|
||||
RankedTensorType getSourceType() {
|
||||
return source().getType().cast<RankedTensorType>();
|
||||
}
|
||||
|
||||
/// The result of an insert_slice is always a tensor.
|
||||
RankedTensorType getType() {
|
||||
return getResult().getType().cast<RankedTensorType>();
|
||||
}
|
||||
|
||||
/// Return the expected rank of each of the `static_offsets`, `static_sizes`
|
||||
/// and `static_strides` attributes.
|
||||
std::array<unsigned, 3> getArrayAttrMaxRanks() {
|
||||
unsigned rank = getType().getRank();
|
||||
return {rank, rank, rank};
|
||||
}
|
||||
|
||||
/// Return the number of leading operands before the `offsets`, `sizes` and
|
||||
/// `strides` operands.
|
||||
static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 2; }
|
||||
}];
|
||||
|
||||
let hasCanonicalizer = 1;
|
||||
let hasFolder = 1;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// ReshapeOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -1384,7 +1384,7 @@ def Tosa_GatherOp : Tosa_Op<"gather", [NoSideEffect]> {
|
|||
let summary = "Gather operation,";
|
||||
|
||||
let description = [{
|
||||
Generate a tensor for which each element in the output is a subtensor of the
|
||||
Generate a tensor for which each element in the output is a slice of the
|
||||
values tensor based on the value of indices.
|
||||
}];
|
||||
|
||||
|
|
|
@ -627,10 +627,11 @@ LogicalResult SplitAtOpConversion::matchAndRewrite(
|
|||
Value index = b.create<SelectOp>(indexIsNegative, add, originalIndex);
|
||||
|
||||
Value one = b.create<ConstantIndexOp>(1);
|
||||
Value head = b.create<SubTensorOp>(transformed.operand(), zero, index, one);
|
||||
Value head =
|
||||
b.create<tensor::ExtractSliceOp>(transformed.operand(), zero, index, one);
|
||||
Value tailSize = b.create<SubIOp>(rank, index);
|
||||
Value tail =
|
||||
b.create<SubTensorOp>(transformed.operand(), index, tailSize, one);
|
||||
Value tail = b.create<tensor::ExtractSliceOp>(transformed.operand(), index,
|
||||
tailSize, one);
|
||||
rewriter.replaceOp(op, {head, tail});
|
||||
return success();
|
||||
}
|
||||
|
|
|
@ -1741,8 +1741,8 @@ struct ConcatConverter : public OpConversionPattern<tosa::ConcatOp> {
|
|||
|
||||
for (auto arg : args) {
|
||||
sizes[axis] = rewriter.create<memref::DimOp>(loc, arg, axisValue);
|
||||
result = rewriter.create<SubTensorInsertOp>(loc, arg, result, offsets,
|
||||
sizes, strides);
|
||||
result = rewriter.create<tensor::InsertSliceOp>(loc, arg, result, offsets,
|
||||
sizes, strides);
|
||||
offsets[axis] = rewriter.create<AddIOp>(loc, offsets[axis], sizes[axis]);
|
||||
}
|
||||
rewriter.replaceOp(op, result);
|
||||
|
|
|
@ -12,6 +12,7 @@
|
|||
|
||||
#include "mlir/Conversion/TosaToStandard/TosaToStandard.h"
|
||||
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Tosa/IR/TosaOps.h"
|
||||
#include "mlir/IR/PatternMatch.h"
|
||||
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
|
||||
|
@ -42,7 +43,7 @@ public:
|
|||
SmallVector<int64_t> strides;
|
||||
strides.resize(sliceOp.getType().template cast<ShapedType>().getRank(), 1);
|
||||
|
||||
rewriter.replaceOpWithNewOp<SubTensorOp>(
|
||||
rewriter.replaceOpWithNewOp<tensor::ExtractSliceOp>(
|
||||
sliceOp, sliceOp.getType(), input, ValueRange({}), ValueRange({}),
|
||||
ValueRange({}), sliceOp.start(), sliceOp.size(),
|
||||
rewriter.getI64ArrayAttr(strides));
|
||||
|
|
|
@ -35,6 +35,7 @@ public:
|
|||
target.addIllegalOp<tosa::SliceOp>();
|
||||
target.addIllegalOp<tosa::ApplyScaleOp>();
|
||||
target.addLegalDialect<StandardOpsDialect>();
|
||||
target.addLegalDialect<tensor::TensorDialect>();
|
||||
|
||||
mlir::tosa::populateTosaToStandardConversionPatterns(&patterns);
|
||||
if (failed(applyPartialConversion(getOperation(), target,
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include "mlir/Dialect/Linalg/IR/LinalgTypes.h"
|
||||
#include "mlir/Dialect/MemRef/IR/MemRef.h"
|
||||
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/IR/AffineExprVisitor.h"
|
||||
#include "mlir/IR/Matchers.h"
|
||||
#include "mlir/IR/OpImplementation.h"
|
||||
|
@ -746,22 +747,23 @@ struct ReplaceStaticShapeDims : OpRewritePattern<InitTensorOp> {
|
|||
|
||||
namespace {
|
||||
/// Since `init_tensor` operation creates a tensor needed only for its shape, a
|
||||
/// subtensor of this is also needed only for its shape. The result can be
|
||||
/// replaced by a new init_tensor operation of the same size as the subtensor
|
||||
/// op.
|
||||
struct FoldInitTensorWithSubTensorOp : public OpRewritePattern<SubTensorOp> {
|
||||
using OpRewritePattern<SubTensorOp>::OpRewritePattern;
|
||||
/// slice of this is also needed only for its shape. The result can be
|
||||
/// replaced by a new init_tensor operation of the same size as the extract
|
||||
/// slice op.
|
||||
struct FoldInitTensorWithExtractSliceOp
|
||||
: public OpRewritePattern<tensor::ExtractSliceOp> {
|
||||
using OpRewritePattern<tensor::ExtractSliceOp>::OpRewritePattern;
|
||||
|
||||
LogicalResult matchAndRewrite(SubTensorOp subtensorOp,
|
||||
LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
|
||||
PatternRewriter &rewriter) const override {
|
||||
if (!subtensorOp.source().getDefiningOp<linalg::InitTensorOp>())
|
||||
if (!sliceOp.source().getDefiningOp<linalg::InitTensorOp>())
|
||||
return failure();
|
||||
rewriter.replaceOpWithNewOp<linalg::InitTensorOp>(
|
||||
subtensorOp, subtensorOp.sizes(),
|
||||
sliceOp, sliceOp.sizes(),
|
||||
llvm::to_vector<4>(llvm::map_range(
|
||||
subtensorOp.static_sizes(),
|
||||
sliceOp.static_sizes(),
|
||||
[](Attribute attr) { return attr.cast<IntegerAttr>().getInt(); })),
|
||||
subtensorOp.getSourceType().getElementType());
|
||||
sliceOp.getSourceType().getElementType());
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
@ -797,7 +799,7 @@ struct FoldInitTensorWithTensorReshapeOp
|
|||
|
||||
void InitTensorOp::getCanonicalizationPatterns(RewritePatternSet &results,
|
||||
MLIRContext *context) {
|
||||
results.add<FoldInitTensorWithSubTensorOp,
|
||||
results.add<FoldInitTensorWithExtractSliceOp,
|
||||
FoldInitTensorWithTensorReshapeOp<TensorExpandShapeOp>,
|
||||
FoldInitTensorWithTensorReshapeOp<TensorCollapseShapeOp>,
|
||||
ReplaceStaticShapeDims>(context);
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "mlir/Dialect/Math/IR/Math.h"
|
||||
#include "mlir/Dialect/StandardOps/Transforms/Passes.h"
|
||||
#include "mlir/Dialect/StandardOps/Utils/Utils.h"
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Vector/VectorOps.h"
|
||||
#include "mlir/IR/BuiltinDialect.h"
|
||||
#include "mlir/IR/Operation.h"
|
||||
|
@ -232,8 +233,8 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
/// Convert `subtensor %t [offsets][sizes][strides] -> %st` to an alloc + copy
|
||||
/// pattern.
|
||||
/// Convert `extract_slice %t [offsets][sizes][strides] -> %st` to an
|
||||
/// alloc + copy pattern.
|
||||
/// ```
|
||||
/// %a = alloc(sizes)
|
||||
/// %sv = subview %source [offsets][sizes][strides]
|
||||
|
@ -242,21 +243,22 @@ public:
|
|||
///
|
||||
/// This pattern is arguably a std pattern once linalg::CopyOp becomes
|
||||
/// std::CopyOp.
|
||||
class SubTensorOpConverter : public OpConversionPattern<SubTensorOp> {
|
||||
class ExtractSliceOpConverter
|
||||
: public OpConversionPattern<tensor::ExtractSliceOp> {
|
||||
public:
|
||||
using OpConversionPattern<SubTensorOp>::OpConversionPattern;
|
||||
using OpConversionPattern<tensor::ExtractSliceOp>::OpConversionPattern;
|
||||
|
||||
LogicalResult
|
||||
matchAndRewrite(SubTensorOp op, ArrayRef<Value> operands,
|
||||
matchAndRewrite(tensor::ExtractSliceOp op, ArrayRef<Value> operands,
|
||||
ConversionPatternRewriter &rewriter) const final {
|
||||
SubTensorOpAdaptor adaptor(operands, op->getAttrDictionary());
|
||||
tensor::ExtractSliceOpAdaptor adaptor(operands, op->getAttrDictionary());
|
||||
Value sourceMemref = adaptor.source();
|
||||
assert(sourceMemref.getType().isa<MemRefType>());
|
||||
|
||||
MemRefType subviewMemRefType =
|
||||
getTypeConverter()->convertType(op.getType()).cast<MemRefType>();
|
||||
// op.sizes() capture exactly the dynamic alloc operands matching the
|
||||
// subviewMemRefType thanks to subview/subtensor canonicalization and
|
||||
// subviewMemRefType thanks to subview/slice canonicalization and
|
||||
// verification.
|
||||
Value alloc = rewriter.create<memref::AllocOp>(
|
||||
op.getLoc(), subviewMemRefType, op.sizes());
|
||||
|
@ -269,7 +271,7 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
/// Convert `subtensor_insert %source into %dest [offsets][sizes][strides] ->
|
||||
/// Convert `insert_slice %source into %dest [offsets][sizes][strides] ->
|
||||
/// %t` to a buffer_cast + subview + copy + tensor_load pattern.
|
||||
/// buffer_cast and tensor_load are inserted automatically by the
|
||||
/// conversion infra:
|
||||
|
@ -281,15 +283,15 @@ public:
|
|||
///
|
||||
/// This pattern is arguably a std pattern once linalg::CopyOp becomes
|
||||
/// std::CopyOp.
|
||||
class SubTensorInsertOpConverter
|
||||
: public OpConversionPattern<SubTensorInsertOp> {
|
||||
class InsertSliceOpConverter
|
||||
: public OpConversionPattern<tensor::InsertSliceOp> {
|
||||
public:
|
||||
using OpConversionPattern<SubTensorInsertOp>::OpConversionPattern;
|
||||
using OpConversionPattern<tensor::InsertSliceOp>::OpConversionPattern;
|
||||
|
||||
LogicalResult
|
||||
matchAndRewrite(SubTensorInsertOp op, ArrayRef<Value> operands,
|
||||
matchAndRewrite(tensor::InsertSliceOp op, ArrayRef<Value> operands,
|
||||
ConversionPatternRewriter &rewriter) const final {
|
||||
SubTensorInsertOpAdaptor adaptor(operands, op->getAttrDictionary());
|
||||
tensor::InsertSliceOpAdaptor adaptor(operands, op->getAttrDictionary());
|
||||
Value sourceMemRef = adaptor.source();
|
||||
assert(sourceMemRef.getType().isa<MemRefType>());
|
||||
|
||||
|
@ -323,7 +325,8 @@ struct LinalgBufferizePass : public LinalgBufferizeBase<LinalgBufferizePass> {
|
|||
// Mark all Standard operations legal.
|
||||
target.addLegalDialect<AffineDialect, math::MathDialect,
|
||||
memref::MemRefDialect, StandardOpsDialect>();
|
||||
target.addIllegalOp<InitTensorOp, SubTensorOp, SubTensorInsertOp>();
|
||||
target.addIllegalOp<InitTensorOp, tensor::ExtractSliceOp,
|
||||
tensor::InsertSliceOp>();
|
||||
|
||||
// Mark all Linalg operations illegal as long as they work on tensors.
|
||||
auto isLegalOperation = [&](Operation *op) {
|
||||
|
@ -355,8 +358,8 @@ void mlir::linalg::populateLinalgBufferizePatterns(
|
|||
BufferizeInitTensorOp,
|
||||
BufferizeTensorReshapeOp<TensorExpandShapeOp>,
|
||||
BufferizeTensorReshapeOp<TensorCollapseShapeOp>,
|
||||
SubTensorOpConverter,
|
||||
SubTensorInsertOpConverter
|
||||
ExtractSliceOpConverter,
|
||||
InsertSliceOpConverter
|
||||
>(typeConverter, patterns.getContext());
|
||||
// clang-format on
|
||||
}
|
||||
|
|
|
@ -77,7 +77,7 @@
|
|||
// out of the function at each call site.
|
||||
//
|
||||
// iii. as an optimization over ii., it may be possible to reuse an argument
|
||||
// and only want to return a subtensor.
|
||||
// and only want to return a slice.
|
||||
// This may forego allocation by letting *all* callers decide whether to
|
||||
// pass a new *aliasing* memref function argument (i.e. a subview).
|
||||
// Without loss of generality, callers may agree to allocate a new buffer
|
||||
|
@ -284,7 +284,7 @@ LLVM_ATTRIBUTE_UNUSED static InPlaceSpec getInPlace(Value v) {
|
|||
// 5. Whether an op bufferizes to a memory read.
|
||||
// 6. Whether an op bufferizes to a memory write.
|
||||
// These interfaces are necessary to distinguish between various cases and allow
|
||||
// special inplace behavior for (SubTensorOp, SubTensorInsertOp) pairs.
|
||||
// special inplace behavior for (ExtractSliceOp, InsertSliceOp) pairs.
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
/// Return `true` if the op is explicitly supported by bufferization or if it
|
||||
|
@ -295,8 +295,8 @@ static bool hasKnownBufferizationAliasingBehavior(Operation *op) {
|
|||
// clang-format off
|
||||
isa<LinalgOp,
|
||||
ReturnOp,
|
||||
SubTensorOp,
|
||||
SubTensorInsertOp,
|
||||
ExtractSliceOp,
|
||||
InsertSliceOp,
|
||||
VectorTransferOpInterface>(op)
|
||||
// clang-format on
|
||||
|| (none_of(op->getResultTypes(),
|
||||
|
@ -339,8 +339,7 @@ static OpResult getInplaceableOpResult(VectorTransferOpInterface op,
|
|||
/// Return the OpResult that may bufferize into the same buffer as `opOperand`
|
||||
/// when the op is bufferized inplace.
|
||||
/// Return null if no such result exists.
|
||||
static OpResult getInplaceableOpResult(SubTensorInsertOp op,
|
||||
OpOperand &opOperand) {
|
||||
static OpResult getInplaceableOpResult(InsertSliceOp op, OpOperand &opOperand) {
|
||||
if (opOperand.get() != op.dest())
|
||||
return OpResult();
|
||||
return op->getResult(0);
|
||||
|
@ -357,12 +356,12 @@ static OpResult getInplaceableOpResult(OpOperand &opOperand) {
|
|||
// Ops that perform destructive updates on operand(s) to produce
|
||||
// result(s).
|
||||
.Case<LinalgOp,
|
||||
SubTensorInsertOp,
|
||||
InsertSliceOp,
|
||||
VectorTransferOpInterface>(
|
||||
[&](auto op) { return getInplaceableOpResult(op, opOperand); })
|
||||
// SubTensorOp is special, when bufferized inplace it just returns an
|
||||
// ExtractSliceOp is special, when bufferized inplace it just returns an
|
||||
// alias to its operand. Its result is never inplaceable on its operand.
|
||||
.Case([&](SubTensorOp op) { return OpResult(); })
|
||||
.Case([&](ExtractSliceOp op) { return OpResult(); })
|
||||
// Other ops.
|
||||
.Default([&](Operation *op) { return OpResult(); });
|
||||
// clang-format on
|
||||
|
@ -380,10 +379,10 @@ static Optional<OpResult> getAliasingOpResult(OpOperand &opOperand) {
|
|||
return TypeSwitch<Operation *, OpResult>(opOperand.getOwner())
|
||||
// ReturnOp has no result.
|
||||
.Case([&](ReturnOp op) { return OpResult(); })
|
||||
// SubTensorOp is different: its result is not inplaceable on op.source
|
||||
// ExtractSliceOp is different: its result is not inplaceable on op.source
|
||||
// but when bufferized inplace, the result is an aliasing subregion of
|
||||
// op.source.
|
||||
.Case([&](SubTensorOp op) { return op->getResult(0); })
|
||||
.Case([&](ExtractSliceOp op) { return op->getResult(0); })
|
||||
.Default(
|
||||
[&](Operation *op) { return getInplaceableOpResult(opOperand); });
|
||||
}
|
||||
|
@ -395,8 +394,9 @@ static bool bufferizesToMemoryRead(OpOperand &opOperand) {
|
|||
// it. Conservatively return true.
|
||||
if (!maybeOpResult)
|
||||
return true;
|
||||
// SubTensorOp alone doesn't bufferize to a memory read, one of its uses may.
|
||||
if (isa<SubTensorOp>(opOperand.getOwner()))
|
||||
// ExtractSliceOp alone doesn't bufferize to a memory read, one of its uses
|
||||
// may.
|
||||
if (isa<ExtractSliceOp>(opOperand.getOwner()))
|
||||
return false;
|
||||
if (auto linalgOp = dyn_cast<LinalgOp>(opOperand.getOwner()))
|
||||
return linalgOp.isInputTensor(&opOperand) ||
|
||||
|
@ -425,8 +425,9 @@ bufferizesToMemoryWrite(OpOperand &opOperand,
|
|||
// A ReturnOp is not a write.
|
||||
if (isa<ReturnOp>(opOperand.getOwner()))
|
||||
return false;
|
||||
// SubTensorOp alone doesn't bufferize to a memory write, one of its uses may.
|
||||
if (maybeOpResult->getDefiningOp<SubTensorOp>())
|
||||
// ExtractSliceOp alone doesn't bufferize to a memory write, one of its uses
|
||||
// may.
|
||||
if (maybeOpResult->getDefiningOp<ExtractSliceOp>())
|
||||
return false;
|
||||
// If we have a matching OpResult, this is a write.
|
||||
// Additionally allow to restrict to only inPlace write, if so specified.
|
||||
|
@ -442,10 +443,10 @@ namespace {
|
|||
|
||||
/// The BufferizationAliasInfo class maintains a list of buffer aliases and
|
||||
/// equivalence classes to support bufferization.
|
||||
/// SubTensorOps have special behavior, they act as a level of indirection for
|
||||
/// bufferization. They don't create reads or writes themselves and analysis
|
||||
/// ExtractSliceOps have special behavior, they act as a level of indirection
|
||||
/// for bufferization. They don't create reads or writes themselves and analysis
|
||||
/// needs to look through their uses.
|
||||
/// SubTensorOp + SubTensorInsertOp have special joint behavior: they may
|
||||
/// ExtractSliceOp + InsertSliceOp have special joint behavior: they may
|
||||
/// bufferize to the same buffer (i.e. subview), which is what introduces the
|
||||
/// need for bufferization classes.
|
||||
/// Some of these functionalities could be refactored in a Bufferizer class that
|
||||
|
@ -469,7 +470,7 @@ public:
|
|||
|
||||
/// Return true if the buffer to which `operand` would bufferize is equivalent
|
||||
/// to some use that would bufferize to a write to a buffer.
|
||||
bool aliasesInPlaceWrite(SubTensorOp subTensorOp) const;
|
||||
bool aliasesInPlaceWrite(ExtractSliceOp extractSliceOp) const;
|
||||
|
||||
/// Merge result's and operand's aliasing sets and iterate to a fixed point.
|
||||
void bufferizeInPlace(OpResult result, OpOperand &operand,
|
||||
|
@ -495,10 +496,10 @@ public:
|
|||
bool existsNonDominatingRead(OpOperand &opOperand,
|
||||
const DominanceInfo &domInfo) const;
|
||||
|
||||
/// Return true if the source of a `subTensorInsertOp` bufferizes to an
|
||||
/// equivalent SubTensorOp.
|
||||
bool isSourceEquivalentToAMatchingSubTensorOp(
|
||||
SubTensorInsertOp subTensorInsertOp) const;
|
||||
/// Return true if the source of an `insertSliceOp` bufferizes to an
|
||||
/// equivalent ExtractSliceOp.
|
||||
bool isSourceEquivalentToAMatchingExtractSliceOp(
|
||||
InsertSliceOp insertSliceOp) const;
|
||||
|
||||
/// Print to `os`.
|
||||
void print(raw_ostream &os) const;
|
||||
|
@ -519,13 +520,13 @@ private:
|
|||
/// Iteratively merge alias sets until a fixed-point.
|
||||
void mergeAliasesToFixedPoint();
|
||||
|
||||
/// Return true if the (SubTensorOp, SubTensorInsertOp) pair match (i.e.
|
||||
/// Return true if the (ExtractSliceOp, InsertSliceOp) pair match (i.e.
|
||||
/// equivalent operand / result and same offset/sizes/strides specification).
|
||||
///
|
||||
/// This is one particular type of relationship between ops on tensors that
|
||||
/// reduce to an equivalence on buffers. This should be generalized and
|
||||
/// exposed as interfaces on the proper types.
|
||||
bool areEquivalentSubTensorOps(SubTensorOp st, SubTensorInsertOp sti) const;
|
||||
bool areEquivalentExtractSliceOps(ExtractSliceOp st, InsertSliceOp sti) const;
|
||||
|
||||
/// Return true if there is a `candidateOp` that would write to memory after
|
||||
/// bufferization and such that:
|
||||
|
@ -658,10 +659,10 @@ bool BufferizationAliasInfo::aliasesNonWriteableBuffer(
|
|||
/// Return true if the buffer to which `operand` would bufferize is equivalent
|
||||
/// to some use that would bufferize to a write to a buffer.
|
||||
bool BufferizationAliasInfo::aliasesInPlaceWrite(
|
||||
SubTensorOp subTensorOp) const {
|
||||
ExtractSliceOp extractSliceOp) const {
|
||||
LDBG("----Start aliasesInPlaceWrite\n");
|
||||
LDBG("-------for op: " << *subTensorOp.getOperation() << '\n');
|
||||
for (Value v : getAliasInfoRef(subTensorOp.result())) {
|
||||
LDBG("-------for op: " << *extractSliceOp.getOperation() << '\n');
|
||||
for (Value v : getAliasInfoRef(extractSliceOp.result())) {
|
||||
for (auto &use : v.getUses()) {
|
||||
if (bufferizesToMemoryWrite(use, InPlaceSpec::True)) {
|
||||
LDBG("-----------wants to bufferize to inPlace write: "
|
||||
|
@ -670,7 +671,7 @@ bool BufferizationAliasInfo::aliasesInPlaceWrite(
|
|||
}
|
||||
}
|
||||
}
|
||||
LDBG("----------->subtensor does not alias an inplace write");
|
||||
LDBG("----------->extract_slice does not alias an inplace write");
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -796,16 +797,16 @@ bool BufferizationAliasInfo::existsNonDominatingRead(
|
|||
return false;
|
||||
}
|
||||
|
||||
/// Return true if the source of a `subTensorInsertOp` bufferizes to an
|
||||
/// equivalent SubTensorOp.
|
||||
bool BufferizationAliasInfo::isSourceEquivalentToAMatchingSubTensorOp(
|
||||
SubTensorInsertOp subTensorInsertOp) const {
|
||||
auto leaderIt = equivalentInfo.findLeader(subTensorInsertOp.source());
|
||||
/// Return true if the source of an `insertSliceOp` bufferizes to an
|
||||
/// equivalent ExtractSliceOp.
|
||||
bool BufferizationAliasInfo::isSourceEquivalentToAMatchingExtractSliceOp(
|
||||
InsertSliceOp insertSliceOp) const {
|
||||
auto leaderIt = equivalentInfo.findLeader(insertSliceOp.source());
|
||||
for (auto mit = leaderIt, meit = equivalentInfo.member_end(); mit != meit;
|
||||
++mit) {
|
||||
if (areEquivalentSubTensorOps(
|
||||
dyn_cast_or_null<SubTensorOp>(mit->v.getDefiningOp()),
|
||||
subTensorInsertOp))
|
||||
if (areEquivalentExtractSliceOps(
|
||||
dyn_cast_or_null<ExtractSliceOp>(mit->v.getDefiningOp()),
|
||||
insertSliceOp))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
|
@ -874,8 +875,8 @@ void BufferizationAliasInfo::mergeAliasesToFixedPoint() {
|
|||
/// This is one particular type of relationship between ops on tensors that
|
||||
/// reduce to an equivalence on buffers. This should be generalized and exposed
|
||||
/// as interfaces on the proper types.
|
||||
bool BufferizationAliasInfo::areEquivalentSubTensorOps(
|
||||
SubTensorOp st, SubTensorInsertOp sti) const {
|
||||
bool BufferizationAliasInfo::areEquivalentExtractSliceOps(
|
||||
ExtractSliceOp st, InsertSliceOp sti) const {
|
||||
if (!st || !sti)
|
||||
return false;
|
||||
if (!equivalentInfo.isEquivalent(st.source(), sti.dest()))
|
||||
|
@ -950,47 +951,47 @@ bool BufferizationAliasInfo::isClobberedWriteBeforeRead(
|
|||
return false;
|
||||
}
|
||||
|
||||
// The case `opToBufferize` isa SubTensorOp is important enough that we look
|
||||
// for it specifically. The key information to discover is whether the
|
||||
// aliasing read or write come from a matching SubTensorInsertOp.
|
||||
// The case `opToBufferize` isa ExtractSliceOp is important enough that we
|
||||
// look for it specifically. The key information to discover is whether the
|
||||
// aliasing read or write come from a matching InsertSliceOp.
|
||||
// Such a pattern is introduced by tiling and is the key inplace condition
|
||||
// not to miss.
|
||||
if (auto subTensorOp = dyn_cast<SubTensorOp>(opToBufferize)) {
|
||||
if (auto subTensorInsertOp = dyn_cast<SubTensorInsertOp>(aliasingReadOp)) {
|
||||
// %1 = subtensor %0[%offset_sizes_and_strides_1]
|
||||
if (auto extractSliceOp = dyn_cast<ExtractSliceOp>(opToBufferize)) {
|
||||
if (auto insertSliceOp = dyn_cast<InsertSliceOp>(aliasingReadOp)) {
|
||||
// %1 = extract_slice %0[%offset_sizes_and_strides_1]
|
||||
//
|
||||
// ... // 0 or more of inplace compute that reduces to: %X is an
|
||||
// // aliasingWrite equivalent to %1.
|
||||
// %W = inplace_write(%1)
|
||||
//
|
||||
// // aliasingRead %Y in subtensor_insert
|
||||
// ... = subtensor_insert %W into %R[%offset_sizes_and_strides_1]
|
||||
if (aliasingRead.get() == subTensorInsertOp.dest() &&
|
||||
// // aliasingRead %Y in insert_slice
|
||||
// ... = insert_slice %W into %R[%offset_sizes_and_strides_1]
|
||||
if (aliasingRead.get() == insertSliceOp.dest() &&
|
||||
// TODO: This is currently too restrictive and misses clobberings.
|
||||
// When available, use container-containee analysis: the condition
|
||||
// should be that the `aliasingWrite` is contained within
|
||||
// `subTensorInsertOp.source()`.
|
||||
// `insertSliceOp.source()`.
|
||||
equivalentInfo.isEquivalent(aliasingWrite.get(),
|
||||
subTensorInsertOp.source()) &&
|
||||
areEquivalentSubTensorOps(subTensorOp, subTensorInsertOp)) {
|
||||
LDBG("---->clobbering matching subtensor/subtensor_insert\n");
|
||||
insertSliceOp.source()) &&
|
||||
areEquivalentExtractSliceOps(extractSliceOp, insertSliceOp)) {
|
||||
LDBG("---->clobbering matching extract_slice/insert_slice\n");
|
||||
return true;
|
||||
}
|
||||
// %1 = subtensor %0[%offset_sizes_and_strides_1]
|
||||
// %1 = extract_slice %0[%offset_sizes_and_strides_1]
|
||||
//
|
||||
// ... // bunch of inplace ops that reduce to %X, equivalent to %1.
|
||||
// %X = inplace_write(%1)
|
||||
//
|
||||
// // aliasingRead %X in subtensor_insert
|
||||
// // aliasingWrite %Y in subtensor_insert
|
||||
// ... = subtensor_insert %X into %Y[%offset_sizes_and_strides_1]
|
||||
// // aliasingRead %X in insert_slice
|
||||
// // aliasingWrite %Y in insert_slice
|
||||
// ... = insert_slice %X into %Y[%offset_sizes_and_strides_1]
|
||||
if (aliasingReadOp == aliasingWriteOp) {
|
||||
assert(aliasingRead.get() == subTensorInsertOp.source() &&
|
||||
"expected read to source of subtensor_insert");
|
||||
assert(aliasingWrite.get() == subTensorInsertOp.dest() &&
|
||||
"expected write to dest of subtensor_insert");
|
||||
if (areEquivalentSubTensorOps(subTensorOp, subTensorInsertOp)) {
|
||||
LDBG("---->clobbering matching subtensor/subtensor_insert\n");
|
||||
assert(aliasingRead.get() == insertSliceOp.source() &&
|
||||
"expected read to source of insert_slice");
|
||||
assert(aliasingWrite.get() == insertSliceOp.dest() &&
|
||||
"expected write to dest of insert_slice");
|
||||
if (areEquivalentExtractSliceOps(extractSliceOp, insertSliceOp)) {
|
||||
LDBG("---->clobbering matching extract_slice/insert_slice\n");
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
@ -1262,114 +1263,114 @@ static LogicalResult bufferize(OpBuilder &b, ReturnOp returnOp,
|
|||
return success();
|
||||
}
|
||||
|
||||
/// Bufferize SubTensorOp to subview with optional alloc + copy depending on
|
||||
/// Bufferize ExtractSliceOp to subview with optional alloc + copy depending on
|
||||
/// whether or not it is marked inplaceable.
|
||||
/// Note that `getInplaceableOpResult` on a SubTensorOp always returns null.
|
||||
/// As consequence a SubTensorOp always alloc + copy when taken in
|
||||
/// Note that `getInplaceableOpResult` on an ExtractSliceOp always returns null.
|
||||
/// As a consequence, an ExtractSliceOp always allocs + copies when taken in
|
||||
/// isolation.
|
||||
static LogicalResult bufferize(OpBuilder &b, SubTensorOp subTensorOp,
|
||||
static LogicalResult bufferize(OpBuilder &b, ExtractSliceOp extractSliceOp,
|
||||
BlockAndValueMapping &bvm,
|
||||
const BufferizationAliasInfo &aliasInfo) {
|
||||
LDBG("bufferize: " << *subTensorOp << '\n');
|
||||
LDBG("bufferize: " << *extractSliceOp << '\n');
|
||||
|
||||
// Take a guard before anything else.
|
||||
OpBuilder::InsertionGuard g(b);
|
||||
b.setInsertionPoint(subTensorOp);
|
||||
b.setInsertionPoint(extractSliceOp);
|
||||
|
||||
Location loc = subTensorOp.getLoc();
|
||||
Location loc = extractSliceOp.getLoc();
|
||||
// Bail if source was not bufferized.
|
||||
Value srcMemref = lookup(bvm, subTensorOp.source());
|
||||
Value srcMemref = lookup(bvm, extractSliceOp.source());
|
||||
if (!srcMemref)
|
||||
return failure();
|
||||
auto srcMemrefType = srcMemref.getType().cast<MemRefType>();
|
||||
auto dstTensorType = subTensorOp.result().getType().cast<RankedTensorType>();
|
||||
auto dstTensorType =
|
||||
extractSliceOp.result().getType().cast<RankedTensorType>();
|
||||
|
||||
// If not inplaceable, alloc.
|
||||
Value alloc;
|
||||
auto inPlace = getInPlace(subTensorOp->getResult(0));
|
||||
auto inPlace = getInPlace(extractSliceOp->getResult(0));
|
||||
if (inPlace != InPlaceSpec::True) {
|
||||
alloc =
|
||||
createNewAllocDeallocPairForShapedValue(b, loc, subTensorOp.result());
|
||||
alloc = createNewAllocDeallocPairForShapedValue(b, loc,
|
||||
extractSliceOp.result());
|
||||
b.setInsertionPointAfter(alloc.getDefiningOp());
|
||||
}
|
||||
|
||||
// Bufferize to subview.
|
||||
auto subviewMemRefType =
|
||||
memref::SubViewOp::inferRankReducedResultType(
|
||||
dstTensorType.getRank(), srcMemrefType, subTensorOp.getMixedOffsets(),
|
||||
subTensorOp.getMixedSizes(), subTensorOp.getMixedStrides())
|
||||
dstTensorType.getRank(), srcMemrefType,
|
||||
extractSliceOp.getMixedOffsets(), extractSliceOp.getMixedSizes(),
|
||||
extractSliceOp.getMixedStrides())
|
||||
.cast<MemRefType>();
|
||||
Value subView = b.create<memref::SubViewOp>(
|
||||
loc, subviewMemRefType, srcMemref, subTensorOp.getMixedOffsets(),
|
||||
subTensorOp.getMixedSizes(), subTensorOp.getMixedStrides());
|
||||
loc, subviewMemRefType, srcMemref, extractSliceOp.getMixedOffsets(),
|
||||
extractSliceOp.getMixedSizes(), extractSliceOp.getMixedStrides());
|
||||
|
||||
/// If not inplaceable, copy.
|
||||
if (alloc) {
|
||||
b.create<CopyOp>(subTensorOp.getLoc(), subView, alloc);
|
||||
b.create<CopyOp>(extractSliceOp.getLoc(), subView, alloc);
|
||||
subView = alloc;
|
||||
}
|
||||
|
||||
map(bvm, subTensorOp.result(), subView);
|
||||
map(bvm, extractSliceOp.result(), subView);
|
||||
return success();
|
||||
}
|
||||
|
||||
static LogicalResult bufferize(OpBuilder &b,
|
||||
SubTensorInsertOp subTensorInsertOp,
|
||||
static LogicalResult bufferize(OpBuilder &b, InsertSliceOp insertSliceOp,
|
||||
BlockAndValueMapping &bvm,
|
||||
const BufferizationAliasInfo &aliasInfo) {
|
||||
LDBG("bufferize: " << *subTensorInsertOp << '\n');
|
||||
LDBG("bufferize: " << *insertSliceOp << '\n');
|
||||
|
||||
// Take a guard before anything else.
|
||||
OpBuilder::InsertionGuard g(b);
|
||||
b.setInsertionPoint(subTensorInsertOp);
|
||||
Location loc = subTensorInsertOp.getLoc();
|
||||
b.setInsertionPoint(insertSliceOp);
|
||||
Location loc = insertSliceOp.getLoc();
|
||||
|
||||
Value dstMemref = lookup(bvm, subTensorInsertOp.dest());
|
||||
Value dstMemref = lookup(bvm, insertSliceOp.dest());
|
||||
if (!dstMemref)
|
||||
return failure();
|
||||
auto inPlace = getInPlace(subTensorInsertOp->getResult(0));
|
||||
auto inPlace = getInPlace(insertSliceOp->getResult(0));
|
||||
if (inPlace != InPlaceSpec::True) {
|
||||
// Since subtensor_insert arise from tiling and introducing loops, this
|
||||
// Since insert_slice arises from tiling and introducing loops, this
|
||||
// case is generally a deal breaker. When used with loops, this ends up
|
||||
// cloning the whole tensor on every single iteration and is a symptom
|
||||
// of a catastrophically bad scheduling decision.
|
||||
// TODO: be very loud about it or even consider failing the pass.
|
||||
Value newDstMemref = createNewAllocDeallocPairForShapedValue(
|
||||
b, loc, subTensorInsertOp.result());
|
||||
Value newDstMemref =
|
||||
createNewAllocDeallocPairForShapedValue(b, loc, insertSliceOp.result());
|
||||
b.setInsertionPointAfter(newDstMemref.getDefiningOp());
|
||||
b.create<CopyOp>(subTensorInsertOp.getLoc(), dstMemref, newDstMemref);
|
||||
b.create<CopyOp>(insertSliceOp.getLoc(), dstMemref, newDstMemref);
|
||||
dstMemref = newDstMemref;
|
||||
}
|
||||
auto dstMemrefType = dstMemref.getType().cast<MemRefType>();
|
||||
|
||||
Value srcMemref = lookup(bvm, subTensorInsertOp.source());
|
||||
Value srcMemref = lookup(bvm, insertSliceOp.source());
|
||||
if (!srcMemref)
|
||||
return failure();
|
||||
auto subviewMemRefType =
|
||||
memref::SubViewOp::inferRankReducedResultType(
|
||||
subTensorInsertOp.getSourceType().getRank(), dstMemrefType,
|
||||
subTensorInsertOp.getMixedOffsets(),
|
||||
subTensorInsertOp.getMixedSizes(),
|
||||
subTensorInsertOp.getMixedStrides())
|
||||
insertSliceOp.getSourceType().getRank(), dstMemrefType,
|
||||
insertSliceOp.getMixedOffsets(), insertSliceOp.getMixedSizes(),
|
||||
insertSliceOp.getMixedStrides())
|
||||
.cast<MemRefType>();
|
||||
|
||||
// A copy of the source buffer is needed if either:
|
||||
// - The producer of `source` is not inplace. This is the case where a
|
||||
// subtensor is computed out of place into the inplace full tensor.
|
||||
// slice is computed out of place into the inplace full tensor.
|
||||
// - The result is not inplace. This is the case where the whole tensor is
|
||||
// cloned and the clone needs to be updated.
|
||||
if (!aliasInfo.isSourceEquivalentToAMatchingSubTensorOp(subTensorInsertOp) ||
|
||||
if (!aliasInfo.isSourceEquivalentToAMatchingExtractSliceOp(insertSliceOp) ||
|
||||
inPlace != InPlaceSpec::True) {
|
||||
LDBG("subtensor_insert needs extra source copy: "
|
||||
<< subTensorInsertOp.source() << " -> copy\n");
|
||||
LDBG("insert_slice needs extra source copy: " << insertSliceOp.source()
|
||||
<< " -> copy\n");
|
||||
// Take a subview of the dst.
|
||||
Value subView = b.create<memref::SubViewOp>(
|
||||
loc, subviewMemRefType, dstMemref, subTensorInsertOp.getMixedOffsets(),
|
||||
subTensorInsertOp.getMixedSizes(), subTensorInsertOp.getMixedStrides());
|
||||
b.create<CopyOp>(subTensorInsertOp.getLoc(), srcMemref, subView);
|
||||
loc, subviewMemRefType, dstMemref, insertSliceOp.getMixedOffsets(),
|
||||
insertSliceOp.getMixedSizes(), insertSliceOp.getMixedStrides());
|
||||
b.create<CopyOp>(insertSliceOp.getLoc(), srcMemref, subView);
|
||||
}
|
||||
|
||||
map(bvm, subTensorInsertOp.result(), dstMemref);
|
||||
map(bvm, insertSliceOp.result(), dstMemref);
|
||||
|
||||
return success();
|
||||
}
|
||||
|
@ -1433,54 +1434,54 @@ static LogicalResult bufferize(OpBuilder &b, VectorTransferOpInterface op,
|
|||
//===----------------------------------------------------------------------===//
|
||||
|
||||
///
|
||||
/// Rationale for bufferizing `%1 = subtensor %0[...]` inplace.
|
||||
/// Rationale for bufferizing `%1 = tensor.extract_slice %0[...]` inplace.
|
||||
/// ===========================================================
|
||||
///
|
||||
/// When bufferized out of place, a SubTensorOp lowers to alloc + copy. This
|
||||
/// When bufferized out of place, an ExtractSliceOp lowers to alloc + copy. This
|
||||
/// cannot change the flow of information for either the source or the
|
||||
/// result buffers.
|
||||
///
|
||||
/// When bufferized inplace, a SubTensorOp does not by itself create any read or
|
||||
/// write from memory. Instead, it has the effect of merging the alias sets of
|
||||
/// the source and the result buffers.
|
||||
/// When bufferized inplace, an ExtractSliceOp does not by itself create any
|
||||
/// read or write from memory. Instead, it has the effect of merging the alias
|
||||
/// sets of the source and the result buffers.
|
||||
///
|
||||
/// An analysis is required to ensure inplace bufferization would not result in
|
||||
/// RaW dependence violations.
|
||||
static void bufferizableInPlaceAnalysis(SubTensorOp subTensorOp,
|
||||
static void bufferizableInPlaceAnalysis(ExtractSliceOp extractSliceOp,
|
||||
BufferizationAliasInfo &aliasInfo,
|
||||
const DominanceInfo &domInfo) {
|
||||
LDBG('\n');
|
||||
LDBG("Try to bufferize subtensor inplace: " << *subTensorOp << '\n');
|
||||
LDBG("Try to bufferize extract_slice inplace: " << *extractSliceOp << '\n');
|
||||
|
||||
// If `subTensorOp` were to be bufferized inplace, it cannot end up
|
||||
// If `extractSliceOp` were to be bufferized inplace, it cannot end up
|
||||
// aliasing a write into a non-writeable buffer.
|
||||
bool wouldCreateAliasingWriteToNonWriteableBuffer =
|
||||
aliasInfo.aliasesInPlaceWrite(subTensorOp) &&
|
||||
aliasInfo.aliasesNonWriteableBuffer(subTensorOp->getOpOperand(0));
|
||||
aliasInfo.aliasesInPlaceWrite(extractSliceOp) &&
|
||||
aliasInfo.aliasesNonWriteableBuffer(extractSliceOp->getOpOperand(0));
|
||||
|
||||
if (wouldCreateAliasingWriteToNonWriteableBuffer)
|
||||
LDBG("->the corresponding buffer is not writeable\n");
|
||||
LDBG("->bufferizes to writeable inplace buffer\n");
|
||||
|
||||
// In any of subTensorOp.result's aliases, can we find 2 such that we hit
|
||||
// In any of extractSliceOp.result's aliases, can we find 2 such that we hit
|
||||
// an interfering write?
|
||||
Value s = subTensorOp.source(), r = subTensorOp.result();
|
||||
Value s = extractSliceOp.source(), r = extractSliceOp.result();
|
||||
bool foundInterference = wouldCreateAliasingWriteToNonWriteableBuffer ||
|
||||
// Do not consider (s, s) and (r, r) as all the
|
||||
// aliasings already exist by construction; we are
|
||||
// interested in new interfering aliases only.
|
||||
aliasInfo.wouldCreateReadAfterWriteInterference(
|
||||
s, r, subTensorOp, domInfo) ||
|
||||
s, r, extractSliceOp, domInfo) ||
|
||||
aliasInfo.wouldCreateReadAfterWriteInterference(
|
||||
r, s, subTensorOp, domInfo);
|
||||
r, s, extractSliceOp, domInfo);
|
||||
if (foundInterference) {
|
||||
setInPlaceOpResult(subTensorOp->getResult(0), InPlaceSpec::False);
|
||||
setInPlaceOpResult(extractSliceOp->getResult(0), InPlaceSpec::False);
|
||||
} else {
|
||||
setInPlaceOpResult(subTensorOp->getResult(0), InPlaceSpec::True);
|
||||
aliasInfo.bufferizeInPlace(subTensorOp->getResult(0),
|
||||
subTensorOp->getOpOperand(0));
|
||||
setInPlaceOpResult(extractSliceOp->getResult(0), InPlaceSpec::True);
|
||||
aliasInfo.bufferizeInPlace(extractSliceOp->getResult(0),
|
||||
extractSliceOp->getOpOperand(0));
|
||||
}
|
||||
LDBG("Done bufferizing subtensor\n");
|
||||
LDBG("Done bufferizing extract_slice\n");
|
||||
}
|
||||
|
||||
/// Analyze the (opOperand, result) pair to determine whether the result can
|
||||
|
@ -1490,8 +1491,8 @@ static void bufferizableInPlaceAnalysis(OpOperand &operand, OpResult result,
|
|||
BufferizationAliasInfo &aliasInfo,
|
||||
const DominanceInfo &domInfo) {
|
||||
Operation *op = result.getDefiningOp();
|
||||
assert(result && !isa<SubTensorOp>(op) &&
|
||||
"expected OpResult not coming from a SubTensorOp");
|
||||
assert(result && !isa<ExtractSliceOp>(op) &&
|
||||
"expected OpResult not coming from a ExtractSliceOp");
|
||||
|
||||
int64_t resultNumber = result.getResultNumber();
|
||||
(void)resultNumber;
|
||||
|
@ -1541,48 +1542,47 @@ static void inPlaceAnalysisFuncOpInternals(FuncOp funcOp,
|
|||
"expected a funcOp definition with a body");
|
||||
|
||||
// Collect ops so we can build our own traversal.
|
||||
SmallVector<SubTensorOp> subTensorOps;
|
||||
SmallVector<SubTensorInsertOp> subTensorInsertOps;
|
||||
SmallVector<Operation *> nonSubTensorOps;
|
||||
SmallVector<ExtractSliceOp> extractSliceOps;
|
||||
SmallVector<InsertSliceOp> insertSliceOps;
|
||||
SmallVector<Operation *> nonSliceOps;
|
||||
funcOp.walk([&](Operation *op) {
|
||||
if (auto subTensorOp = dyn_cast<SubTensorOp>(op))
|
||||
return subTensorOps.push_back(subTensorOp);
|
||||
if (auto subTensorInsertOp = dyn_cast<SubTensorInsertOp>(op))
|
||||
return subTensorInsertOps.push_back(subTensorInsertOp);
|
||||
if (auto extractSliceOp = dyn_cast<ExtractSliceOp>(op))
|
||||
return extractSliceOps.push_back(extractSliceOp);
|
||||
if (auto insertSliceOp = dyn_cast<InsertSliceOp>(op))
|
||||
return insertSliceOps.push_back(insertSliceOp);
|
||||
auto isaTensor = [](Type t) { return t.isa<TensorType>(); };
|
||||
// No tensors => no buffers.
|
||||
if (none_of(op->getOperandTypes(), isaTensor) &&
|
||||
none_of(op->getResultTypes(), isaTensor))
|
||||
return;
|
||||
nonSubTensorOps.push_back(op);
|
||||
nonSliceOps.push_back(op);
|
||||
});
|
||||
|
||||
// Bufferize SubTensorInsertOp greedily: we almost never want to bufferize
|
||||
// Bufferize InsertSliceOp greedily: we almost never want to bufferize
|
||||
// the tensor "inserted into" to become out-of-place. This implementation
|
||||
// does not distinguish between different SubTensorInsertOps. If we want
|
||||
// finer-grained behavior, we could order the SubTensorInsertOps with some
|
||||
// metric.
|
||||
// Walk SubTensorInsertOps in reverse for better interference behavior.
|
||||
for (SubTensorInsertOp subTensorInsertOp : reverse(subTensorInsertOps)) {
|
||||
OpOperand &destOpOperand = subTensorInsertOp->getOpOperand(1);
|
||||
// does not distinguish between different InsertSliceOps. If we want
|
||||
// finer-grained behavior, we could order the InsertSliceOps with some metric.
|
||||
// Walk InsertSliceOps in reverse for better interference behavior.
|
||||
for (InsertSliceOp insertSliceOp : reverse(insertSliceOps)) {
|
||||
OpOperand &destOpOperand = insertSliceOp->getOpOperand(1);
|
||||
bufferizableInPlaceAnalysis(destOpOperand,
|
||||
getInplaceableOpResult(destOpOperand),
|
||||
aliasInfo, domInfo);
|
||||
}
|
||||
|
||||
// Bufferize all ops except SubTensorOp and SubTensorInsertOp which are
|
||||
// handled separately.
|
||||
// Bufferize all ops except ExtractSliceOp and InsertSliceOp which are handled
|
||||
// separately.
|
||||
// Walk other ops in reverse for better interference behavior.
|
||||
for (Operation *op : reverse(nonSubTensorOps))
|
||||
for (Operation *op : reverse(nonSliceOps))
|
||||
for (OpOperand &opOperand : op->getOpOperands())
|
||||
if (OpResult result = getInplaceableOpResult(opOperand))
|
||||
bufferizableInPlaceAnalysis(opOperand, result, aliasInfo, domInfo);
|
||||
|
||||
// Finally, bufferize SubTensorOp.
|
||||
// Walk SubTensorOps in reverse for better clobbering behavior: it is easier
|
||||
// to detect clobbers of smaller subtensors before larger ones.
|
||||
for (SubTensorOp subTensorOp : reverse(subTensorOps))
|
||||
bufferizableInPlaceAnalysis(subTensorOp, aliasInfo, domInfo);
|
||||
// Finally, bufferize ExtractSliceOp.
|
||||
// Walk ExtractSliceOps in reverse for better clobbering behavior: it is
|
||||
// easier to detect clobbers of smaller slices before larger ones.
|
||||
for (ExtractSliceOp extractSliceOp : reverse(extractSliceOps))
|
||||
bufferizableInPlaceAnalysis(extractSliceOp, aliasInfo, domInfo);
|
||||
|
||||
LDBG("End InPlaceAnalysisFuncOpInternals:\n" << funcOp << '\n');
|
||||
}
|
||||
|
@ -1611,8 +1611,8 @@ bufferizeFuncOpInternals(FuncOp funcOp, BlockAndValueMapping &bvm,
|
|||
.Case<memref::DimOp,
|
||||
LinalgOp,
|
||||
ReturnOp,
|
||||
SubTensorOp,
|
||||
SubTensorInsertOp,
|
||||
ExtractSliceOp,
|
||||
InsertSliceOp,
|
||||
VectorTransferOpInterface>(
|
||||
[&](auto op) { return bufferize(b, op, bvm, aliasInfo); })
|
||||
// clang-format on
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include "mlir/Dialect/Linalg/Passes.h"
|
||||
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
|
||||
#include "mlir/Dialect/Linalg/Utils/Utils.h"
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/IR/AffineExpr.h"
|
||||
#include "mlir/IR/AffineMap.h"
|
||||
#include "mlir/IR/BuiltinTypes.h"
|
||||
|
@ -457,8 +458,8 @@ struct ReplaceUnitExtents : public OpRewritePattern<GenericOp> {
|
|||
};
|
||||
} // namespace
|
||||
|
||||
/// Get the reassociation maps to fold the result of a subtensor (or source of a
|
||||
/// subtensor_insert) operation with given offsets, and sizes to its
|
||||
/// Get the reassociation maps to fold the result of an extract_slice (or source
|
||||
/// of an insert_slice) operation with given offsets, and sizes to its
|
||||
/// rank-reduced version. This is only done for the cases where the size is 1
|
||||
/// and offset is 0. Strictly speaking the offset 0 is not required in general,
|
||||
/// but non-zero offsets are not handled by SPIR-V backend at this point (and
|
||||
|
@ -486,41 +487,41 @@ getReassociationMapForFoldingUnitDims(ArrayRef<OpFoldResult> mixedSizes) {
|
|||
}
|
||||
|
||||
namespace {
|
||||
/// Convert `subtensor` operations to rank-reduced versions.
|
||||
struct UseRankReducedSubTensorOp : public OpRewritePattern<SubTensorOp> {
|
||||
using OpRewritePattern<SubTensorOp>::OpRewritePattern;
|
||||
/// Convert `extract_slice` operations to rank-reduced versions.
|
||||
struct UseRankReducedExtractSliceOp
|
||||
: public OpRewritePattern<tensor::ExtractSliceOp> {
|
||||
using OpRewritePattern<tensor::ExtractSliceOp>::OpRewritePattern;
|
||||
|
||||
LogicalResult matchAndRewrite(SubTensorOp subTensorOp,
|
||||
LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
|
||||
PatternRewriter &rewriter) const override {
|
||||
RankedTensorType resultType = subTensorOp.getType();
|
||||
SmallVector<OpFoldResult> offsets = subTensorOp.getMixedOffsets();
|
||||
SmallVector<OpFoldResult> sizes = subTensorOp.getMixedSizes();
|
||||
SmallVector<OpFoldResult> strides = subTensorOp.getMixedStrides();
|
||||
RankedTensorType resultType = sliceOp.getType();
|
||||
SmallVector<OpFoldResult> offsets = sliceOp.getMixedOffsets();
|
||||
SmallVector<OpFoldResult> sizes = sliceOp.getMixedSizes();
|
||||
SmallVector<OpFoldResult> strides = sliceOp.getMixedStrides();
|
||||
auto reassociation = getReassociationMapForFoldingUnitDims(sizes);
|
||||
if (!reassociation ||
|
||||
reassociation->size() == static_cast<size_t>(resultType.getRank()))
|
||||
return failure();
|
||||
auto rankReducedType =
|
||||
SubTensorOp::inferRankReducedResultType(reassociation->size(),
|
||||
subTensorOp.getSourceType(),
|
||||
offsets, sizes, strides)
|
||||
.cast<RankedTensorType>();
|
||||
auto rankReducedType = tensor::ExtractSliceOp::inferRankReducedResultType(
|
||||
reassociation->size(), sliceOp.getSourceType(),
|
||||
offsets, sizes, strides)
|
||||
.cast<RankedTensorType>();
|
||||
|
||||
Location loc = subTensorOp.getLoc();
|
||||
Value newSubTensor = rewriter.create<SubTensorOp>(
|
||||
loc, rankReducedType, subTensorOp.source(), offsets, sizes, strides);
|
||||
rewriter.replaceOpWithNewOp<TensorExpandShapeOp>(
|
||||
subTensorOp, resultType, newSubTensor, *reassociation);
|
||||
Location loc = sliceOp.getLoc();
|
||||
Value newSlice = rewriter.create<tensor::ExtractSliceOp>(
|
||||
loc, rankReducedType, sliceOp.source(), offsets, sizes, strides);
|
||||
rewriter.replaceOpWithNewOp<TensorExpandShapeOp>(sliceOp, resultType,
|
||||
newSlice, *reassociation);
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
/// Convert `subtensor_insert` operations to rank-reduced versions.
|
||||
struct UseRankReducedSubTensorInsertOp
|
||||
: public OpRewritePattern<SubTensorInsertOp> {
|
||||
using OpRewritePattern<SubTensorInsertOp>::OpRewritePattern;
|
||||
/// Convert `insert_slice` operations to rank-reduced versions.
|
||||
struct UseRankReducedInsertSliceOp
|
||||
: public OpRewritePattern<tensor::InsertSliceOp> {
|
||||
using OpRewritePattern<tensor::InsertSliceOp>::OpRewritePattern;
|
||||
|
||||
LogicalResult matchAndRewrite(SubTensorInsertOp insertOp,
|
||||
LogicalResult matchAndRewrite(tensor::InsertSliceOp insertOp,
|
||||
PatternRewriter &rewriter) const override {
|
||||
RankedTensorType sourceType = insertOp.getSourceType();
|
||||
SmallVector<OpFoldResult> offsets = insertOp.getMixedOffsets();
|
||||
|
@ -533,7 +534,7 @@ struct UseRankReducedSubTensorInsertOp
|
|||
Location loc = insertOp.getLoc();
|
||||
auto reshapedSource = rewriter.create<TensorCollapseShapeOp>(
|
||||
loc, insertOp.source(), *reassociation);
|
||||
rewriter.replaceOpWithNewOp<SubTensorInsertOp>(
|
||||
rewriter.replaceOpWithNewOp<tensor::InsertSliceOp>(
|
||||
insertOp, reshapedSource, insertOp.dest(), insertOp.getMixedOffsets(),
|
||||
insertOp.getMixedSizes(), insertOp.getMixedStrides());
|
||||
return success();
|
||||
|
@ -546,8 +547,9 @@ struct UseRankReducedSubTensorInsertOp
|
|||
void mlir::linalg::populateFoldUnitExtentDimsPatterns(
|
||||
RewritePatternSet &patterns) {
|
||||
auto *context = patterns.getContext();
|
||||
patterns.add<FoldUnitDimLoops, ReplaceUnitExtents, UseRankReducedSubTensorOp,
|
||||
UseRankReducedSubTensorInsertOp>(context);
|
||||
patterns.add<FoldUnitDimLoops, ReplaceUnitExtents,
|
||||
UseRankReducedExtractSliceOp, UseRankReducedInsertSliceOp>(
|
||||
context);
|
||||
TensorCollapseShapeOp::getCanonicalizationPatterns(patterns, context);
|
||||
TensorExpandShapeOp::getCanonicalizationPatterns(patterns, context);
|
||||
}
|
||||
|
|
|
@ -48,8 +48,8 @@ using llvm::dbgs;
|
|||
/// are 2 cases:
|
||||
/// a) buffer case: use the SSA value of the views and a simple alias
|
||||
/// analysis on subview ops to determine producer-consumer dependences;
|
||||
/// b) tensor case: use SSA use-def chains on subtensor ops;
|
||||
/// 2. greedily fuse the linalg ops that produce the subview/subtensor.
|
||||
/// b) tensor case: use SSA use-def chains on extract_slice ops;
|
||||
/// 2. greedily fuse the linalg ops that produce the subview/extract_slice.
|
||||
/// 3. inspect the fused ops and determine whether they have other remaining
|
||||
/// LinalgOp uses. If not, then erase the original producing linalg op.
|
||||
///
|
||||
|
@ -73,13 +73,14 @@ getShapeDefiningLoopRange(LinalgOp op, unsigned loopDepth,
|
|||
// Extract the subranges from the linearized ranges.
|
||||
for (OpOperand *opOperand : op.getInputAndOutputOperands()) {
|
||||
// The method `getRangeFromOperandShape` requires using SubViewOp or
|
||||
// SubTensorOps. If the value isnt defined from there continue.
|
||||
// ExtractSliceOps. If the value isn't defined from there continue.
|
||||
// todo: The method should be adapted to get the values from
|
||||
// `ViewInterface`. The interface needs a `getOrCreateRanges` method which
|
||||
// currently returns a `linalg.range`. The fix here is to move this op to
|
||||
// `std` dialect and add the method to `ViewInterface`.
|
||||
if (fromSubViewOpOnly && !isa_and_nonnull<memref::SubViewOp, SubTensorOp>(
|
||||
opOperand->get().getDefiningOp()))
|
||||
if (fromSubViewOpOnly &&
|
||||
!isa_and_nonnull<memref::SubViewOp, tensor::ExtractSliceOp>(
|
||||
opOperand->get().getDefiningOp()))
|
||||
continue;
|
||||
|
||||
AffineMap map = op.getTiedIndexingMap(opOperand);
|
||||
|
@ -221,7 +222,7 @@ static LinalgOp fuse(OpBuilder &b, LinalgOp producer,
|
|||
SmallVector<int64_t, 4> staticSizesVector(rank, ShapedType::kDynamicSize);
|
||||
SmallVector<int64_t, 4> staticStridesVector(
|
||||
rank, ShapedType::kDynamicStrideOrOffset);
|
||||
resultTypes.push_back(SubTensorOp::inferResultType(
|
||||
resultTypes.push_back(tensor::ExtractSliceOp::inferResultType(
|
||||
t.cast<RankedTensorType>(), staticOffsetsVector, staticSizesVector,
|
||||
staticStridesVector));
|
||||
}
|
||||
|
@ -252,15 +253,15 @@ static LinalgOp fuse(OpBuilder &b, LinalgOp producer,
|
|||
}
|
||||
|
||||
/// Get the loop range for a dimension `dim` based on the `shapedOperand`. It is
|
||||
/// expected to be defined by a subview op or a subtensor op.
|
||||
/// expected to be defined by a subview op or an extract_slice op.
|
||||
static Range getRangeFromOperandShape(OpBuilder &b, Location loc,
|
||||
Value shapedOperand, unsigned dim) {
|
||||
Operation *shapeProducingOp = shapedOperand.getDefiningOp();
|
||||
if (auto subViewOp = dyn_cast<memref::SubViewOp>(shapeProducingOp))
|
||||
return subViewOp.getOrCreateRanges(b, loc)[dim];
|
||||
if (auto subTensorOp = dyn_cast<SubTensorOp>(shapeProducingOp))
|
||||
return subTensorOp.getOrCreateRanges(b, loc)[dim];
|
||||
llvm_unreachable("SubviewOp or SubTensorOp expected");
|
||||
if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(shapeProducingOp))
|
||||
return sliceOp.getOrCreateRanges(b, loc)[dim];
|
||||
llvm_unreachable("SubviewOp or ExtractSliceOp expected");
|
||||
}
|
||||
|
||||
/// Fuses the producer into the loop immediately enclosing the consumer.
|
||||
|
@ -439,8 +440,8 @@ mlir::linalg::fuseProducerOfBuffer(OpBuilder &b, OpOperand &consumerOpOperand,
|
|||
if (!producerMap)
|
||||
return llvm::None;
|
||||
|
||||
// Must be a subview or a slice to guarantee there are loops we can fuse
|
||||
// into.
|
||||
// Must be a subview or an extract_slice to guarantee there are loops we can
|
||||
// fuse into.
|
||||
auto subView = consumerOpOperand.get().getDefiningOp<memref::SubViewOp>();
|
||||
if (!subView) {
|
||||
LLVM_DEBUG(llvm::dbgs() << "\nNot fusable (not a subview)");
|
||||
|
@ -473,8 +474,8 @@ static void getProducerOfTensor(Value tensor, OpResult &opResult) {
|
|||
opResult = tensor.cast<OpResult>();
|
||||
return;
|
||||
}
|
||||
if (auto subTensorOp = tensor.getDefiningOp<SubTensorOp>()) {
|
||||
tensor = subTensorOp.source();
|
||||
if (auto sliceOp = tensor.getDefiningOp<tensor::ExtractSliceOp>()) {
|
||||
tensor = sliceOp.source();
|
||||
continue;
|
||||
}
|
||||
if (auto blockArg = tensor.dyn_cast<BlockArgument>()) {
|
||||
|
@ -512,11 +513,11 @@ mlir::linalg::fuseProducerOfTensor(OpBuilder &b, OpResult producerOpResult,
|
|||
|
||||
Value inputTensor = consumerOpOperand.get();
|
||||
|
||||
// Must be a subtensor to guarantee there are loops we can fuse into.
|
||||
auto subTensor = inputTensor.getDefiningOp<SubTensorOp>();
|
||||
if (!subTensor) {
|
||||
// Must be an extract_slice op to guarantee there are loops we can fuse into.
|
||||
auto sliceOp = inputTensor.getDefiningOp<tensor::ExtractSliceOp>();
|
||||
if (!sliceOp) {
|
||||
LLVM_DEBUG(llvm::dbgs()
|
||||
<< "\nNot fusable, not a subtensor: " << inputTensor);
|
||||
<< "\nNot fusable, not an extract_slice op: " << inputTensor);
|
||||
return {};
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include "mlir/Dialect/SCF/SCF.h"
|
||||
#include "mlir/Dialect/SCF/Utils.h"
|
||||
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Vector/VectorOps.h"
|
||||
#include "mlir/Dialect/Vector/VectorUtils.h"
|
||||
#include "mlir/IR/BuiltinOps.h"
|
||||
|
@ -42,13 +43,13 @@ namespace {
|
|||
/// instructions that need to be hoisted too.
|
||||
struct HoistableWrite {
|
||||
vector::TransferWriteOp transferWriteOp;
|
||||
SubTensorInsertOp subTensorInsertOp;
|
||||
tensor::InsertSliceOp insertSliceOp;
|
||||
};
|
||||
/// Represents a unit of hoistable TransferReadOp. This may comprise other
|
||||
/// instructions that need to be hoisted too.
|
||||
struct HoistableRead {
|
||||
vector::TransferReadOp transferReadOp;
|
||||
SubTensorOp subTensorOp;
|
||||
tensor::ExtractSliceOp extractSliceOp;
|
||||
};
|
||||
} // namespace
|
||||
|
||||
|
@ -71,7 +72,8 @@ static bool isEqualOffsetSizeOrStride(OpFoldResult op1, OpFoldResult op2) {
|
|||
}
|
||||
|
||||
/// Return true if all offsets, sizes and strides are equal.
|
||||
static bool sameOffsetsSizesAndStrides(SubTensorOp s, SubTensorInsertOp si) {
|
||||
static bool sameOffsetsSizesAndStrides(tensor::ExtractSliceOp s,
|
||||
tensor::InsertSliceOp si) {
|
||||
if (s.static_offsets().size() != si.static_offsets().size())
|
||||
return false;
|
||||
if (s.static_sizes().size() != si.static_sizes().size())
|
||||
|
@ -99,38 +101,37 @@ static HoistableRead findMatchingTransferRead(HoistableWrite write,
|
|||
|
||||
LLVM_DEBUG(DBGS() << "findMatchingTransferRead for: "
|
||||
<< *write.transferWriteOp.getOperation() << "\n");
|
||||
if (write.subTensorInsertOp)
|
||||
LLVM_DEBUG(DBGS() << "findMatchingTransferRead subTensorInsertOp: "
|
||||
<< *write.subTensorInsertOp.getOperation() << "\n");
|
||||
if (write.insertSliceOp)
|
||||
LLVM_DEBUG(DBGS() << "findMatchingTransferRead inserSliceOp: "
|
||||
<< *write.insertSliceOp.getOperation() << "\n");
|
||||
|
||||
for (Operation *user : srcTensor.getUsers()) {
|
||||
LLVM_DEBUG(DBGS() << "findMatchingTransferRead inspect user: " << *user
|
||||
<< "\n");
|
||||
|
||||
// If HoistableWrite involves a SubTensorInsertOp, we need to find a
|
||||
// matching SubTensorOp.
|
||||
SubTensorOp subTensorOp;
|
||||
// If HoistableWrite involves an InsertSliceOp, we need to find a
|
||||
// matching ExtractSliceOp.
|
||||
tensor::ExtractSliceOp sliceOp;
|
||||
Operation *maybeTransferReadUser = user;
|
||||
if (write.subTensorInsertOp) {
|
||||
subTensorOp = dyn_cast<SubTensorOp>(user);
|
||||
if (!subTensorOp || subTensorOp.getResult().getType() !=
|
||||
write.subTensorInsertOp.source().getType())
|
||||
if (write.insertSliceOp) {
|
||||
sliceOp = dyn_cast<tensor::ExtractSliceOp>(user);
|
||||
if (!sliceOp || sliceOp.getResult().getType() !=
|
||||
write.insertSliceOp.source().getType())
|
||||
continue;
|
||||
|
||||
LLVM_DEBUG(DBGS() << "check whether sameOffsetsSizesAndStrides: "
|
||||
<< *subTensorOp << " vs " << *write.subTensorInsertOp
|
||||
<< "\n");
|
||||
if (!sameOffsetsSizesAndStrides(subTensorOp, write.subTensorInsertOp))
|
||||
<< *sliceOp << " vs " << *write.insertSliceOp << "\n");
|
||||
if (!sameOffsetsSizesAndStrides(sliceOp, write.insertSliceOp))
|
||||
continue;
|
||||
|
||||
LLVM_DEBUG(DBGS() << "sameOffsetsSizesAndStrides: SUCCESS\n");
|
||||
// If we got here, subTensorOp is hoistable iff it has exactly 2 uses:
|
||||
// If we got here, sliceOp is hoistable iff it has exactly 2 uses:
|
||||
// 1. the transfer_write we want to hoist.
|
||||
// 2. a matching transfer_read.
|
||||
// Anything else, we skip.
|
||||
bool skip = false;
|
||||
Operation *otherUser = nullptr;
|
||||
for (Operation *u : subTensorOp->getUsers()) {
|
||||
for (Operation *u : sliceOp->getUsers()) {
|
||||
if (u == write.transferWriteOp)
|
||||
continue;
|
||||
if (otherUser) {
|
||||
|
@ -149,7 +150,7 @@ static HoistableRead findMatchingTransferRead(HoistableWrite write,
|
|||
auto read = dyn_cast<vector::TransferReadOp>(maybeTransferReadUser);
|
||||
if (read && read.indices() == write.transferWriteOp.indices() &&
|
||||
read.getVectorType() == write.transferWriteOp.getVectorType())
|
||||
return HoistableRead{read, subTensorOp};
|
||||
return HoistableRead{read, sliceOp};
|
||||
}
|
||||
return HoistableRead();
|
||||
}
|
||||
|
@ -168,13 +169,13 @@ static bool tensorChunkAccessedByUnknownOp(HoistableWrite write,
|
|||
Operation *user = use.getOwner();
|
||||
// Skip the candidate use, only inspect the "other" uses.
|
||||
if (user == candidateRead.transferReadOp ||
|
||||
user == candidateRead.subTensorOp || user == write.transferWriteOp ||
|
||||
user == write.subTensorInsertOp)
|
||||
user == candidateRead.extractSliceOp ||
|
||||
user == write.transferWriteOp || user == write.insertSliceOp)
|
||||
continue;
|
||||
// Consider all transitive uses through a subtensor / subtensor_insert.
|
||||
// Consider all transitive uses through an extract_slice / insert_slice.
|
||||
// TODO: atm we just bail because a stronger analysis is needed for these
|
||||
// cases.
|
||||
if (isa<SubTensorOp, SubTensorInsertOp>(user))
|
||||
if (isa<tensor::ExtractSliceOp, tensor::InsertSliceOp>(user))
|
||||
return true;
|
||||
// Consider all transitive uses through a vector.transfer_write.
|
||||
if (auto writeUser = dyn_cast<vector::TransferWriteOp>(user)) {
|
||||
|
@ -214,7 +215,7 @@ static bool tensorChunkAccessedByUnknownOp(HoistableWrite write,
|
|||
|
||||
/// Return the `forOp`-invariant HoistableWrite that produces `yieldOperand`.
|
||||
/// Return the null HoistableWrite() if it is not comprised of a
|
||||
/// vector.transfer_write + optional subtensor_insert or if any of the indexings
|
||||
/// vector.transfer_write + optional insert_slice or if any of the indexings
|
||||
/// is `forOp`-dependent.
|
||||
static HoistableWrite
|
||||
getLoopInvariantTransferWriteOpDefining(scf::ForOp forOp,
|
||||
|
@ -229,26 +230,26 @@ getLoopInvariantTransferWriteOpDefining(scf::ForOp forOp,
|
|||
return HoistableWrite{write, nullptr};
|
||||
}
|
||||
|
||||
if (auto subTensorInsertOp = v.getDefiningOp<SubTensorInsertOp>()) {
|
||||
// Inserted subTensor must come from vector.transfer_write.
|
||||
if (auto insertSliceOp = v.getDefiningOp<tensor::InsertSliceOp>()) {
|
||||
// Inserted slice must come from vector.transfer_write.
|
||||
auto write =
|
||||
subTensorInsertOp.source().getDefiningOp<vector::TransferWriteOp>();
|
||||
insertSliceOp.source().getDefiningOp<vector::TransferWriteOp>();
|
||||
if (!write)
|
||||
return HoistableWrite();
|
||||
|
||||
// Tensor inserted into must be a BBArg at position matching yieldOperand's.
|
||||
auto bbArg = subTensorInsertOp.dest().dyn_cast<BlockArgument>();
|
||||
auto bbArg = insertSliceOp.dest().dyn_cast<BlockArgument>();
|
||||
if (!bbArg || bbArg.getOwner()->getParentOp() != forOp ||
|
||||
bbArg.getArgNumber() != /*num iv=*/1 + yieldOperand.getOperandNumber())
|
||||
return HoistableWrite();
|
||||
|
||||
// Indexing inserted into must not depend on `forOp`.
|
||||
for (Value operand : subTensorInsertOp->getOperands().drop_front(
|
||||
SubTensorInsertOp::getOffsetSizeAndStrideStartOperandIndex()))
|
||||
for (Value operand : insertSliceOp->getOperands().drop_front(
|
||||
tensor::InsertSliceOp::getOffsetSizeAndStrideStartOperandIndex()))
|
||||
if (!forOp.isDefinedOutsideOfLoop(operand))
|
||||
return HoistableWrite();
|
||||
|
||||
return HoistableWrite{write, subTensorInsertOp};
|
||||
return HoistableWrite{write, insertSliceOp};
|
||||
}
|
||||
|
||||
return HoistableWrite();
|
||||
|
@ -260,18 +261,18 @@ static void hoistReadWrite(HoistableRead read, HoistableWrite write,
|
|||
scf::ForOp forOp = cast<scf::ForOp>(tensorBBArg.getOwner()->getParentOp());
|
||||
assert(read.transferReadOp && write.transferWriteOp &&
|
||||
"expected transfer_read and transfer_write ops to be set");
|
||||
assert(((read.subTensorOp && write.subTensorInsertOp) ||
|
||||
(!read.subTensorOp && !write.subTensorInsertOp)) &&
|
||||
"expected matching subtensor / subtensor_insert");
|
||||
assert(((read.extractSliceOp && write.insertSliceOp) ||
|
||||
(!read.extractSliceOp && !write.insertSliceOp)) &&
|
||||
"expected matching extract_slice / insert_slice");
|
||||
LLVM_DEBUG(DBGS() << "In forOp:\n"
|
||||
<< *forOp.getOperation()
|
||||
<< "\nHoist: " << *read.transferReadOp.getOperation()
|
||||
<< "\nHoist: " << *write.transferWriteOp.getOperation()
|
||||
<< "\nInvolving: " << tensorBBArg << "\n");
|
||||
|
||||
// If a read subtensor is present, hoist it.
|
||||
if (read.subTensorOp && failed(forOp.moveOutOfLoop({read.subTensorOp})))
|
||||
llvm_unreachable("Unexpected failure moving subtensor out of loop");
|
||||
// If a read slice is present, hoist it.
|
||||
if (read.extractSliceOp && failed(forOp.moveOutOfLoop({read.extractSliceOp})))
|
||||
llvm_unreachable("Unexpected failure moving extract_slice out of loop");
|
||||
|
||||
// Hoist the transfer_read op.
|
||||
if (failed(forOp.moveOutOfLoop({read.transferReadOp})))
|
||||
|
@ -282,20 +283,20 @@ static void hoistReadWrite(HoistableRead read, HoistableWrite write,
|
|||
unsigned initArgNumber = tensorBBArg.getArgNumber() - /*numIvs=*/1;
|
||||
|
||||
// Update the source tensor.
|
||||
if (read.subTensorOp)
|
||||
read.subTensorOp.sourceMutable().assign(forOp.initArgs()[initArgNumber]);
|
||||
if (read.extractSliceOp)
|
||||
read.extractSliceOp.sourceMutable().assign(forOp.initArgs()[initArgNumber]);
|
||||
else
|
||||
read.transferReadOp.sourceMutable().assign(forOp.initArgs()[initArgNumber]);
|
||||
|
||||
// Hoist write after.
|
||||
if (write.subTensorInsertOp)
|
||||
write.subTensorInsertOp->moveAfter(forOp);
|
||||
if (write.insertSliceOp)
|
||||
write.insertSliceOp->moveAfter(forOp);
|
||||
write.transferWriteOp->moveAfter(forOp);
|
||||
|
||||
// Update the yield.
|
||||
auto yieldOp = cast<scf::YieldOp>(forOp.region().front().getTerminator());
|
||||
if (write.subTensorInsertOp)
|
||||
yieldOp->setOperand(initArgNumber, write.subTensorInsertOp.dest());
|
||||
if (write.insertSliceOp)
|
||||
yieldOp->setOperand(initArgNumber, write.insertSliceOp.dest());
|
||||
else
|
||||
yieldOp->setOperand(initArgNumber, write.transferWriteOp.source());
|
||||
|
||||
|
@ -306,13 +307,13 @@ static void hoistReadWrite(HoistableRead read, HoistableWrite write,
|
|||
// Transfer write has been hoisted, need to update the vector and tensor
|
||||
// source. Replace the result of the loop to use the new tensor created
|
||||
// outside the loop.
|
||||
// Depending on whether a subtensor_insert is present or not, it carries the
|
||||
// Depending on whether an insert_slice is present or not, it carries the
|
||||
// update on the tensor operands.
|
||||
if (write.subTensorInsertOp) {
|
||||
if (write.insertSliceOp) {
|
||||
newForOp.getResult(initArgNumber)
|
||||
.replaceAllUsesWith(write.subTensorInsertOp.getResult());
|
||||
write.transferWriteOp.sourceMutable().assign(read.subTensorOp.result());
|
||||
write.subTensorInsertOp.destMutable().assign(read.subTensorOp.source());
|
||||
.replaceAllUsesWith(write.insertSliceOp.getResult());
|
||||
write.transferWriteOp.sourceMutable().assign(read.extractSliceOp.result());
|
||||
write.insertSliceOp.destMutable().assign(read.extractSliceOp.source());
|
||||
} else {
|
||||
newForOp.getResult(initArgNumber)
|
||||
.replaceAllUsesWith(write.transferWriteOp.getResult(0));
|
||||
|
@ -350,9 +351,9 @@ void mlir::linalg::hoistRedundantVectorTransfersOnTensor(FuncOp func) {
|
|||
LLVM_DEBUG(dbgs() << "\n";
|
||||
DBGS() << "Candidate write for hoisting: "
|
||||
<< *write.transferWriteOp.getOperation() << "\n");
|
||||
if (write.subTensorInsertOp)
|
||||
LLVM_DEBUG(DBGS() << "Candidate subtensor_insert for hoisting: "
|
||||
<< *write.subTensorInsertOp.getOperation() << "\n");
|
||||
if (write.insertSliceOp)
|
||||
LLVM_DEBUG(DBGS() << "Candidate insert_slice for hoisting: "
|
||||
<< *write.insertSliceOp.getOperation() << "\n");
|
||||
if (llvm::any_of(write.transferWriteOp.indices(),
|
||||
[&forOp](Value index) {
|
||||
return !forOp.isDefinedOutsideOfLoop(index);
|
||||
|
@ -788,8 +789,8 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
|
|||
// The implementation proceeds in a stack-like fashion:
|
||||
// 1. Iteratively clone and step into the loops, pushing the `packedTensor`
|
||||
// deeper in the stack.
|
||||
// 2. Create a SubTensorInsert at the top of the stack.
|
||||
// 3. Iteratively pop and yield the result of the SubTensorInsertOp across
|
||||
// 2. Create an InsertSliceOp at the top of the stack.
|
||||
// 3. Iteratively pop and yield the result of the InsertSliceOp across
|
||||
// the cloned loops.
|
||||
SmallVector<Value> clonedLoopIvs, leadingPackedTensorIndexings;
|
||||
clonedLoopIvs.reserve(nLoops);
|
||||
|
@ -799,10 +800,10 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
|
|||
backwardSlice.insert(padTensorOp);
|
||||
// Stack step 1. iteratively clone loops and push `packedTensor`.
|
||||
for (Operation *op : backwardSlice) {
|
||||
// Specifically sit out in the subtenso(packedTensor) case: this is the
|
||||
// Specifically sit out in the extract_slice(packedTensor) case: this is the
|
||||
// piece we seek to replace.
|
||||
if (auto subTensor = dyn_cast<SubTensorOp>(op))
|
||||
if (bvm.lookupOrDefault(subTensor.source()) == packedTensor)
|
||||
if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
|
||||
if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
|
||||
continue;
|
||||
auto effects = dyn_cast<MemoryEffectOpInterface>(op);
|
||||
bool hasNoEffects = !effects || effects.hasNoEffect();
|
||||
|
@ -839,7 +840,7 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
|
|||
packedTensor = clonedForOp.getRegionIterArgs().front();
|
||||
}
|
||||
|
||||
// Stack step 2. create SubTensorInsertOp at the top of the stack.
|
||||
// Stack step 2. create InsertSliceOp at the top of the stack.
|
||||
// offsets = [clonedLoopIvs, 0 .. 0].
|
||||
SmallVector<OpFoldResult> offsets(leadingPackedTensorIndexings.begin(),
|
||||
leadingPackedTensorIndexings.end());
|
||||
|
@ -856,8 +857,8 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
|
|||
SmallVector<OpFoldResult> strides(nLoops + paddedRank, b.getIndexAttr(1));
|
||||
|
||||
Value inserted =
|
||||
b.create<SubTensorInsertOp>(loc, bvm.lookup(padTensorOp.result()),
|
||||
packedTensor, offsets, sizes, strides);
|
||||
b.create<tensor::InsertSliceOp>(loc, bvm.lookup(padTensorOp.result()),
|
||||
packedTensor, offsets, sizes, strides);
|
||||
|
||||
// Stack step 3. iteratively pop the stack and propagate the yield.
|
||||
Value valueToYield = inserted;
|
||||
|
@ -869,7 +870,7 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
|
|||
}
|
||||
|
||||
// Now the packed tensor is ready, replace the original padding op by a
|
||||
// 1x..x1 SubTensor [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
|
||||
// 1x..x1 slice [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
|
||||
b.setInsertionPoint(padTensorOp);
|
||||
SmallVector<Value> loopIterationCounts =
|
||||
llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) {
|
||||
|
@ -888,8 +889,8 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
|
|||
packedTensor =
|
||||
scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
|
||||
padTensorOp.replaceAllUsesWith(
|
||||
b.create<SubTensorOp>(loc, padTensorOp.getResultType(), packedTensor,
|
||||
offsets, sizes, strides)
|
||||
b.create<tensor::ExtractSliceOp>(loc, padTensorOp.getResultType(),
|
||||
packedTensor, offsets, sizes, strides)
|
||||
->getResult(0));
|
||||
|
||||
Operation *toErase = padTensorOp;
|
||||
|
|
|
@ -254,18 +254,18 @@ tileLinalgOpImpl(OpBuilder &b, LinalgOp op, ValueRange tileSizes,
|
|||
|
||||
res = op.clone(b, loc, resultTensorTypes, tiledOperands);
|
||||
|
||||
// Insert a subtensor_insert for each output tensor.
|
||||
// Insert an insert_slice for each output tensor.
|
||||
unsigned resultIdx = 0;
|
||||
for (OpOperand *opOperand : op.getOutputTensorOperands()) {
|
||||
// TODO: use an interface/adaptor to avoid leaking position in
|
||||
// `tiledOperands`.
|
||||
Value outputTensor = tiledOperands[opOperand->getOperandNumber()];
|
||||
if (auto subtensor = outputTensor.getDefiningOp<SubTensorOp>()) {
|
||||
tensorResults.push_back(b.create<SubTensorInsertOp>(
|
||||
loc, subtensor.source().getType(), res->getResult(resultIdx),
|
||||
subtensor.source(), subtensor.offsets(), subtensor.sizes(),
|
||||
subtensor.strides(), subtensor.static_offsets(),
|
||||
subtensor.static_sizes(), subtensor.static_strides()));
|
||||
if (auto sliceOp = outputTensor.getDefiningOp<tensor::ExtractSliceOp>()) {
|
||||
tensorResults.push_back(b.create<tensor::InsertSliceOp>(
|
||||
loc, sliceOp.source().getType(), res->getResult(resultIdx),
|
||||
sliceOp.source(), sliceOp.offsets(), sliceOp.sizes(),
|
||||
sliceOp.strides(), sliceOp.static_offsets(), sliceOp.static_sizes(),
|
||||
sliceOp.static_strides()));
|
||||
} else {
|
||||
tensorResults.push_back(res->getResult(resultIdx));
|
||||
}
|
||||
|
@ -406,7 +406,7 @@ void mlir::linalg::populateLinalgTilingCanonicalizationPatterns(
|
|||
scf::ForOp::getCanonicalizationPatterns(patterns, ctx);
|
||||
scf::ParallelOp::getCanonicalizationPatterns(patterns, ctx);
|
||||
ConstantIndexOp::getCanonicalizationPatterns(patterns, ctx);
|
||||
SubTensorOp::getCanonicalizationPatterns(patterns, ctx);
|
||||
tensor::ExtractSliceOp::getCanonicalizationPatterns(patterns, ctx);
|
||||
memref::SubViewOp::getCanonicalizationPatterns(patterns, ctx);
|
||||
tensor::CastOp::getCanonicalizationPatterns(patterns, ctx);
|
||||
memref::ViewOp::getCanonicalizationPatterns(patterns, ctx);
|
||||
|
|
|
@ -16,6 +16,7 @@
|
|||
#include "mlir/Dialect/Linalg/Analysis/DependenceAnalysis.h"
|
||||
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
|
||||
#include "mlir/Dialect/Linalg/Utils/Utils.h"
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
|
||||
#include "mlir/Dialect/Vector/VectorOps.h"
|
||||
#include "mlir/IR/AffineExpr.h"
|
||||
|
@ -128,14 +129,13 @@ static LogicalResult padOperandToSmallestStaticBoundingBox(
|
|||
// Already static shape, no need to pad.
|
||||
if (llvm::none_of(opToPad.getShape(opOperand), ShapedType::isDynamic))
|
||||
return success();
|
||||
auto subtensor = opOperand->get().getDefiningOp<SubTensorOp>();
|
||||
// Not a subtensor, cannot construct a static bounding box.
|
||||
if (!subtensor)
|
||||
auto sliceOp = opOperand->get().getDefiningOp<tensor::ExtractSliceOp>();
|
||||
// Not a slice op, cannot construct a static bounding box.
|
||||
if (!sliceOp)
|
||||
return failure();
|
||||
SmallVector<int64_t> staticSizes;
|
||||
staticSizes.reserve(opToPad.getRank(opOperand));
|
||||
auto shapedOp =
|
||||
cast<OffsetSizeAndStrideOpInterface>(subtensor.getOperation());
|
||||
auto shapedOp = cast<OffsetSizeAndStrideOpInterface>(sliceOp.getOperation());
|
||||
for (auto size : shapedOp.getMixedSizes()) {
|
||||
auto indexAttr = size.is<Attribute>()
|
||||
? size.get<Attribute>().dyn_cast<IntegerAttr>()
|
||||
|
@ -195,8 +195,8 @@ static LogicalResult rewriteAsPaddedOp(PatternRewriter &rewriter,
|
|||
linalg::LinalgOp paddedOp =
|
||||
opToPad.clone(rewriter, loc, resultTensorTypes, newOperands);
|
||||
|
||||
// Recover the subtensor out of the new static results. This keeps the
|
||||
// original linalg op around because it uses the dims of the original results.
|
||||
// Recover the slice out of the new static results. This keeps the original
|
||||
// linalg op around because it uses the dims of the original results.
|
||||
// This later folds away.
|
||||
SmallVector<Value> paddedSubviewResults;
|
||||
paddedSubviewResults.reserve(opToPad->getNumResults());
|
||||
|
@ -211,7 +211,7 @@ static LogicalResult rewriteAsPaddedOp(PatternRewriter &rewriter,
|
|||
return dimOp.getResult();
|
||||
}));
|
||||
SmallVector<OpFoldResult> strides(rank, rewriter.getIndexAttr(1));
|
||||
paddedSubviewResults.push_back(rewriter.create<SubTensorOp>(
|
||||
paddedSubviewResults.push_back(rewriter.create<tensor::ExtractSliceOp>(
|
||||
loc, std::get<1>(it), offsets, sizes, strides));
|
||||
}
|
||||
// Replace the transient `opToPad` locally, except for uses that we just
|
||||
|
@ -679,7 +679,7 @@ LogicalResult PadTensorOpTransformationPattern::matchAndRewrite(
|
|||
rewriter.create<linalg::FillOp>(loc, initTensor, padValue).result();
|
||||
|
||||
// Copy original contents into new tensor
|
||||
// Uses linalg.generic, but could be done with std.subtensor_insert
|
||||
// Uses linalg.generic, but could be done with tensor.insert_slice
|
||||
SmallVector<AffineExpr, 4> outputExprs;
|
||||
for (unsigned i = 0; i < resultShapedType.getRank(); ++i) {
|
||||
outputExprs.push_back(getAffineDimExpr(i, rewriter.getContext()) +
|
||||
|
@ -719,13 +719,13 @@ static OpFoldResult asOpFoldResult(OpBuilder &builder, Value val) {
|
|||
return val;
|
||||
}
|
||||
|
||||
LogicalResult SubTensorOfPadTensorSwapPattern::matchAndRewrite(
|
||||
SubTensorOp subTensorOp, PatternRewriter &rewriter) const {
|
||||
auto padOp = subTensorOp.source().getDefiningOp<PadTensorOp>();
|
||||
LogicalResult ExtractSliceOfPadTensorSwapPattern::matchAndRewrite(
|
||||
tensor::ExtractSliceOp sliceOp, PatternRewriter &rewriter) const {
|
||||
auto padOp = sliceOp.source().getDefiningOp<PadTensorOp>();
|
||||
if (!padOp)
|
||||
return failure();
|
||||
// Only unit stride supported.
|
||||
if (!subTensorOp.hasUnitStride())
|
||||
if (!sliceOp.hasUnitStride())
|
||||
return failure();
|
||||
// Only constant padding value supported.
|
||||
Value padValue = padOp.getConstantPaddingValue();
|
||||
|
@ -734,7 +734,7 @@ LogicalResult SubTensorOfPadTensorSwapPattern::matchAndRewrite(
|
|||
|
||||
// Helper variables and functions for various arithmetic operations. These are
|
||||
// used extensively for computing new offset/length and padding values.
|
||||
Location loc = subTensorOp.getLoc();
|
||||
Location loc = sliceOp.getLoc();
|
||||
AffineExpr dim0, dim1;
|
||||
bindDims(rewriter.getContext(), dim0, dim1);
|
||||
// Add two integers.
|
||||
|
@ -786,8 +786,8 @@ LogicalResult SubTensorOfPadTensorSwapPattern::matchAndRewrite(
|
|||
int64_t rank = padOp.getSourceType().getRank();
|
||||
for (unsigned dim = 0; dim < rank; ++dim) {
|
||||
auto low = asValue(rewriter, loc, padOp.getMixedLowPad()[dim]);
|
||||
auto offset = asValue(rewriter, loc, subTensorOp.getMixedOffsets()[dim]);
|
||||
auto length = asValue(rewriter, loc, subTensorOp.getMixedSizes()[dim]);
|
||||
auto offset = asValue(rewriter, loc, sliceOp.getMixedOffsets()[dim]);
|
||||
auto length = asValue(rewriter, loc, sliceOp.getMixedSizes()[dim]);
|
||||
auto srcSize = rewriter.createOrFold<memref::DimOp>(
|
||||
loc, padOp.source(), dim);
|
||||
|
||||
|
@ -805,19 +805,19 @@ LogicalResult SubTensorOfPadTensorSwapPattern::matchAndRewrite(
|
|||
//
|
||||
// The original read could also have started in the high padding zone.
|
||||
// In that case, set the offset to the end of source tensor. The new
|
||||
// SubTensorOp length will be zero in that case. (Effectively reading no
|
||||
// ExtractSliceOp length will be zero in that case. (Effectively reading no
|
||||
// data from the source.)
|
||||
Value newOffset = min(max(sub(offset, low), zero), srcSize);
|
||||
newOffsets.push_back(asOpFoldResult(rewriter, newOffset));
|
||||
|
||||
// The original SubTensorOp was reading until position `offset + length`.
|
||||
// The original ExtractSliceOp was reading until position `offset + length`.
|
||||
// Therefore, the corresponding position within the source tensor is:
|
||||
//
|
||||
// offset + length - low
|
||||
//
|
||||
// In case the original SubTensorOp stopped reading within the low padding
|
||||
// zone, this value can be negative. In that case, the end position of the
|
||||
// read should be zero. (Similar to newOffset.)
|
||||
// In case the original ExtractSliceOp stopped reading within the low
|
||||
// padding zone, this value can be negative. In that case, the end position
|
||||
// of the read should be zero. (Similar to newOffset.)
|
||||
//
|
||||
// The original read could also have stopped in the high padding zone.
|
||||
// In that case, the end position of the read should be the end of the
|
||||
|
@ -825,7 +825,7 @@ LogicalResult SubTensorOfPadTensorSwapPattern::matchAndRewrite(
|
|||
//
|
||||
// endLoc = min(max(offset - low + length, 0), srcSize)
|
||||
//
|
||||
// The new SubTensorOp length is `endLoc - newOffset`.
|
||||
// The new ExtractSliceOp length is `endLoc - newOffset`.
|
||||
Value endLoc = min(max(add(sub(offset, low), length), zero), srcSize);
|
||||
Value newLength = sub(endLoc, newOffset);
|
||||
newLengths.push_back(asOpFoldResult(rewriter, newLength));
|
||||
|
@ -842,7 +842,7 @@ LogicalResult SubTensorOfPadTensorSwapPattern::matchAndRewrite(
|
|||
}
|
||||
|
||||
// The amount of high padding is simply the number of elements remaining,
|
||||
// so that the result has the same length as the original SubTensorOp.
|
||||
// so that the result has the same length as the original ExtractSliceOp.
|
||||
Value newHigh = sub(sub(length, newLength), newLow);
|
||||
appendIndex(newHigh, newHighs, staticNewHighs);
|
||||
|
||||
|
@ -852,22 +852,20 @@ LogicalResult SubTensorOfPadTensorSwapPattern::matchAndRewrite(
|
|||
|
||||
// Insert cast to ensure that types match. (May be folded away.)
|
||||
auto castResult = [&](Value val) -> Value {
|
||||
auto castOp = rewriter.create<tensor::CastOp>(
|
||||
loc, subTensorOp.getType(), val);
|
||||
auto castOp = rewriter.create<tensor::CastOp>(loc, sliceOp.getType(), val);
|
||||
return castOp;
|
||||
};
|
||||
|
||||
// In cases where the original data source is unused: Emit a GenerateOp and
|
||||
// do not generate a SubTensorOp. (The result shape of the SubTensorOp would
|
||||
// do not generate a SliceOp. (The result shape of the SliceOp would
|
||||
// have a dimension of size 0, the semantics of which is unclear.)
|
||||
auto createGenerateOp = [&]() {
|
||||
// The shape of the GenerateOp is the same as the existing SubTensorOp.
|
||||
RankedTensorType type = subTensorOp.getType();
|
||||
// The shape of the GenerateOp is the same as the existing SliceOp.
|
||||
RankedTensorType type = sliceOp.getType();
|
||||
SmallVector<Value> dynDims;
|
||||
for (unsigned i = 0; i < type.getRank(); ++i) {
|
||||
if (type.isDynamicDim(i))
|
||||
dynDims.push_back(
|
||||
asValue(rewriter, loc, subTensorOp.getMixedOffsets()[i]));
|
||||
dynDims.push_back(asValue(rewriter, loc, sliceOp.getMixedOffsets()[i]));
|
||||
}
|
||||
|
||||
// Create GenerateOp.
|
||||
|
@ -891,14 +889,14 @@ LogicalResult SubTensorOfPadTensorSwapPattern::matchAndRewrite(
|
|||
return castResult(generateOp);
|
||||
};
|
||||
|
||||
// Emit a SubTensorOp and a PadTensorOp. Should not be used in cases where
|
||||
// the result shape of the new SubTensorOp has a zero dimension.
|
||||
// Emit a SliceOp and a PadTensorOp. Should not be used in cases where
|
||||
// the result shape of the new SliceOp has a zero dimension.
|
||||
auto createPadTensorOfSubTensor = [&]() {
|
||||
// Create pad_tensor(subtensor(x)).
|
||||
auto newSubTensorOp = rewriter.create<SubTensorOp>(
|
||||
auto newSliceOp = rewriter.create<tensor::ExtractSliceOp>(
|
||||
loc, padOp.source(), newOffsets, newLengths, newStrides);
|
||||
auto newPadTensorOp = rewriter.create<PadTensorOp>(
|
||||
loc, newSubTensorOp, staticNewLows, staticNewHighs, newLows, newHighs);
|
||||
loc, newSliceOp, staticNewLows, staticNewHighs, newLows, newHighs);
|
||||
|
||||
// Copy region to new PadTensorOp.
|
||||
BlockAndValueMapping bvm;
|
||||
|
@ -911,27 +909,29 @@ LogicalResult SubTensorOfPadTensorSwapPattern::matchAndRewrite(
|
|||
// Rewrite subtensor(pad_tensor(x)) into a GenerateOp if it is statically
|
||||
// known that the original data source x is not used.
|
||||
if (hasZeroLen) {
|
||||
rewriter.replaceOp(subTensorOp, createGenerateOp());
|
||||
rewriter.replaceOp(sliceOp, createGenerateOp());
|
||||
return success();
|
||||
}
|
||||
|
||||
// If there are dynamic dimensions: Generate an scf.if check to avoid creating
|
||||
// SubTensorOps with result dimensions of size 0 at runtime.
|
||||
// SliceOps with result dimensions of size 0 at runtime.
|
||||
if (dynHasZeroLenCond) {
|
||||
auto result = rewriter.create<scf::IfOp>(
|
||||
loc, subTensorOp.getType(), dynHasZeroLenCond,
|
||||
/*thenBuilder=*/[&](OpBuilder &b, Location loc) {
|
||||
loc, sliceOp.getType(), dynHasZeroLenCond,
|
||||
/*thenBuilder=*/
|
||||
[&](OpBuilder &b, Location loc) {
|
||||
b.create<scf::YieldOp>(loc, createGenerateOp());
|
||||
},
|
||||
/*elseBuilder=*/[&](OpBuilder &b, Location loc) {
|
||||
/*elseBuilder=*/
|
||||
[&](OpBuilder &b, Location loc) {
|
||||
b.create<scf::YieldOp>(loc, createPadTensorOfSubTensor());
|
||||
});
|
||||
rewriter.replaceOp(subTensorOp, result.getResult(0));
|
||||
rewriter.replaceOp(sliceOp, result.getResult(0));
|
||||
return success();
|
||||
}
|
||||
|
||||
// All shapes are static and the data source is actually used. Rewrite into
|
||||
// pad_tensor(subtensor(x)).
|
||||
rewriter.replaceOp(subTensorOp, createPadTensorOfSubTensor());
|
||||
rewriter.replaceOp(sliceOp, createPadTensorOfSubTensor());
|
||||
return success();
|
||||
}
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
|
||||
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
|
||||
#include "mlir/Dialect/Linalg/Utils/Utils.h"
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/Dialect/Utils/StructuredOpsUtils.h"
|
||||
#include "mlir/Dialect/Vector/VectorOps.h"
|
||||
#include "mlir/IR/AffineExpr.h"
|
||||
|
@ -677,9 +678,9 @@ static SmallVector<Value> ofrToIndexValues(OpBuilder &builder, Location loc,
|
|||
}
|
||||
|
||||
/// Rewrite a PadTensorOp into a sequence of InitTensorOp, FillOp and
|
||||
/// SubTensorInsertOp. For now, only constant padding values are supported.
|
||||
/// InsertSliceOp. For now, only constant padding values are supported.
|
||||
/// If there is enough static type information, TransferReadOps and
|
||||
/// TransferWriteOps may be generated instead of SubTensorInsertOps.
|
||||
/// TransferWriteOps may be generated instead of InsertSliceOps.
|
||||
struct GenericPadTensorOpVectorizationPattern
|
||||
: public OpRewritePattern<PadTensorOp> {
|
||||
using OpRewritePattern<PadTensorOp>::OpRewritePattern;
|
||||
|
@ -723,7 +724,7 @@ struct GenericPadTensorOpVectorizationPattern
|
|||
return success();
|
||||
|
||||
// Neither source type nor PadTensorOp result type have static shape. Such
|
||||
// PadTensorOps cannot be vectorized. Generate a SubTensorInsertOp instead
|
||||
// PadTensorOps cannot be vectorized. Generate an InsertSliceOp instead
|
||||
// for copying the PadOp source.
|
||||
|
||||
auto sourceType = padOp.getSourceType();
|
||||
|
@ -737,10 +738,10 @@ struct GenericPadTensorOpVectorizationPattern
|
|||
srcSizes.push_back(rewriter.getIndexAttr(sourceType.getDimSize(dim)));
|
||||
}
|
||||
}
|
||||
// Strides of SubTensorInsertOp are all 1.
|
||||
// Strides of InsertSliceOp are all 1.
|
||||
SmallVector<OpFoldResult> strides(sourceType.getRank(),
|
||||
rewriter.getIndexAttr(1));
|
||||
rewriter.replaceOpWithNewOp<SubTensorInsertOp>(
|
||||
rewriter.replaceOpWithNewOp<tensor::InsertSliceOp>(
|
||||
padOp, padOp.source(), fill, padOp.getMixedLowPad(), srcSizes, strides);
|
||||
|
||||
return success();
|
||||
|
@ -913,27 +914,29 @@ struct PadTensorOpVectorizationWithTransferReadPattern
|
|||
/// write. In such cases, the TransferWriteOp can write to the non-padded tensor
|
||||
/// value and apply out-of-bounds masking. E.g.:
|
||||
/// ```
|
||||
/// %0 = subtensor ...[...] [%s0, %s1] [1, 1] : tensor<...> to tensor<?x?xf32>
|
||||
/// %0 = tensor.extract_slice ...[...] [%s0, %s1] [1, 1]
|
||||
/// : tensor<...> to tensor<?x?xf32>
|
||||
/// %1 = linalg.pad_tensor %0 ... : tensor<?x?xf32> to tensor<17x5xf32>
|
||||
/// %2 = vector.transfer_write %vec, %1[...]
|
||||
/// : vector<17x5xf32>, tensor<17x5xf32>
|
||||
/// %r = subtensor %2[0, 0] [%s0, %s1] [1, 1]
|
||||
/// %r = tensor.extract_slice %2[0, 0] [%s0, %s1] [1, 1]
|
||||
/// : tensor<17x5xf32> to tensor<?x?xf32>
|
||||
/// ```
|
||||
/// is rewritten to:
|
||||
/// ```
|
||||
/// %0 = subtensor ...[...] [%s0, %s1] [1, 1] : tensor<...> to tensor<?x?xf32>
|
||||
/// %0 = tensor.extract_slice ...[...] [%s0, %s1] [1, 1]
|
||||
/// : tensor<...> to tensor<?x?xf32>
|
||||
/// %r = vector.transfer_write %vec, %0[...] : vector<17x5xf32>, tensor<?x?xf32>
|
||||
/// ```
|
||||
/// Note: It is important that the SubTensorOp %r resizes the result of the
|
||||
/// Note: It is important that the ExtractSliceOp %r resizes the result of the
|
||||
/// TransferWriteOp to the same size as the input of the PadTensorOp (or an even
|
||||
/// smaller size). Otherwise, %r's new (dynamic) dimensions would differ from
|
||||
/// %r's old dimensions.
|
||||
///
|
||||
/// This rewrite is possible if:
|
||||
/// - Low padding is static 0.
|
||||
/// - `xferOp` has exactly one use, which is a SubTensorOp. This SubTensorOp
|
||||
/// trims the same amount of padding that was added beforehand.
|
||||
/// - `xferOp` has exactly one use, which is an ExtractSliceOp. This
|
||||
/// ExtractSliceOp trims the same amount of padding that was added beforehand.
|
||||
/// - Single, scalar padding value.
|
||||
struct PadTensorOpVectorizationWithTransferWritePattern
|
||||
: public VectorizePadTensorOpUserPattern<vector::TransferWriteOp> {
|
||||
|
@ -947,9 +950,9 @@ struct PadTensorOpVectorizationWithTransferWritePattern
|
|||
// Pad value must be a constant.
|
||||
auto padValue = padOp.getConstantPaddingValue();
|
||||
if (!padValue) return failure();
|
||||
// TransferWriteOp result must be directly consumed by a SubTensorOp.
|
||||
// TransferWriteOp result must be directly consumed by an ExtractSliceOp.
|
||||
if (!xferOp->hasOneUse()) return failure();
|
||||
auto trimPadding = dyn_cast<SubTensorOp>(*xferOp->user_begin());
|
||||
auto trimPadding = dyn_cast<tensor::ExtractSliceOp>(*xferOp->user_begin());
|
||||
if (!trimPadding) return failure();
|
||||
// Only static zero offsets supported when trimming padding.
|
||||
if (!trimPadding.hasZeroOffset()) return failure();
|
||||
|
@ -976,7 +979,8 @@ struct PadTensorOpVectorizationWithTransferWritePattern
|
|||
/// This is a conservative analysis. In case equal tensor sizes cannot be
|
||||
/// proven statically, this analysis returns `false` even though the tensor
|
||||
/// sizes may turn out to be equal at runtime.
|
||||
bool hasSameTensorSize(Value beforePadding, SubTensorOp afterTrimming) const {
|
||||
bool hasSameTensorSize(Value beforePadding,
|
||||
tensor::ExtractSliceOp afterTrimming) const {
|
||||
// If the input to PadTensorOp is a CastOp, try with both CastOp result
|
||||
// and CastOp operand.
|
||||
if (auto castOp = beforePadding.getDefiningOp<tensor::CastOp>())
|
||||
|
@ -1002,21 +1006,22 @@ struct PadTensorOpVectorizationWithTransferWritePattern
|
|||
if (t1.getNumDynamicDims() == 0) return true;
|
||||
|
||||
// All dynamic sizes must be the same. The only supported case at the moment
|
||||
// is when `beforePadding` is a SubTensorOp (or a cast thereof).
|
||||
// is when `beforePadding` is an ExtractSliceOp (or a cast thereof).
|
||||
|
||||
// Apart from CastOp, only SubTensorOp is supported.
|
||||
auto beforeSubtensor = beforePadding.getDefiningOp<SubTensorOp>();
|
||||
if (!beforeSubtensor) return false;
|
||||
// Apart from CastOp, only ExtractSliceOp is supported.
|
||||
auto beforeSlice = beforePadding.getDefiningOp<tensor::ExtractSliceOp>();
|
||||
if (!beforeSlice)
|
||||
return false;
|
||||
|
||||
assert(static_cast<size_t>(t1.getRank())
|
||||
== beforeSubtensor.getMixedSizes().size());
|
||||
assert(static_cast<size_t>(t1.getRank()) ==
|
||||
beforeSlice.getMixedSizes().size());
|
||||
assert(static_cast<size_t>(t2.getRank())
|
||||
== afterTrimming.getMixedSizes().size());
|
||||
|
||||
for (unsigned i = 0; i < t1.getRank(); ++i) {
|
||||
// Skip static dimensions.
|
||||
if (!t1.isDynamicDim(i)) continue;
|
||||
auto size1 = beforeSubtensor.getMixedSizes()[i];
|
||||
auto size1 = beforeSlice.getMixedSizes()[i];
|
||||
auto size2 = afterTrimming.getMixedSizes()[i];
|
||||
|
||||
// Case 1: Same value or same constant int.
|
||||
|
@ -1042,10 +1047,11 @@ struct PadTensorOpVectorizationWithTransferWritePattern
|
|||
}
|
||||
};
|
||||
|
||||
/// Rewrite use of PadTensorOp result in SubtensorInsertOp. E.g.:
|
||||
/// Rewrite use of PadTensorOp result in InsertSliceOp. E.g.:
|
||||
/// ```
|
||||
/// %0 = linalg.pad_tensor %src ... : tensor<?x?xf32> to tensor<17x5xf32>
|
||||
/// %r = subtensor_insert %0 into %dest[%a, %b, 0, 0] [1, 1, 17, 5] [1, 1, 1, 1]
|
||||
/// %r = tensor.insert_slice %0
|
||||
/// into %dest[%a, %b, 0, 0] [1, 1, 17, 5] [1, 1, 1, 1]
|
||||
/// : tensor<17x5xf32> into tensor<?x?x17x5xf32>
|
||||
/// ```
|
||||
/// is rewritten to:
|
||||
|
@ -1063,13 +1069,13 @@ struct PadTensorOpVectorizationWithTransferWritePattern
|
|||
/// (Implies that sizes of `insertOp` are all static.)
|
||||
/// - Only unit strides in `insertOp`.
|
||||
/// - Single, scalar padding value.
|
||||
struct PadTensorOpVectorizationWithSubTensorInsertPattern
|
||||
: public VectorizePadTensorOpUserPattern<SubTensorInsertOp> {
|
||||
using VectorizePadTensorOpUserPattern<SubTensorInsertOp>
|
||||
::VectorizePadTensorOpUserPattern;
|
||||
struct PadTensorOpVectorizationWithInsertSlicePattern
|
||||
: public VectorizePadTensorOpUserPattern<tensor::InsertSliceOp> {
|
||||
using VectorizePadTensorOpUserPattern<
|
||||
tensor::InsertSliceOp>::VectorizePadTensorOpUserPattern;
|
||||
|
||||
LogicalResult rewriteUser(PatternRewriter &rewriter, PadTensorOp padOp,
|
||||
SubTensorInsertOp insertOp) const override {
|
||||
tensor::InsertSliceOp insertOp) const override {
|
||||
// Low padding must be static 0.
|
||||
if (!padOp.hasZeroLowPad()) return failure();
|
||||
// Only unit stride supported.
|
||||
|
@ -1103,8 +1109,8 @@ struct PadTensorOpVectorizationWithSubTensorInsertPattern
|
|||
auto read = rewriter.create<vector::TransferReadOp>(
|
||||
padOp.getLoc(), vecType, padOp.source(), readIndices, padValue);
|
||||
|
||||
// Generate TransferWriteOp: Write to SubTensorInsertOp's dest tensor at
|
||||
// specified offsets. Write is fully in-bounds because a SubTensorInsertOp's
|
||||
// Generate TransferWriteOp: Write to InsertSliceOp's dest tensor at
|
||||
// specified offsets. Write is fully in-bounds because an InsertSliceOp's
|
||||
// source must fit into the destination at the specified offsets.
|
||||
auto writeIndices =
|
||||
ofrToIndexValues(rewriter, padOp.getLoc(), insertOp.getMixedOffsets());
|
||||
|
@ -1123,7 +1129,7 @@ void mlir::linalg::populatePadTensorOpVectorizationPatterns(
|
|||
// Try these specialized patterns first before resorting to the generic one.
|
||||
patterns.add<PadTensorOpVectorizationWithTransferReadPattern,
|
||||
PadTensorOpVectorizationWithTransferWritePattern,
|
||||
PadTensorOpVectorizationWithSubTensorInsertPattern>(
|
||||
PadTensorOpVectorizationWithInsertSlicePattern>(
|
||||
patterns.getContext(), baseBenefit.getBenefit() + 1);
|
||||
}
|
||||
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include "mlir/Dialect/SCF/SCF.h"
|
||||
#include "mlir/Dialect/StandardOps/IR/Ops.h"
|
||||
#include "mlir/Dialect/StandardOps/Utils/Utils.h"
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/IR/AffineExpr.h"
|
||||
#include "mlir/IR/AffineExprVisitor.h"
|
||||
#include "mlir/IR/AffineMap.h"
|
||||
|
@ -556,7 +557,7 @@ SmallVector<Value, 4> makeTiledShapes(OpBuilder &b, Location loc,
|
|||
}
|
||||
LLVM_DEBUG(llvm::dbgs() << ": tiled: figure out subshape...\n");
|
||||
|
||||
// Construct a new subview / subtensor for the tile.
|
||||
// Construct a new subview / extract_slice for the tile.
|
||||
SmallVector<OpFoldResult, 4> offsets, sizes, strides;
|
||||
offsets.reserve(rank);
|
||||
sizes.reserve(rank);
|
||||
|
@ -585,7 +586,7 @@ SmallVector<Value, 4> makeTiledShapes(OpBuilder &b, Location loc,
|
|||
Value size = makeComposedAffineApply(b, loc, s0 + 1, closedIntSize);
|
||||
LLVM_DEBUG(llvm::dbgs() << "makeTiledShapes: raw size: " << size << "\n");
|
||||
|
||||
// The size of the subview / subtensor should be trimmed to avoid
|
||||
// The size of the subview / extract_slice should be trimmed to avoid
|
||||
// out-of-bounds accesses, unless we statically know the subshape size
|
||||
// divides the shape size evenly.
|
||||
int64_t shapeSize = shape[r];
|
||||
|
@ -619,8 +620,8 @@ SmallVector<Value, 4> makeTiledShapes(OpBuilder &b, Location loc,
|
|||
tiledShapes.push_back(
|
||||
b.create<memref::SubViewOp>(loc, shapedOp, offsets, sizes, strides));
|
||||
else
|
||||
tiledShapes.push_back(
|
||||
b.create<SubTensorOp>(loc, shapedOp, offsets, sizes, strides));
|
||||
tiledShapes.push_back(b.create<tensor::ExtractSliceOp>(
|
||||
loc, shapedOp, offsets, sizes, strides));
|
||||
}
|
||||
|
||||
return tiledShapes;
|
||||
|
|
|
@ -717,10 +717,10 @@ OpFoldResult DimOp::fold(ArrayRef<Attribute> operands) {
|
|||
// The size at the given index is now known to be a dynamic size.
|
||||
unsigned unsignedIndex = index.getValue().getZExtValue();
|
||||
|
||||
if (auto subtensor = dyn_cast_or_null<mlir::SubTensorOp>(definingOp)) {
|
||||
assert(subtensor.isDynamicSize(unsignedIndex) &&
|
||||
"Expected dynamic subtensor size");
|
||||
return subtensor.getDynamicSize(unsignedIndex);
|
||||
if (auto sliceOp = dyn_cast_or_null<tensor::ExtractSliceOp>(definingOp)) {
|
||||
assert(sliceOp.isDynamicSize(unsignedIndex) &&
|
||||
"Expected dynamic slice size");
|
||||
return sliceOp.getDynamicSize(unsignedIndex);
|
||||
}
|
||||
|
||||
// Fold dim to the size argument for an `AllocOp`, `ViewOp`, or `SubViewOp`.
|
||||
|
@ -1314,7 +1314,7 @@ void ReinterpretCastOp::build(OpBuilder &b, OperationState &result,
|
|||
}
|
||||
|
||||
// TODO: ponder whether we want to allow missing trailing sizes/strides that are
|
||||
// completed automatically, like we have for subview and subtensor.
|
||||
// completed automatically, like we have for subview and extract_slice.
|
||||
static LogicalResult verify(ReinterpretCastOp op) {
|
||||
// The source and result memrefs should be in the same memory space.
|
||||
auto srcType = op.source().getType().cast<BaseMemRefType>();
|
||||
|
|
|
@ -10,7 +10,6 @@
|
|||
|
||||
#include "mlir/Dialect/CommonFolders.h"
|
||||
#include "mlir/Dialect/StandardOps/Utils/Utils.h"
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/IR/AffineExpr.h"
|
||||
#include "mlir/IR/AffineMap.h"
|
||||
#include "mlir/IR/BlockAndValueMapping.h"
|
||||
|
@ -34,32 +33,6 @@
|
|||
|
||||
using namespace mlir;
|
||||
|
||||
/// Helper function to dispatch an OpFoldResult into either the `dynamicVec` if
|
||||
/// it is a Value or into `staticVec` if it is an IntegerAttr.
|
||||
/// In the case of a Value, a copy of the `sentinel` value is also pushed to
|
||||
/// `staticVec`. This is useful to extract mixed static and dynamic entries that
|
||||
/// come from an AttrSizedOperandSegments trait.
|
||||
static void dispatchIndexOpFoldResult(OpFoldResult ofr,
|
||||
SmallVectorImpl<Value> &dynamicVec,
|
||||
SmallVectorImpl<int64_t> &staticVec,
|
||||
int64_t sentinel) {
|
||||
if (auto v = ofr.dyn_cast<Value>()) {
|
||||
dynamicVec.push_back(v);
|
||||
staticVec.push_back(sentinel);
|
||||
return;
|
||||
}
|
||||
APInt apInt = ofr.dyn_cast<Attribute>().cast<IntegerAttr>().getValue();
|
||||
staticVec.push_back(apInt.getSExtValue());
|
||||
}
|
||||
|
||||
/// Applies `dispatchIndexOpFoldResult` to every entry of `ofrs`, in order,
/// accumulating into `dynamicVec` and `staticVec` with the given `sentinel`.
static void dispatchIndexOpFoldResults(ArrayRef<OpFoldResult> ofrs,
                                       SmallVectorImpl<Value> &dynamicVec,
                                       SmallVectorImpl<int64_t> &staticVec,
                                       int64_t sentinel) {
  for (unsigned idx = 0, numEntries = ofrs.size(); idx < numEntries; ++idx)
    dispatchIndexOpFoldResult(ofrs[idx], dynamicVec, staticVec, sentinel);
}
|
||||
|
||||
/// If ofr is a constant integer, i.e., an IntegerAttr or a ConstantOp with an
|
||||
/// IntegerAttr, return the integer.
|
||||
llvm::Optional<int64_t> mlir::getConstantIntValue(OpFoldResult ofr) {
|
||||
|
@ -227,7 +200,6 @@ static void printStandardCastOp(Operation *op, OpAsmPrinter &p) {
|
|||
}
|
||||
|
||||
void StandardOpsDialect::initialize() {
|
||||
getContext()->loadDialect<tensor::TensorDialect>();
|
||||
addOperations<
|
||||
#define GET_OP_LIST
|
||||
#include "mlir/Dialect/StandardOps/IR/Ops.cpp.inc"
|
||||
|
@ -286,14 +258,6 @@ OpFoldResult AddIOp::fold(ArrayRef<Attribute> operands) {
|
|||
[](APInt a, APInt b) { return a + b; });
|
||||
}
|
||||
|
||||
/// Extract int64_t values from the assumed ArrayAttr of IntegerAttr.
|
||||
static SmallVector<int64_t, 4> extractFromI64ArrayAttr(Attribute attr) {
|
||||
return llvm::to_vector<4>(
|
||||
llvm::map_range(attr.cast<ArrayAttr>(), [](Attribute a) -> int64_t {
|
||||
return a.cast<IntegerAttr>().getInt();
|
||||
}));
|
||||
}
|
||||
|
||||
/// Canonicalize a sum of a constant and (constant - something) to simply be
|
||||
/// a sum of constants minus something. This transformation does similar
|
||||
/// transformations for additions of a constant with a subtract/add of
|
||||
|
@ -2082,499 +2046,6 @@ bool UIToFPOp::areCastCompatible(TypeRange inputs, TypeRange outputs) {
|
|||
return areVectorCastSimpleCompatible(a, b, areCastCompatible);
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SubTensorOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
/// A subtensor result type can be fully inferred from the source type and the
|
||||
/// static representation of offsets, sizes and strides. Special sentinels
|
||||
/// encode the dynamic case.
|
||||
Type SubTensorOp::inferResultType(RankedTensorType sourceRankedTensorType,
|
||||
ArrayRef<int64_t> leadingStaticOffsets,
|
||||
ArrayRef<int64_t> leadingStaticSizes,
|
||||
ArrayRef<int64_t> leadingStaticStrides) {
|
||||
// A subtensor may specify only a leading subset of offset/sizes/strides in
|
||||
// which case we complete with offset=0, sizes from memref type and strides=1.
|
||||
unsigned rank = sourceRankedTensorType.getRank();
|
||||
assert(leadingStaticSizes.size() <= rank &&
|
||||
"unexpected leadingStaticSizes overflow");
|
||||
auto staticSizes = llvm::to_vector<4>(leadingStaticSizes);
|
||||
unsigned numTrailingSizes = rank - staticSizes.size();
|
||||
llvm::append_range(staticSizes, sourceRankedTensorType.getShape().take_back(
|
||||
numTrailingSizes));
|
||||
return RankedTensorType::get(staticSizes,
|
||||
sourceRankedTensorType.getElementType());
|
||||
}
|
||||
|
||||
/// OpFoldResult flavor of `inferResultType`: splits each mixed list into its
/// static part (dynamic entries replaced by the matching ShapedType sentinel)
/// and forwards the static parts to the int64_t overload. The dynamic values
/// collected along the way are not used.
Type SubTensorOp::inferResultType(RankedTensorType sourceRankedTensorType,
                                  ArrayRef<OpFoldResult> leadingStaticOffsets,
                                  ArrayRef<OpFoldResult> leadingStaticSizes,
                                  ArrayRef<OpFoldResult> leadingStaticStrides) {
  SmallVector<Value> offsetValues, sizeValues, strideValues;
  SmallVector<int64_t> offsetConstants, sizeConstants, strideConstants;

  dispatchIndexOpFoldResults(leadingStaticOffsets, offsetValues,
                             offsetConstants,
                             ShapedType::kDynamicStrideOrOffset);
  dispatchIndexOpFoldResults(leadingStaticSizes, sizeValues, sizeConstants,
                             ShapedType::kDynamicSize);
  dispatchIndexOpFoldResults(leadingStaticStrides, strideValues,
                             strideConstants,
                             ShapedType::kDynamicStrideOrOffset);

  return SubTensorOp::inferResultType(sourceRankedTensorType, offsetConstants,
                                      sizeConstants, strideConstants);
}
|
||||
|
||||
/// A subtensor result type can be fully inferred from the source type and the
|
||||
/// static representation of offsets, sizes and strides. Special sentinels
|
||||
/// encode the dynamic case.
|
||||
Type SubTensorOp::inferRankReducedResultType(
|
||||
unsigned resultRank, RankedTensorType sourceRankedTensorType,
|
||||
ArrayRef<int64_t> leadingStaticOffsets,
|
||||
ArrayRef<int64_t> leadingStaticSizes,
|
||||
ArrayRef<int64_t> leadingStaticStrides) {
|
||||
auto inferredType =
|
||||
inferResultType(sourceRankedTensorType, leadingStaticOffsets,
|
||||
leadingStaticSizes, leadingStaticStrides)
|
||||
.cast<RankedTensorType>();
|
||||
int rankDiff = inferredType.getRank() - resultRank;
|
||||
if (rankDiff > 0) {
|
||||
auto shape = inferredType.getShape();
|
||||
llvm::SmallDenseSet<unsigned> dimsToProject;
|
||||
mlir::getPositionsOfShapeOne(rankDiff, shape, dimsToProject);
|
||||
SmallVector<int64_t> projectedShape;
|
||||
for (unsigned pos = 0, e = shape.size(); pos < e; ++pos)
|
||||
if (!dimsToProject.contains(pos))
|
||||
projectedShape.push_back(shape[pos]);
|
||||
inferredType =
|
||||
RankedTensorType::get(projectedShape, inferredType.getElementType());
|
||||
}
|
||||
return inferredType;
|
||||
}
|
||||
|
||||
/// OpFoldResult flavor of `inferRankReducedResultType`: extracts the static
/// part of each mixed list (dynamic entries become ShapedType sentinels) and
/// forwards to the int64_t overload. The collected dynamic values are unused.
Type SubTensorOp::inferRankReducedResultType(
    unsigned resultRank, RankedTensorType sourceRankedTensorType,
    ArrayRef<OpFoldResult> leadingStaticOffsets,
    ArrayRef<OpFoldResult> leadingStaticSizes,
    ArrayRef<OpFoldResult> leadingStaticStrides) {
  SmallVector<Value> offsetValues, sizeValues, strideValues;
  SmallVector<int64_t> offsetConstants, sizeConstants, strideConstants;

  dispatchIndexOpFoldResults(leadingStaticOffsets, offsetValues,
                             offsetConstants,
                             ShapedType::kDynamicStrideOrOffset);
  dispatchIndexOpFoldResults(leadingStaticSizes, sizeValues, sizeConstants,
                             ShapedType::kDynamicSize);
  dispatchIndexOpFoldResults(leadingStaticStrides, strideValues,
                             strideConstants,
                             ShapedType::kDynamicStrideOrOffset);

  return SubTensorOp::inferRankReducedResultType(
      resultRank, sourceRankedTensorType, offsetConstants, sizeConstants,
      strideConstants);
}
|
||||
|
||||
// Build a SubTensorOp with mixed static and dynamic entries and custom result
|
||||
// type. If the type passed is nullptr, it is inferred.
|
||||
void mlir::SubTensorOp::build(OpBuilder &b, OperationState &result,
|
||||
RankedTensorType resultType, Value source,
|
||||
ArrayRef<OpFoldResult> offsets,
|
||||
ArrayRef<OpFoldResult> sizes,
|
||||
ArrayRef<OpFoldResult> strides,
|
||||
ArrayRef<NamedAttribute> attrs) {
|
||||
SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
|
||||
SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
|
||||
dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets,
|
||||
ShapedType::kDynamicStrideOrOffset);
|
||||
dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes,
|
||||
ShapedType::kDynamicSize);
|
||||
dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides,
|
||||
ShapedType::kDynamicStrideOrOffset);
|
||||
auto sourceRankedTensorType = source.getType().cast<RankedTensorType>();
|
||||
// Structuring implementation this way avoids duplication between builders.
|
||||
if (!resultType) {
|
||||
resultType =
|
||||
SubTensorOp::inferResultType(sourceRankedTensorType, staticOffsets,
|
||||
staticSizes, staticStrides)
|
||||
.cast<RankedTensorType>();
|
||||
}
|
||||
build(b, result, resultType, source, dynamicOffsets, dynamicSizes,
|
||||
dynamicStrides, b.getI64ArrayAttr(staticOffsets),
|
||||
b.getI64ArrayAttr(staticSizes), b.getI64ArrayAttr(staticStrides));
|
||||
result.addAttributes(attrs);
|
||||
}
|
||||
|
||||
// Build a SubTensorOp with mixed static and dynamic entries and inferred result
|
||||
// type.
|
||||
void mlir::SubTensorOp::build(OpBuilder &b, OperationState &result,
|
||||
Value source, ArrayRef<OpFoldResult> offsets,
|
||||
ArrayRef<OpFoldResult> sizes,
|
||||
ArrayRef<OpFoldResult> strides,
|
||||
ArrayRef<NamedAttribute> attrs) {
|
||||
build(b, result, RankedTensorType(), source, offsets, sizes, strides, attrs);
|
||||
}
|
||||
|
||||
// Build a SubTensorOp with dynamic entries and custom result type. If the type
|
||||
// passed is nullptr, it is inferred.
|
||||
void mlir::SubTensorOp::build(OpBuilder &b, OperationState &result,
|
||||
RankedTensorType resultType, Value source,
|
||||
ValueRange offsets, ValueRange sizes,
|
||||
ValueRange strides,
|
||||
ArrayRef<NamedAttribute> attrs) {
|
||||
SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(
|
||||
llvm::map_range(offsets, [](Value v) -> OpFoldResult { return v; }));
|
||||
SmallVector<OpFoldResult> sizeValues = llvm::to_vector<4>(
|
||||
llvm::map_range(sizes, [](Value v) -> OpFoldResult { return v; }));
|
||||
SmallVector<OpFoldResult> strideValues = llvm::to_vector<4>(
|
||||
llvm::map_range(strides, [](Value v) -> OpFoldResult { return v; }));
|
||||
build(b, result, resultType, source, offsetValues, sizeValues, strideValues);
|
||||
}
|
||||
|
||||
// Build a SubTensorOp with dynamic entries and inferred result type.
|
||||
void mlir::SubTensorOp::build(OpBuilder &b, OperationState &result,
|
||||
Value source, ValueRange offsets,
|
||||
ValueRange sizes, ValueRange strides,
|
||||
ArrayRef<NamedAttribute> attrs) {
|
||||
build(b, result, RankedTensorType(), source, offsets, sizes, strides, attrs);
|
||||
}
|
||||
|
||||
/// Outcome of checking a subtensor result type against the inferred type.
enum SubTensorVerificationResult {
  /// The candidate type is accepted.
  Success = 0,
  /// The candidate type has a larger rank than the original type.
  RankTooLarge = 1,
  /// No rank-reduction mask matches the candidate shape to the original.
  SizeMismatch = 2,
  /// The element types of the two types differ.
  ElemTypeMismatch = 3,
};
|
||||
|
||||
/// Checks if `original` Type type can be rank reduced to `reduced` type.
|
||||
/// This function is slight variant of `is subsequence` algorithm where
|
||||
/// not matching dimension must be 1.
|
||||
static SubTensorVerificationResult
|
||||
isRankReducedType(Type originalType, Type candidateReducedType,
|
||||
std::string *errMsg = nullptr) {
|
||||
if (originalType == candidateReducedType)
|
||||
return SubTensorVerificationResult::Success;
|
||||
if (!originalType.isa<RankedTensorType>())
|
||||
return SubTensorVerificationResult::Success;
|
||||
if (originalType.isa<RankedTensorType>() &&
|
||||
!candidateReducedType.isa<RankedTensorType>())
|
||||
return SubTensorVerificationResult::Success;
|
||||
|
||||
ShapedType originalShapedType = originalType.cast<ShapedType>();
|
||||
ShapedType candidateReducedShapedType =
|
||||
candidateReducedType.cast<ShapedType>();
|
||||
|
||||
// Rank and size logic is valid for all ShapedTypes.
|
||||
ArrayRef<int64_t> originalShape = originalShapedType.getShape();
|
||||
ArrayRef<int64_t> candidateReducedShape =
|
||||
candidateReducedShapedType.getShape();
|
||||
unsigned originalRank = originalShape.size(),
|
||||
candidateReducedRank = candidateReducedShape.size();
|
||||
if (candidateReducedRank > originalRank)
|
||||
return SubTensorVerificationResult::RankTooLarge;
|
||||
|
||||
auto optionalUnusedDimsMask =
|
||||
computeRankReductionMask(originalShape, candidateReducedShape);
|
||||
|
||||
// Sizes cannot be matched in case empty vector is returned.
|
||||
if (!optionalUnusedDimsMask.hasValue())
|
||||
return SubTensorVerificationResult::SizeMismatch;
|
||||
|
||||
if (originalShapedType.getElementType() !=
|
||||
candidateReducedShapedType.getElementType())
|
||||
return SubTensorVerificationResult::ElemTypeMismatch;
|
||||
|
||||
// We are done for the tensor case.
|
||||
if (originalType.isa<RankedTensorType>())
|
||||
return SubTensorVerificationResult::Success;
|
||||
|
||||
return SubTensorVerificationResult::Success;
|
||||
}
|
||||
|
||||
template <typename OpTy>
|
||||
static LogicalResult
|
||||
produceSubTensorErrorMsg(SubTensorVerificationResult result, OpTy op,
|
||||
Type expectedType, StringRef errMsg = "") {
|
||||
auto memrefType = expectedType.cast<ShapedType>();
|
||||
switch (result) {
|
||||
case SubTensorVerificationResult::Success:
|
||||
return success();
|
||||
case SubTensorVerificationResult::RankTooLarge:
|
||||
return op.emitError("expected result rank to be smaller or equal to ")
|
||||
<< "the source rank. " << errMsg;
|
||||
case SubTensorVerificationResult::SizeMismatch:
|
||||
return op.emitError("expected result type to be ")
|
||||
<< expectedType
|
||||
<< " or a rank-reduced version. (mismatch of result sizes) "
|
||||
<< errMsg;
|
||||
case SubTensorVerificationResult::ElemTypeMismatch:
|
||||
return op.emitError("expected result element type to be ")
|
||||
<< memrefType.getElementType() << errMsg;
|
||||
}
|
||||
llvm_unreachable("unexpected subtensor verification result");
|
||||
}
|
||||
/// Verifier for SubTensorOp.
|
||||
static LogicalResult verify(SubTensorOp op) {
|
||||
// Verify result type against inferred type.
|
||||
auto expectedType = SubTensorOp::inferResultType(
|
||||
op.getSourceType(), extractFromI64ArrayAttr(op.static_offsets()),
|
||||
extractFromI64ArrayAttr(op.static_sizes()),
|
||||
extractFromI64ArrayAttr(op.static_strides()));
|
||||
auto result = isRankReducedType(expectedType, op.getType());
|
||||
return produceSubTensorErrorMsg(result, op, expectedType);
|
||||
}
|
||||
|
||||
/// Infer the canonical type of the result of a subtensor operation. Returns a
|
||||
/// type with rank `resultRank` that is either the rank of the rank-reduced
|
||||
/// type, or the non-rank-reduced type.
|
||||
static RankedTensorType getCanonicalSubTensorResultType(
|
||||
unsigned resultRank, RankedTensorType sourceType,
|
||||
ArrayRef<OpFoldResult> mixedOffsets, ArrayRef<OpFoldResult> mixedSizes,
|
||||
ArrayRef<OpFoldResult> mixedStrides) {
|
||||
auto resultType =
|
||||
SubTensorOp::inferRankReducedResultType(
|
||||
resultRank, sourceType, mixedOffsets, mixedSizes, mixedStrides)
|
||||
.cast<RankedTensorType>();
|
||||
if (resultType.getRank() != resultRank) {
|
||||
resultType = SubTensorOp::inferResultType(sourceType, mixedOffsets,
|
||||
mixedSizes, mixedStrides)
|
||||
.cast<RankedTensorType>();
|
||||
}
|
||||
return resultType;
|
||||
}
|
||||
|
||||
namespace {
|
||||
/// Pattern to rewrite a subtensor op with tensor::Cast arguments.
|
||||
/// This essentially pushes memref_cast past its consuming subtensor when
|
||||
/// `canFoldIntoConsumerOp` is true.
|
||||
///
|
||||
/// Example:
|
||||
/// ```
|
||||
/// %0 = tensorcast %V : tensor<16x16xf32> to tensor<?x?xf32>
|
||||
/// %1 = subtensor %0[0, 0][3, 4][1, 1] : tensor<?x?xf32> to tensor<3x4xf32>
|
||||
/// ```
|
||||
/// is rewritten into:
|
||||
/// ```
|
||||
/// %0 = subtensor %V[0, 0][3, 4][1, 1] : tensor<16x16xf32> to tensor<3x4xf32>
|
||||
/// %1 = tensor.cast %0: tensor<3x4xf32> to tensor<3x4xf32>
|
||||
/// ```
|
||||
class SubTensorOpCastFolder final : public OpRewritePattern<SubTensorOp> {
|
||||
public:
|
||||
using OpRewritePattern<SubTensorOp>::OpRewritePattern;
|
||||
|
||||
LogicalResult matchAndRewrite(SubTensorOp subTensorOp,
|
||||
PatternRewriter &rewriter) const override {
|
||||
// Any constant operand, just return to let SubViewOpConstantFolder kick in.
|
||||
if (llvm::any_of(subTensorOp.getOperands(), [](Value operand) {
|
||||
return matchPattern(operand, matchConstantIndex());
|
||||
}))
|
||||
return failure();
|
||||
|
||||
auto castOp = subTensorOp.source().getDefiningOp<tensor::CastOp>();
|
||||
if (!castOp)
|
||||
return failure();
|
||||
|
||||
if (!canFoldIntoConsumerOp(castOp))
|
||||
return failure();
|
||||
|
||||
/// Deduce the type of the result to use for the canonicalized operation.
|
||||
RankedTensorType resultType = getCanonicalSubTensorResultType(
|
||||
subTensorOp.getType().getRank(), subTensorOp.getSourceType(),
|
||||
subTensorOp.getMixedOffsets(), subTensorOp.getMixedSizes(),
|
||||
subTensorOp.getMixedStrides());
|
||||
Value newSubTensor = rewriter.create<SubTensorOp>(
|
||||
subTensorOp.getLoc(), resultType, castOp.source(),
|
||||
subTensorOp.offsets(), subTensorOp.sizes(), subTensorOp.strides(),
|
||||
subTensorOp.static_offsets(), subTensorOp.static_sizes(),
|
||||
subTensorOp.static_strides());
|
||||
rewriter.replaceOpWithNewOp<tensor::CastOp>(
|
||||
subTensorOp, subTensorOp.getType(), newSubTensor);
|
||||
return success();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
/// Return the canonical type of the result of a subtensor.
|
||||
struct SubTensorReturnTypeCanonicalizer {
|
||||
RankedTensorType operator()(SubTensorOp op,
|
||||
ArrayRef<OpFoldResult> mixedOffsets,
|
||||
ArrayRef<OpFoldResult> mixedSizes,
|
||||
ArrayRef<OpFoldResult> mixedStrides) {
|
||||
return getCanonicalSubTensorResultType(op.getType().getRank(),
|
||||
op.getSourceType(), mixedOffsets,
|
||||
mixedSizes, mixedStrides);
|
||||
}
|
||||
};
|
||||
|
||||
/// A canonicalizer wrapper to replace SubTensorOps.
|
||||
struct SubTensorCanonicalizer {
|
||||
void operator()(PatternRewriter &rewriter, SubTensorOp op,
|
||||
SubTensorOp newOp) {
|
||||
Value replacement = newOp.getResult();
|
||||
if (replacement.getType() != op.getType())
|
||||
replacement = rewriter.create<tensor::CastOp>(op.getLoc(), op.getType(),
|
||||
replacement);
|
||||
rewriter.replaceOp(op, replacement);
|
||||
}
|
||||
};
|
||||
|
||||
/// Registers the SubTensorOp canonicalization patterns: the generic
/// constant-argument folder (instantiated with the subtensor type and
/// replacement functors) followed by the tensor.cast folder.
void SubTensorOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                              MLIRContext *context) {
  using ConstantArgumentFolder =
      OpWithOffsetSizesAndStridesConstantArgumentFolder<
          SubTensorOp, SubTensorReturnTypeCanonicalizer,
          SubTensorCanonicalizer>;
  results.add<ConstantArgumentFolder, SubTensorOpCastFolder>(context);
}
|
||||
|
||||
//
|
||||
static LogicalResult
|
||||
foldIdentityOffsetSizeAndStrideOpInterface(OffsetSizeAndStrideOpInterface op,
|
||||
ShapedType shapedType) {
|
||||
OpBuilder b(op.getContext());
|
||||
for (OpFoldResult ofr : op.getMixedOffsets())
|
||||
if (!isEqualConstantIntOrValue(ofr, b.getIndexAttr(0)))
|
||||
return failure();
|
||||
// Rank-reducing noops only need to inspect the leading dimensions: llvm::zip
|
||||
// is appropriate.
|
||||
auto shape = shapedType.getShape();
|
||||
for (auto it : llvm::zip(op.getMixedSizes(), shape))
|
||||
if (!isEqualConstantIntOrValue(std::get<0>(it),
|
||||
b.getIndexAttr(std::get<1>(it))))
|
||||
return failure();
|
||||
for (OpFoldResult ofr : op.getMixedStrides())
|
||||
if (!isEqualConstantIntOrValue(ofr, b.getIndexAttr(1)))
|
||||
return failure();
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Folds a subtensor op to its source when the source and result types are
/// equal and the op is an identity slice (zero offsets, full sizes, unit
/// strides). Returns a null OpFoldResult otherwise.
OpFoldResult SubTensorOp::fold(ArrayRef<Attribute>) {
  if (getSourceType() != getType())
    return OpFoldResult();
  if (failed(foldIdentityOffsetSizeAndStrideOpInterface(*this, getType())))
    return OpFoldResult();
  return this->source();
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SubTensorInsertOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Build a SubTensorInsertOp with mixed static and dynamic entries.
|
||||
void mlir::SubTensorInsertOp::build(OpBuilder &b, OperationState &result,
|
||||
Value source, Value dest,
|
||||
ArrayRef<OpFoldResult> offsets,
|
||||
ArrayRef<OpFoldResult> sizes,
|
||||
ArrayRef<OpFoldResult> strides,
|
||||
ArrayRef<NamedAttribute> attrs) {
|
||||
SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
|
||||
SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
|
||||
dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets,
|
||||
ShapedType::kDynamicStrideOrOffset);
|
||||
dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes,
|
||||
ShapedType::kDynamicSize);
|
||||
dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides,
|
||||
ShapedType::kDynamicStrideOrOffset);
|
||||
build(b, result, dest.getType(), source, dest, dynamicOffsets, dynamicSizes,
|
||||
dynamicStrides, b.getI64ArrayAttr(staticOffsets),
|
||||
b.getI64ArrayAttr(staticSizes), b.getI64ArrayAttr(staticStrides));
|
||||
result.addAttributes(attrs);
|
||||
}
|
||||
|
||||
// Build a SubTensorInsertOp with dynamic entries.
|
||||
void mlir::SubTensorInsertOp::build(OpBuilder &b, OperationState &result,
|
||||
Value source, Value dest,
|
||||
ValueRange offsets, ValueRange sizes,
|
||||
ValueRange strides,
|
||||
ArrayRef<NamedAttribute> attrs) {
|
||||
SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(
|
||||
llvm::map_range(offsets, [](Value v) -> OpFoldResult { return v; }));
|
||||
SmallVector<OpFoldResult> sizeValues = llvm::to_vector<4>(
|
||||
llvm::map_range(sizes, [](Value v) -> OpFoldResult { return v; }));
|
||||
SmallVector<OpFoldResult> strideValues = llvm::to_vector<4>(
|
||||
llvm::map_range(strides, [](Value v) -> OpFoldResult { return v; }));
|
||||
build(b, result, source, dest, offsetValues, sizeValues, strideValues);
|
||||
}
|
||||
|
||||
/// Folds a subtensor_insert op to its source when the source and result
/// types both have static shapes, are equal, and the op is an identity slice
/// (zero offsets, full sizes, unit strides). Returns a null OpFoldResult
/// otherwise.
OpFoldResult SubTensorInsertOp::fold(ArrayRef<Attribute>) {
  if (!getSourceType().hasStaticShape() || !getType().hasStaticShape())
    return OpFoldResult();
  if (getSourceType() != getType())
    return OpFoldResult();
  if (failed(foldIdentityOffsetSizeAndStrideOpInterface(*this, getType())))
    return OpFoldResult();
  return this->source();
}
|
||||
|
||||
namespace {
|
||||
/// Pattern to rewrite a subtensor_insert op with constant arguments.
|
||||
class SubTensorInsertOpConstantArgumentFolder final
|
||||
: public OpRewritePattern<SubTensorInsertOp> {
|
||||
public:
|
||||
using OpRewritePattern<SubTensorInsertOp>::OpRewritePattern;
|
||||
|
||||
LogicalResult matchAndRewrite(SubTensorInsertOp subTensorInsertOp,
|
||||
PatternRewriter &rewriter) const override {
|
||||
// No constant operand, just return.
|
||||
if (llvm::none_of(subTensorInsertOp.getOperands(), [](Value operand) {
|
||||
return matchPattern(operand, matchConstantIndex());
|
||||
}))
|
||||
return failure();
|
||||
|
||||
// At least one of offsets/sizes/strides is a new constant.
|
||||
// Form the new list of operands and constant attributes from the
|
||||
// existing.
|
||||
SmallVector<OpFoldResult> mixedOffsets(subTensorInsertOp.getMixedOffsets());
|
||||
SmallVector<OpFoldResult> mixedSizes(subTensorInsertOp.getMixedSizes());
|
||||
SmallVector<OpFoldResult> mixedStrides(subTensorInsertOp.getMixedStrides());
|
||||
canonicalizeSubViewPart(mixedOffsets, ShapedType::isDynamicStrideOrOffset);
|
||||
canonicalizeSubViewPart(mixedSizes, ShapedType::isDynamic);
|
||||
canonicalizeSubViewPart(mixedStrides, ShapedType::isDynamicStrideOrOffset);
|
||||
|
||||
// Create the new op in canonical form.
|
||||
rewriter.replaceOpWithNewOp<SubTensorInsertOp>(
|
||||
subTensorInsertOp, subTensorInsertOp.source(), subTensorInsertOp.dest(),
|
||||
mixedOffsets, mixedSizes, mixedStrides);
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
/// Fold tensor_casts with subtensor_insert operations.
|
||||
struct SubTensorInsertOpCastFolder final
|
||||
: public OpRewritePattern<SubTensorInsertOp> {
|
||||
using OpRewritePattern<SubTensorInsertOp>::OpRewritePattern;
|
||||
|
||||
LogicalResult matchAndRewrite(SubTensorInsertOp subTensorInsertOp,
|
||||
PatternRewriter &rewriter) const override {
|
||||
if (llvm::any_of(subTensorInsertOp.getOperands(), [](Value operand) {
|
||||
return matchPattern(operand, matchConstantIndex());
|
||||
}))
|
||||
return failure();
|
||||
|
||||
auto getSourceOfCastOp = [](Value v) -> Optional<Value> {
|
||||
auto castOp = v.getDefiningOp<tensor::CastOp>();
|
||||
if (!castOp || !canFoldIntoConsumerOp(castOp))
|
||||
return llvm::None;
|
||||
return castOp.source();
|
||||
};
|
||||
Optional<Value> sourceCastSource =
|
||||
getSourceOfCastOp(subTensorInsertOp.source());
|
||||
Optional<Value> destCastSource =
|
||||
getSourceOfCastOp(subTensorInsertOp.dest());
|
||||
if (!sourceCastSource && !destCastSource)
|
||||
return failure();
|
||||
|
||||
Value replacement = rewriter.create<SubTensorInsertOp>(
|
||||
subTensorInsertOp.getLoc(),
|
||||
(sourceCastSource ? *sourceCastSource : subTensorInsertOp.source()),
|
||||
(destCastSource ? *destCastSource : subTensorInsertOp.dest()),
|
||||
subTensorInsertOp.getMixedOffsets(), subTensorInsertOp.getMixedSizes(),
|
||||
subTensorInsertOp.getMixedStrides());
|
||||
|
||||
if (replacement.getType() != subTensorInsertOp.getType()) {
|
||||
replacement = rewriter.create<tensor::CastOp>(
|
||||
subTensorInsertOp.getLoc(), subTensorInsertOp.getType(), replacement);
|
||||
}
|
||||
rewriter.replaceOp(subTensorInsertOp, replacement);
|
||||
return success();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
/// Registers the SubTensorInsertOp canonicalization patterns: the
/// constant-argument folder followed by the tensor.cast folder.
void SubTensorInsertOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                    MLIRContext *context) {
  using ConstantFolder = SubTensorInsertOpConstantArgumentFolder;
  using CastFolder = SubTensorInsertOpCastFolder;
  results.add<ConstantFolder, CastFolder>(context);
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// SwitchOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#include "mlir/Dialect/StandardOps/Utils/Utils.h"
|
||||
#include "mlir/Dialect/Tensor/IR/Tensor.h"
|
||||
#include "mlir/IR/BlockAndValueMapping.h"
|
||||
#include "mlir/IR/Builders.h"
|
||||
|
@ -25,7 +26,7 @@ using namespace mlir::tensor;
|
|||
/// source tensor. This is useful to fold a tensor.cast into a consuming op and
|
||||
/// implement canonicalization patterns for ops in different dialects that may
|
||||
/// consume the results of tensor.cast operations. Such foldable tensor.cast
|
||||
/// operations are typically inserted as `subtensor` ops and are canonicalized,
|
||||
/// operations are typically inserted as `slice` ops and are canonicalized,
|
||||
/// to preserve the type compatibility of their uses.
|
||||
///
|
||||
/// Returns true when all conditions are met:
|
||||
|
@ -511,6 +512,530 @@ static LogicalResult verify(ReshapeOp op) {
|
|||
return success();
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// ExtractSliceOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
/// Helper function to dispatch an OpFoldResult into either the `dynamicVec` if
|
||||
/// it is a Value or into `staticVec` if it is an IntegerAttr.
|
||||
/// In the case of a Value, a copy of the `sentinel` value is also pushed to
|
||||
/// `staticVec`. This is useful to extract mixed static and dynamic entries that
|
||||
/// come from an AttrSizedOperandSegments trait.
|
||||
static void dispatchIndexOpFoldResult(OpFoldResult ofr,
|
||||
SmallVectorImpl<Value> &dynamicVec,
|
||||
SmallVectorImpl<int64_t> &staticVec,
|
||||
int64_t sentinel) {
|
||||
if (auto v = ofr.dyn_cast<Value>()) {
|
||||
dynamicVec.push_back(v);
|
||||
staticVec.push_back(sentinel);
|
||||
return;
|
||||
}
|
||||
APInt apInt = ofr.dyn_cast<Attribute>().cast<IntegerAttr>().getValue();
|
||||
staticVec.push_back(apInt.getSExtValue());
|
||||
}
|
||||
|
||||
/// Runs `dispatchIndexOpFoldResult` over all entries of `ofrs`, in order,
/// appending to `dynamicVec` and `staticVec` with the given `sentinel`.
static void dispatchIndexOpFoldResults(ArrayRef<OpFoldResult> ofrs,
                                       SmallVectorImpl<Value> &dynamicVec,
                                       SmallVectorImpl<int64_t> &staticVec,
                                       int64_t sentinel) {
  for (unsigned pos = 0, count = ofrs.size(); pos < count; ++pos)
    dispatchIndexOpFoldResult(ofrs[pos], dynamicVec, staticVec, sentinel);
}
|
||||
|
||||
/// An extract_slice op result type can be fully inferred from the source type
|
||||
/// and the static representation of offsets, sizes and strides. Special
|
||||
/// sentinels encode the dynamic case.
|
||||
Type ExtractSliceOp::inferResultType(RankedTensorType sourceRankedTensorType,
|
||||
ArrayRef<int64_t> leadingStaticOffsets,
|
||||
ArrayRef<int64_t> leadingStaticSizes,
|
||||
ArrayRef<int64_t> leadingStaticStrides) {
|
||||
// An extract_slice op may specify only a leading subset of offset/sizes/
|
||||
// strides in which case we complete with offset=0, sizes from memref type and
|
||||
// strides=1.
|
||||
unsigned rank = sourceRankedTensorType.getRank();
|
||||
assert(leadingStaticSizes.size() <= rank &&
|
||||
"unexpected leadingStaticSizes overflow");
|
||||
auto staticSizes = llvm::to_vector<4>(leadingStaticSizes);
|
||||
unsigned numTrailingSizes = rank - staticSizes.size();
|
||||
llvm::append_range(staticSizes, sourceRankedTensorType.getShape().take_back(
|
||||
numTrailingSizes));
|
||||
return RankedTensorType::get(staticSizes,
|
||||
sourceRankedTensorType.getElementType());
|
||||
}
|
||||
|
||||
/// Extract int64_t values from the assumed ArrayAttr of IntegerAttr.
|
||||
static SmallVector<int64_t, 4> extractFromI64ArrayAttr(Attribute attr) {
|
||||
return llvm::to_vector<4>(
|
||||
llvm::map_range(attr.cast<ArrayAttr>(), [](Attribute a) -> int64_t {
|
||||
return a.cast<IntegerAttr>().getInt();
|
||||
}));
|
||||
}
|
||||
|
||||
/// OpFoldResult flavor of `inferResultType`: splits each mixed list into its
/// static part (dynamic entries replaced by the matching ShapedType sentinel)
/// and forwards the static parts to the int64_t overload. The dynamic values
/// collected along the way are not used.
Type ExtractSliceOp::inferResultType(
    RankedTensorType sourceRankedTensorType,
    ArrayRef<OpFoldResult> leadingStaticOffsets,
    ArrayRef<OpFoldResult> leadingStaticSizes,
    ArrayRef<OpFoldResult> leadingStaticStrides) {
  SmallVector<Value> offsetValues, sizeValues, strideValues;
  SmallVector<int64_t> offsetConstants, sizeConstants, strideConstants;

  dispatchIndexOpFoldResults(leadingStaticOffsets, offsetValues,
                             offsetConstants,
                             ShapedType::kDynamicStrideOrOffset);
  dispatchIndexOpFoldResults(leadingStaticSizes, sizeValues, sizeConstants,
                             ShapedType::kDynamicSize);
  dispatchIndexOpFoldResults(leadingStaticStrides, strideValues,
                             strideConstants,
                             ShapedType::kDynamicStrideOrOffset);

  return ExtractSliceOp::inferResultType(
      sourceRankedTensorType, offsetConstants, sizeConstants, strideConstants);
}
|
||||
|
||||
/// An extract_slice op result type can be fully inferred from the source type
|
||||
/// and the static representation of offsets, sizes and strides. Special
|
||||
/// sentinels encode the dynamic case.
|
||||
Type ExtractSliceOp::inferRankReducedResultType(
|
||||
unsigned resultRank, RankedTensorType sourceRankedTensorType,
|
||||
ArrayRef<int64_t> leadingStaticOffsets,
|
||||
ArrayRef<int64_t> leadingStaticSizes,
|
||||
ArrayRef<int64_t> leadingStaticStrides) {
|
||||
auto inferredType =
|
||||
inferResultType(sourceRankedTensorType, leadingStaticOffsets,
|
||||
leadingStaticSizes, leadingStaticStrides)
|
||||
.cast<RankedTensorType>();
|
||||
int rankDiff = inferredType.getRank() - resultRank;
|
||||
if (rankDiff > 0) {
|
||||
auto shape = inferredType.getShape();
|
||||
llvm::SmallDenseSet<unsigned> dimsToProject;
|
||||
mlir::getPositionsOfShapeOne(rankDiff, shape, dimsToProject);
|
||||
SmallVector<int64_t> projectedShape;
|
||||
for (unsigned pos = 0, e = shape.size(); pos < e; ++pos)
|
||||
if (!dimsToProject.contains(pos))
|
||||
projectedShape.push_back(shape[pos]);
|
||||
inferredType =
|
||||
RankedTensorType::get(projectedShape, inferredType.getElementType());
|
||||
}
|
||||
return inferredType;
|
||||
}
|
||||
|
||||
/// Overload of `inferRankReducedResultType` taking mixed static/dynamic
/// offsets, sizes and strides.
///
/// The OpFoldResult lists are split with `dispatchIndexOpFoldResults` and the
/// integer parts are forwarded to the `ArrayRef<int64_t>` overload. The Value
/// parts are discarded.
Type ExtractSliceOp::inferRankReducedResultType(
    unsigned resultRank, RankedTensorType sourceRankedTensorType,
    ArrayRef<OpFoldResult> leadingStaticOffsets,
    ArrayRef<OpFoldResult> leadingStaticSizes,
    ArrayRef<OpFoldResult> leadingStaticStrides) {
  SmallVector<Value> offsetValues, sizeValues, strideValues;
  SmallVector<int64_t> offsetInts, sizeInts, strideInts;

  dispatchIndexOpFoldResults(leadingStaticOffsets, offsetValues, offsetInts,
                             ShapedType::kDynamicStrideOrOffset);
  dispatchIndexOpFoldResults(leadingStaticSizes, sizeValues, sizeInts,
                             ShapedType::kDynamicSize);
  dispatchIndexOpFoldResults(leadingStaticStrides, strideValues, strideInts,
                             ShapedType::kDynamicStrideOrOffset);

  return ExtractSliceOp::inferRankReducedResultType(
      resultRank, sourceRankedTensorType, offsetInts, sizeInts, strideInts);
}
|
||||
|
||||
/// Build an ExtractSliceOp with mixed static and dynamic entries and custom
|
||||
/// result type. If the type passed is nullptr, it is inferred.
|
||||
void ExtractSliceOp::build(OpBuilder &b, OperationState &result,
|
||||
RankedTensorType resultType, Value source,
|
||||
ArrayRef<OpFoldResult> offsets,
|
||||
ArrayRef<OpFoldResult> sizes,
|
||||
ArrayRef<OpFoldResult> strides,
|
||||
ArrayRef<NamedAttribute> attrs) {
|
||||
SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
|
||||
SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
|
||||
dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets,
|
||||
ShapedType::kDynamicStrideOrOffset);
|
||||
dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes,
|
||||
ShapedType::kDynamicSize);
|
||||
dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides,
|
||||
ShapedType::kDynamicStrideOrOffset);
|
||||
auto sourceRankedTensorType = source.getType().cast<RankedTensorType>();
|
||||
// Structuring implementation this way avoids duplication between builders.
|
||||
if (!resultType) {
|
||||
resultType =
|
||||
ExtractSliceOp::inferResultType(sourceRankedTensorType, staticOffsets,
|
||||
staticSizes, staticStrides)
|
||||
.cast<RankedTensorType>();
|
||||
}
|
||||
build(b, result, resultType, source, dynamicOffsets, dynamicSizes,
|
||||
dynamicStrides, b.getI64ArrayAttr(staticOffsets),
|
||||
b.getI64ArrayAttr(staticSizes), b.getI64ArrayAttr(staticStrides));
|
||||
result.addAttributes(attrs);
|
||||
}
|
||||
|
||||
/// Build an ExtractSliceOp with mixed static and dynamic entries and inferred
|
||||
/// result type.
|
||||
void ExtractSliceOp::build(OpBuilder &b, OperationState &result, Value source,
|
||||
ArrayRef<OpFoldResult> offsets,
|
||||
ArrayRef<OpFoldResult> sizes,
|
||||
ArrayRef<OpFoldResult> strides,
|
||||
ArrayRef<NamedAttribute> attrs) {
|
||||
build(b, result, RankedTensorType(), source, offsets, sizes, strides, attrs);
|
||||
}
|
||||
|
||||
/// Build an ExtractSliceOp with dynamic entries and custom result type. If the
|
||||
/// type passed is nullptr, it is inferred.
|
||||
void ExtractSliceOp::build(OpBuilder &b, OperationState &result,
|
||||
RankedTensorType resultType, Value source,
|
||||
ValueRange offsets, ValueRange sizes,
|
||||
ValueRange strides, ArrayRef<NamedAttribute> attrs) {
|
||||
SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(
|
||||
llvm::map_range(offsets, [](Value v) -> OpFoldResult { return v; }));
|
||||
SmallVector<OpFoldResult> sizeValues = llvm::to_vector<4>(
|
||||
llvm::map_range(sizes, [](Value v) -> OpFoldResult { return v; }));
|
||||
SmallVector<OpFoldResult> strideValues = llvm::to_vector<4>(
|
||||
llvm::map_range(strides, [](Value v) -> OpFoldResult { return v; }));
|
||||
build(b, result, resultType, source, offsetValues, sizeValues, strideValues);
|
||||
}
|
||||
|
||||
/// Build an ExtractSliceOp with dynamic entries and inferred result type.
|
||||
void ExtractSliceOp::build(OpBuilder &b, OperationState &result, Value source,
|
||||
ValueRange offsets, ValueRange sizes,
|
||||
ValueRange strides, ArrayRef<NamedAttribute> attrs) {
|
||||
build(b, result, RankedTensorType(), source, offsets, sizes, strides, attrs);
|
||||
}
|
||||
|
||||
/// Outcome of comparing an inferred slice type against a candidate
/// (possibly rank-reduced) type.
enum SliceVerificationResult {
  /// The candidate type is accepted.
  Success,
  /// The candidate has a higher rank than the inferred type.
  RankTooLarge,
  /// The candidate shape cannot be matched against the inferred shape.
  SizeMismatch,
  /// The element types differ.
  ElemTypeMismatch,
};
|
||||
|
||||
/// Checks if `original` Type type can be rank reduced to `reduced` type.
|
||||
/// This function is slight variant of `is subsequence` algorithm where
|
||||
/// not matching dimension must be 1.
|
||||
static SliceVerificationResult
|
||||
isRankReducedType(Type originalType, Type candidateReducedType,
|
||||
std::string *errMsg = nullptr) {
|
||||
if (originalType == candidateReducedType)
|
||||
return SliceVerificationResult::Success;
|
||||
if (!originalType.isa<RankedTensorType>())
|
||||
return SliceVerificationResult::Success;
|
||||
if (originalType.isa<RankedTensorType>() &&
|
||||
!candidateReducedType.isa<RankedTensorType>())
|
||||
return SliceVerificationResult::Success;
|
||||
|
||||
ShapedType originalShapedType = originalType.cast<ShapedType>();
|
||||
ShapedType candidateReducedShapedType =
|
||||
candidateReducedType.cast<ShapedType>();
|
||||
|
||||
// Rank and size logic is valid for all ShapedTypes.
|
||||
ArrayRef<int64_t> originalShape = originalShapedType.getShape();
|
||||
ArrayRef<int64_t> candidateReducedShape =
|
||||
candidateReducedShapedType.getShape();
|
||||
unsigned originalRank = originalShape.size(),
|
||||
candidateReducedRank = candidateReducedShape.size();
|
||||
if (candidateReducedRank > originalRank)
|
||||
return SliceVerificationResult::RankTooLarge;
|
||||
|
||||
auto optionalUnusedDimsMask =
|
||||
computeRankReductionMask(originalShape, candidateReducedShape);
|
||||
|
||||
// Sizes cannot be matched in case empty vector is returned.
|
||||
if (!optionalUnusedDimsMask.hasValue())
|
||||
return SliceVerificationResult::SizeMismatch;
|
||||
|
||||
if (originalShapedType.getElementType() !=
|
||||
candidateReducedShapedType.getElementType())
|
||||
return SliceVerificationResult::ElemTypeMismatch;
|
||||
|
||||
// We are done for the tensor case.
|
||||
if (originalType.isa<RankedTensorType>())
|
||||
return SliceVerificationResult::Success;
|
||||
|
||||
return SliceVerificationResult::Success;
|
||||
}
|
||||
|
||||
template <typename OpTy>
|
||||
static LogicalResult produceSliceErrorMsg(SliceVerificationResult result,
|
||||
OpTy op, Type expectedType,
|
||||
StringRef errMsg = "") {
|
||||
auto memrefType = expectedType.cast<ShapedType>();
|
||||
switch (result) {
|
||||
case SliceVerificationResult::Success:
|
||||
return success();
|
||||
case SliceVerificationResult::RankTooLarge:
|
||||
return op.emitError("expected result rank to be smaller or equal to ")
|
||||
<< "the source rank. " << errMsg;
|
||||
case SliceVerificationResult::SizeMismatch:
|
||||
return op.emitError("expected result type to be ")
|
||||
<< expectedType
|
||||
<< " or a rank-reduced version. (mismatch of result sizes) "
|
||||
<< errMsg;
|
||||
case SliceVerificationResult::ElemTypeMismatch:
|
||||
return op.emitError("expected result element type to be ")
|
||||
<< memrefType.getElementType() << errMsg;
|
||||
}
|
||||
llvm_unreachable("unexpected extract_slice op verification result");
|
||||
}
|
||||
|
||||
/// Verifier for ExtractSliceOp.
|
||||
static LogicalResult verify(ExtractSliceOp op) {
|
||||
// Verify result type against inferred type.
|
||||
auto expectedType = ExtractSliceOp::inferResultType(
|
||||
op.getSourceType(), extractFromI64ArrayAttr(op.static_offsets()),
|
||||
extractFromI64ArrayAttr(op.static_sizes()),
|
||||
extractFromI64ArrayAttr(op.static_strides()));
|
||||
auto result = isRankReducedType(expectedType, op.getType());
|
||||
return produceSliceErrorMsg(result, op, expectedType);
|
||||
}
|
||||
|
||||
/// Infer the canonical type of the result of an extract_slice op. Returns a
|
||||
/// type with rank `resultRank` that is either the rank of the rank-reduced
|
||||
/// type, or the non-rank-reduced type.
|
||||
static RankedTensorType
|
||||
getCanonicalSliceResultType(unsigned resultRank, RankedTensorType sourceType,
|
||||
ArrayRef<OpFoldResult> mixedOffsets,
|
||||
ArrayRef<OpFoldResult> mixedSizes,
|
||||
ArrayRef<OpFoldResult> mixedStrides) {
|
||||
auto resultType =
|
||||
ExtractSliceOp::inferRankReducedResultType(
|
||||
resultRank, sourceType, mixedOffsets, mixedSizes, mixedStrides)
|
||||
.cast<RankedTensorType>();
|
||||
if (resultType.getRank() != resultRank) {
|
||||
resultType = ExtractSliceOp::inferResultType(sourceType, mixedOffsets,
|
||||
mixedSizes, mixedStrides)
|
||||
.cast<RankedTensorType>();
|
||||
}
|
||||
return resultType;
|
||||
}
|
||||
|
||||
namespace {
|
||||
/// Pattern to rewrite an extract_slice op with tensor::Cast arguments.
|
||||
/// This essentially pushes memref_cast past its consuming slice when
|
||||
/// `canFoldIntoConsumerOp` is true.
|
||||
///
|
||||
/// Example:
|
||||
/// ```
|
||||
/// %0 = tensor.cast %V : tensor<16x16xf32> to tensor<?x?xf32>
|
||||
/// %1 = tensor.extract_slice %0[0, 0][3, 4][1, 1] : tensor<?x?xf32> to
|
||||
/// tensor<3x4xf32>
|
||||
/// ```
|
||||
/// is rewritten into:
|
||||
/// ```
|
||||
/// %0 = tensor.extract_slice %V[0, 0][3, 4][1, 1] : tensor<16x16xf32> to
|
||||
/// tensor<3x4xf32> %1 = tensor.cast %0: tensor<3x4xf32> to tensor<3x4xf32>
|
||||
/// ```
|
||||
class ExtractSliceOpCastFolder final : public OpRewritePattern<ExtractSliceOp> {
|
||||
public:
|
||||
using OpRewritePattern<ExtractSliceOp>::OpRewritePattern;
|
||||
|
||||
LogicalResult matchAndRewrite(ExtractSliceOp sliceOp,
|
||||
PatternRewriter &rewriter) const override {
|
||||
// Any constant operand, just return to let SubViewOpConstantFolder kick in.
|
||||
if (llvm::any_of(sliceOp.getOperands(), [](Value operand) {
|
||||
return matchPattern(operand, matchConstantIndex());
|
||||
}))
|
||||
return failure();
|
||||
|
||||
auto castOp = sliceOp.source().getDefiningOp<tensor::CastOp>();
|
||||
if (!castOp)
|
||||
return failure();
|
||||
|
||||
if (!canFoldIntoConsumerOp(castOp))
|
||||
return failure();
|
||||
|
||||
/// Deduce the type of the result to use for the canonicalized operation.
|
||||
RankedTensorType resultType = getCanonicalSliceResultType(
|
||||
sliceOp.getType().getRank(), sliceOp.getSourceType(),
|
||||
sliceOp.getMixedOffsets(), sliceOp.getMixedSizes(),
|
||||
sliceOp.getMixedStrides());
|
||||
Value newSlice = rewriter.create<ExtractSliceOp>(
|
||||
sliceOp.getLoc(), resultType, castOp.source(), sliceOp.offsets(),
|
||||
sliceOp.sizes(), sliceOp.strides(), sliceOp.static_offsets(),
|
||||
sliceOp.static_sizes(), sliceOp.static_strides());
|
||||
rewriter.replaceOpWithNewOp<tensor::CastOp>(sliceOp, sliceOp.getType(),
|
||||
newSlice);
|
||||
return success();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
/// Return the canonical type of the result of an extract_slice op.
|
||||
struct SliceReturnTypeCanonicalizer {
|
||||
RankedTensorType operator()(ExtractSliceOp op,
|
||||
ArrayRef<OpFoldResult> mixedOffsets,
|
||||
ArrayRef<OpFoldResult> mixedSizes,
|
||||
ArrayRef<OpFoldResult> mixedStrides) {
|
||||
return getCanonicalSliceResultType(op.getType().getRank(),
|
||||
op.getSourceType(), mixedOffsets,
|
||||
mixedSizes, mixedStrides);
|
||||
}
|
||||
};
|
||||
|
||||
/// A canonicalizer wrapper to replace ExtractSliceOps.
|
||||
struct SliceCanonicalizer {
|
||||
void operator()(PatternRewriter &rewriter, ExtractSliceOp op,
|
||||
ExtractSliceOp newOp) {
|
||||
Value replacement = newOp.getResult();
|
||||
if (replacement.getType() != op.getType())
|
||||
replacement = rewriter.create<tensor::CastOp>(op.getLoc(), op.getType(),
|
||||
replacement);
|
||||
rewriter.replaceOp(op, replacement);
|
||||
}
|
||||
};
|
||||
|
||||
/// Register the canonicalization patterns of ExtractSliceOp: the generic
/// constant-argument folder (specialized with the slice functors) and the
/// tensor.cast folder.
void ExtractSliceOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                 MLIRContext *context) {
  using ConstantArgumentFolder =
      OpWithOffsetSizesAndStridesConstantArgumentFolder<
          ExtractSliceOp, SliceReturnTypeCanonicalizer, SliceCanonicalizer>;
  results.add<ConstantArgumentFolder, ExtractSliceOpCastFolder>(context);
}
|
||||
|
||||
//
|
||||
static LogicalResult
|
||||
foldIdentityOffsetSizeAndStrideOpInterface(OffsetSizeAndStrideOpInterface op,
|
||||
ShapedType shapedType) {
|
||||
OpBuilder b(op.getContext());
|
||||
for (OpFoldResult ofr : op.getMixedOffsets())
|
||||
if (!isEqualConstantIntOrValue(ofr, b.getIndexAttr(0)))
|
||||
return failure();
|
||||
// Rank-reducing noops only need to inspect the leading dimensions: llvm::zip
|
||||
// is appropriate.
|
||||
auto shape = shapedType.getShape();
|
||||
for (auto it : llvm::zip(op.getMixedSizes(), shape))
|
||||
if (!isEqualConstantIntOrValue(std::get<0>(it),
|
||||
b.getIndexAttr(std::get<1>(it))))
|
||||
return failure();
|
||||
for (OpFoldResult ofr : op.getMixedStrides())
|
||||
if (!isEqualConstantIntOrValue(ofr, b.getIndexAttr(1)))
|
||||
return failure();
|
||||
return success();
|
||||
}
|
||||
|
||||
/// Fold an extract_slice that is a no-op: when the source and result types
/// are equal and the offsets/sizes/strides describe the identity slice, the
/// op folds to its source. Otherwise no fold happens.
OpFoldResult ExtractSliceOp::fold(ArrayRef<Attribute>) {
  if (getSourceType() != getType())
    return OpFoldResult();
  if (!succeeded(foldIdentityOffsetSizeAndStrideOpInterface(*this, getType())))
    return OpFoldResult();
  return this->source();
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// InsertSliceOp
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
// Build a InsertSliceOp with mixed static and dynamic entries.
|
||||
void InsertSliceOp::build(OpBuilder &b, OperationState &result, Value source,
|
||||
Value dest, ArrayRef<OpFoldResult> offsets,
|
||||
ArrayRef<OpFoldResult> sizes,
|
||||
ArrayRef<OpFoldResult> strides,
|
||||
ArrayRef<NamedAttribute> attrs) {
|
||||
SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
|
||||
SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
|
||||
dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets,
|
||||
ShapedType::kDynamicStrideOrOffset);
|
||||
dispatchIndexOpFoldResults(sizes, dynamicSizes, staticSizes,
|
||||
ShapedType::kDynamicSize);
|
||||
dispatchIndexOpFoldResults(strides, dynamicStrides, staticStrides,
|
||||
ShapedType::kDynamicStrideOrOffset);
|
||||
build(b, result, dest.getType(), source, dest, dynamicOffsets, dynamicSizes,
|
||||
dynamicStrides, b.getI64ArrayAttr(staticOffsets),
|
||||
b.getI64ArrayAttr(staticSizes), b.getI64ArrayAttr(staticStrides));
|
||||
result.addAttributes(attrs);
|
||||
}
|
||||
|
||||
// Build a InsertSliceOp with dynamic entries.
|
||||
void InsertSliceOp::build(OpBuilder &b, OperationState &result, Value source,
|
||||
Value dest, ValueRange offsets, ValueRange sizes,
|
||||
ValueRange strides, ArrayRef<NamedAttribute> attrs) {
|
||||
SmallVector<OpFoldResult> offsetValues = llvm::to_vector<4>(
|
||||
llvm::map_range(offsets, [](Value v) -> OpFoldResult { return v; }));
|
||||
SmallVector<OpFoldResult> sizeValues = llvm::to_vector<4>(
|
||||
llvm::map_range(sizes, [](Value v) -> OpFoldResult { return v; }));
|
||||
SmallVector<OpFoldResult> strideValues = llvm::to_vector<4>(
|
||||
llvm::map_range(strides, [](Value v) -> OpFoldResult { return v; }));
|
||||
build(b, result, source, dest, offsetValues, sizeValues, strideValues);
|
||||
}
|
||||
|
||||
/// Fold an insert_slice that overwrites the whole destination: when the
/// source and result types are statically shaped and equal, and the
/// offsets/sizes/strides describe the identity slice, the op folds to its
/// source. Otherwise no fold happens.
OpFoldResult InsertSliceOp::fold(ArrayRef<Attribute>) {
  if (!getSourceType().hasStaticShape() || !getType().hasStaticShape())
    return OpFoldResult();
  if (getSourceType() != getType())
    return OpFoldResult();
  if (!succeeded(foldIdentityOffsetSizeAndStrideOpInterface(*this, getType())))
    return OpFoldResult();
  return this->source();
}
|
||||
|
||||
namespace {
|
||||
/// Pattern to rewrite a insert_slice op with constant arguments.
|
||||
class InsertSliceOpConstantArgumentFolder final
|
||||
: public OpRewritePattern<InsertSliceOp> {
|
||||
public:
|
||||
using OpRewritePattern<InsertSliceOp>::OpRewritePattern;
|
||||
|
||||
LogicalResult matchAndRewrite(InsertSliceOp insertSliceOp,
|
||||
PatternRewriter &rewriter) const override {
|
||||
// No constant operand, just return.
|
||||
if (llvm::none_of(insertSliceOp.getOperands(), [](Value operand) {
|
||||
return matchPattern(operand, matchConstantIndex());
|
||||
}))
|
||||
return failure();
|
||||
|
||||
// At least one of offsets/sizes/strides is a new constant.
|
||||
// Form the new list of operands and constant attributes from the
|
||||
// existing.
|
||||
SmallVector<OpFoldResult> mixedOffsets(insertSliceOp.getMixedOffsets());
|
||||
SmallVector<OpFoldResult> mixedSizes(insertSliceOp.getMixedSizes());
|
||||
SmallVector<OpFoldResult> mixedStrides(insertSliceOp.getMixedStrides());
|
||||
canonicalizeSubViewPart(mixedOffsets, ShapedType::isDynamicStrideOrOffset);
|
||||
canonicalizeSubViewPart(mixedSizes, ShapedType::isDynamic);
|
||||
canonicalizeSubViewPart(mixedStrides, ShapedType::isDynamicStrideOrOffset);
|
||||
|
||||
// Create the new op in canonical form.
|
||||
rewriter.replaceOpWithNewOp<InsertSliceOp>(
|
||||
insertSliceOp, insertSliceOp.source(), insertSliceOp.dest(),
|
||||
mixedOffsets, mixedSizes, mixedStrides);
|
||||
return success();
|
||||
}
|
||||
};
|
||||
|
||||
/// Fold tensor_casts with insert_slice operations.
|
||||
struct InsertSliceOpCastFolder final : public OpRewritePattern<InsertSliceOp> {
|
||||
using OpRewritePattern<InsertSliceOp>::OpRewritePattern;
|
||||
|
||||
LogicalResult matchAndRewrite(InsertSliceOp insertSliceOp,
|
||||
PatternRewriter &rewriter) const override {
|
||||
if (llvm::any_of(insertSliceOp.getOperands(), [](Value operand) {
|
||||
return matchPattern(operand, matchConstantIndex());
|
||||
}))
|
||||
return failure();
|
||||
|
||||
auto getSourceOfCastOp = [](Value v) -> Optional<Value> {
|
||||
auto castOp = v.getDefiningOp<tensor::CastOp>();
|
||||
if (!castOp || !canFoldIntoConsumerOp(castOp))
|
||||
return llvm::None;
|
||||
return castOp.source();
|
||||
};
|
||||
Optional<Value> sourceCastSource =
|
||||
getSourceOfCastOp(insertSliceOp.source());
|
||||
Optional<Value> destCastSource = getSourceOfCastOp(insertSliceOp.dest());
|
||||
if (!sourceCastSource && !destCastSource)
|
||||
return failure();
|
||||
|
||||
Value replacement = rewriter.create<InsertSliceOp>(
|
||||
insertSliceOp.getLoc(),
|
||||
(sourceCastSource ? *sourceCastSource : insertSliceOp.source()),
|
||||
(destCastSource ? *destCastSource : insertSliceOp.dest()),
|
||||
insertSliceOp.getMixedOffsets(), insertSliceOp.getMixedSizes(),
|
||||
insertSliceOp.getMixedStrides());
|
||||
|
||||
if (replacement.getType() != insertSliceOp.getType()) {
|
||||
replacement = rewriter.create<tensor::CastOp>(
|
||||
insertSliceOp.getLoc(), insertSliceOp.getType(), replacement);
|
||||
}
|
||||
rewriter.replaceOp(insertSliceOp, replacement);
|
||||
return success();
|
||||
}
|
||||
};
|
||||
} // namespace
|
||||
|
||||
/// Register the canonicalization patterns of InsertSliceOp: the
/// constant-argument folder followed by the tensor.cast folder.
void InsertSliceOp::getCanonicalizationPatterns(RewritePatternSet &results,
                                                MLIRContext *context) {
  results.add<InsertSliceOpConstantArgumentFolder>(context);
  results.add<InsertSliceOpCastFolder>(context);
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// TableGen'd op method definitions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -616,9 +616,9 @@ func @split_at(%shape: tensor<?xindex>, %index: index) -> (tensor<?xindex>, tens
|
|||
// CHECK-NEXT: %[[ISNEG:.*]] = cmpi slt, %[[INDEX]], %[[C0]] : index
|
||||
// CHECK-NEXT: %[[SELECT:.*]] = select %[[ISNEG]], %[[POSINDEX]], %[[INDEX]] : index
|
||||
// CHECK-NEXT: %[[C1:.*]] = constant 1 : index
|
||||
// CHECK-NEXT: %[[HEAD:.*]] = subtensor %[[SHAPE]][%[[C0]]] [%[[SELECT]]] [%[[C1]]] : tensor<?xindex> to tensor<?xindex>
|
||||
// CHECK-NEXT: %[[HEAD:.*]] = tensor.extract_slice %[[SHAPE]][%[[C0]]] [%[[SELECT]]] [%[[C1]]] : tensor<?xindex> to tensor<?xindex>
|
||||
// CHECK-NEXT: %[[TAIL_SIZE:.*]] = subi %[[RANK]], %[[SELECT]] : index
|
||||
// CHECK-NEXT: %[[TAIL:.*]] = subtensor %[[SHAPE]][%[[SELECT]]] [%[[TAIL_SIZE]]] [%[[C1]]] : tensor<?xindex> to tensor<?xindex>
|
||||
// CHECK-NEXT: %[[TAIL:.*]] = tensor.extract_slice %[[SHAPE]][%[[SELECT]]] [%[[TAIL_SIZE]]] [%[[C1]]] : tensor<?xindex> to tensor<?xindex>
|
||||
// CHECK-NEXT: return %[[HEAD]], %[[TAIL]] : tensor<?xindex>, tensor<?xindex>
|
||||
%head, %tail = "shape.split_at"(%shape, %index) : (tensor<?xindex>, index) -> (tensor<?xindex>, tensor<?xindex>)
|
||||
return %head, %tail : tensor<?xindex>, tensor<?xindex>
|
||||
|
|
|
@ -679,10 +679,10 @@ func @concat(%arg0: tensor<5x1xf32>, %arg1: tensor<6x1xf32>) -> () {
|
|||
// CHECK: [[CST:%.+]] = constant 0.0
|
||||
// CHECK: [[FILL:%.+]] = linalg.fill([[INIT]], [[CST]])
|
||||
// CHECK: [[ARG0_DIM0:%.+]] = memref.dim %arg0, [[AXIS]]
|
||||
// CHECK: [[INSERT0:%.+]] = subtensor_insert %arg0 into [[FILL]]{{\[}}[[OFFSET]], [[OFFSET]]] {{\[}}[[ARG0_DIM0]], [[ARG0_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]]
|
||||
// CHECK: [[INSERT0:%.+]] = tensor.insert_slice %arg0 into [[FILL]]{{\[}}[[OFFSET]], [[OFFSET]]] {{\[}}[[ARG0_DIM0]], [[ARG0_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]]
|
||||
// CHECK: [[NEW_OFFSET:%.+]] = addi [[OFFSET]], [[ARG0_DIM0]]
|
||||
// CHECK: [[ARG1_DIM0:%.+]] = memref.dim %arg1, [[AXIS]]
|
||||
// CHECK: [[INSERT1:%.+]] = subtensor_insert %arg1 into [[INSERT0]]{{\[}}[[NEW_OFFSET]], [[OFFSET]]] {{\[}}[[ARG1_DIM0]], [[ARG0_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]]
|
||||
// CHECK: [[INSERT1:%.+]] = tensor.insert_slice %arg1 into [[INSERT0]]{{\[}}[[NEW_OFFSET]], [[OFFSET]]] {{\[}}[[ARG1_DIM0]], [[ARG0_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]]
|
||||
%0 = "tosa.concat"(%arg0, %arg1) { axis = 0 : i64} : (tensor<5x1xf32>, tensor<6x1xf32>) -> (tensor<11x1xf32>)
|
||||
|
||||
// CHECK: [[AXIS:%.+]] = constant 1
|
||||
|
@ -698,10 +698,10 @@ func @concat(%arg0: tensor<5x1xf32>, %arg1: tensor<6x1xf32>) -> () {
|
|||
// CHECK: [[CST:%.+]] = constant 0.0
|
||||
// CHECK: [[FILL:%.+]] = linalg.fill([[INIT]], [[CST]])
|
||||
// CHECK: [[ARG0_DIM1:%.+]] = memref.dim %arg0, [[AXIS]]
|
||||
// CHECK: [[INSERT0:%.+]] = subtensor_insert %arg0 into [[FILL]]{{\[}}[[OFFSET]], [[OFFSET]]] {{\[}}[[ARG0_DIM0]], [[ARG0_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]]
|
||||
// CHECK: [[INSERT0:%.+]] = tensor.insert_slice %arg0 into [[FILL]]{{\[}}[[OFFSET]], [[OFFSET]]] {{\[}}[[ARG0_DIM0]], [[ARG0_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]]
|
||||
// CHECK: [[NEW_OFFSET:%.+]] = addi [[OFFSET]], [[ARG0_DIM1]]
|
||||
// CHECK: [[ARG1_DIM1:%.+]] = memref.dim %arg0, [[AXIS]]
|
||||
// CHECK: [[INSERT1:%.+]] = subtensor_insert %arg0 into [[INSERT0]]{{\[}}[[OFFSET]], [[NEW_OFFSET]]] {{\[}}[[ARG0_DIM0]], [[ARG1_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]]
|
||||
// CHECK: [[INSERT1:%.+]] = tensor.insert_slice %arg0 into [[INSERT0]]{{\[}}[[OFFSET]], [[NEW_OFFSET]]] {{\[}}[[ARG0_DIM0]], [[ARG1_DIM1]]] {{\[}}[[STRIDE]], [[STRIDE]]]
|
||||
%1 = "tosa.concat"(%arg0, %arg0) { axis = 1 : i64} : (tensor<5x1xf32>, tensor<5x1xf32>) -> (tensor<5x2xf32>)
|
||||
return
|
||||
}
|
||||
|
|
|
@ -12,7 +12,7 @@ func @const_test() -> (tensor<i32>) {
|
|||
// -----
|
||||
|
||||
func @slice(%arg0: tensor<6xf32>) ->() {
|
||||
// CHECK: [[SLICE:%.+]] = subtensor %arg0[2] [1] [1]
|
||||
// CHECK: [[SLICE:%.+]] = tensor.extract_slice %arg0[2] [1] [1]
|
||||
%0 = "tosa.slice"(%arg0) {start = [2], size = [1]} : (tensor<6xf32>) -> (tensor<1xf32>)
|
||||
return
|
||||
}
|
||||
|
|
|
@ -166,9 +166,9 @@ func @generic_with_init_tensor(%arg0: tensor<2x3x4xvector<3x4xi4>>,
|
|||
|
||||
func private @make_index() -> index
|
||||
|
||||
// CHECK-LABEL: func @bufferize_subtensor(
|
||||
// CHECK-LABEL: func @bufferize_slice(
|
||||
// CHECK-SAME: %[[T:[0-9a-z]*]]: tensor<?x?xf32>
|
||||
func @bufferize_subtensor(%t : tensor<?x?xf32>) -> (tensor<2x3xf32>, tensor<2x?xf32>) {
|
||||
func @bufferize_slice(%t : tensor<?x?xf32>) -> (tensor<2x3xf32>, tensor<2x?xf32>) {
|
||||
// CHECK: %[[IDX:.*]] = call @make_index() : () -> index
|
||||
%i0 = call @make_index() : () -> index
|
||||
|
||||
|
@ -178,14 +178,14 @@ func @bufferize_subtensor(%t : tensor<?x?xf32>) -> (tensor<2x3xf32>, tensor<2x?x
|
|||
// CHECK-SAME: memref<?x?xf32> to memref<2x3xf32, #[[$MAP0]]>
|
||||
// CHECK-NEXT: linalg.copy(%[[SM0]], %[[A0]]) : memref<2x3xf32, #[[$MAP0]]>, memref<2x3xf32>
|
||||
// CHECK-NEXT: %[[RT0:.*]] = memref.tensor_load %[[A0]] : memref<2x3xf32>
|
||||
%st0 = subtensor %t[0, 0][2, 3][1, 1] : tensor<?x?xf32> to tensor<2x3xf32>
|
||||
%st0 = tensor.extract_slice %t[0, 0][2, 3][1, 1] : tensor<?x?xf32> to tensor<2x3xf32>
|
||||
|
||||
// CHECK-NEXT: %[[A1:.*]] = memref.alloc(%[[IDX]]) : memref<2x?xf32>
|
||||
// CHECK-NEXT: %[[SM1:.*]] = memref.subview %[[M]][0, %[[IDX]]] [2, %[[IDX]]] [1, 2]
|
||||
// CHECK-SAME: memref<?x?xf32> to memref<2x?xf32, #[[$MAP1]]>
|
||||
// CHECK-NEXT: linalg.copy(%[[SM1]], %[[A1]]) : memref<2x?xf32, #[[$MAP1]]>, memref<2x?xf32>
|
||||
// CHECK-NEXT: %[[RT1:.*]] = memref.tensor_load %[[A1]] : memref<2x?xf32>
|
||||
%st1 = subtensor %t[0, %i0][2, %i0][1, 2] : tensor<?x?xf32> to tensor<2x?xf32>
|
||||
%st1 = tensor.extract_slice %t[0, %i0][2, %i0][1, 2] : tensor<?x?xf32> to tensor<2x?xf32>
|
||||
|
||||
// CHECK-NEXT: return %[[RT0]], %[[RT1]]
|
||||
return %st0, %st1 : tensor<2x3xf32>, tensor<2x?xf32>
|
||||
|
@ -198,11 +198,11 @@ func @bufferize_subtensor(%t : tensor<?x?xf32>) -> (tensor<2x3xf32>, tensor<2x?x
|
|||
|
||||
func private @make_index() -> index
|
||||
|
||||
// CHECK-LABEL: func @bufferize_subtensor_insert(
|
||||
// CHECK-LABEL: func @bufferize_insert_slice(
|
||||
// CHECK-SAME: %[[T:[0-9a-z]*]]: tensor<?x?xf32>
|
||||
// CHECK-SAME: %[[ST0:[0-9a-z]*]]: tensor<2x3xf32>
|
||||
// CHECK-SAME: %[[ST1:[0-9a-z]*]]: tensor<2x?xf32>
|
||||
func @bufferize_subtensor_insert(%t : tensor<?x?xf32>, %st0 : tensor<2x3xf32>, %st1 : tensor<2x?xf32>) ->
|
||||
func @bufferize_insert_slice(%t : tensor<?x?xf32>, %st0 : tensor<2x3xf32>, %st1 : tensor<2x?xf32>) ->
|
||||
(tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
|
@ -222,7 +222,7 @@ func @bufferize_subtensor_insert(%t : tensor<?x?xf32>, %st0 : tensor<2x3xf32>, %
|
|||
// CHECK-SAME: memref<?x?xf32> to memref<2x3xf32, #[[$MAP0]]>
|
||||
// CHECK-NEXT: linalg.copy(%[[SM0]], %[[SUBVIEW0]]) : memref<2x3xf32>, memref<2x3xf32, #[[$MAP0]]>
|
||||
// CHECK-NEXT: %[[RT0:.*]] = memref.tensor_load %[[M_COPY0]] : memref<?x?xf32>
|
||||
%t0 = subtensor_insert %st0 into %t[0, 0][2, 3][1, 1] : tensor<2x3xf32> into tensor<?x?xf32>
|
||||
%t0 = tensor.insert_slice %st0 into %t[0, 0][2, 3][1, 1] : tensor<2x3xf32> into tensor<?x?xf32>
|
||||
|
||||
// CHECK-DAG: %[[SM1:.*]] = memref.buffer_cast %[[ST1]] : memref<2x?xf32>
|
||||
// CHECK-NEXT: %[[M_COPY1:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) : memref<?x?xf32>
|
||||
|
@ -231,7 +231,7 @@ func @bufferize_subtensor_insert(%t : tensor<?x?xf32>, %st0 : tensor<2x3xf32>, %
|
|||
// CHECK-SAME: memref<?x?xf32> to memref<2x?xf32, #[[$MAP1]]>
|
||||
// CHECK-NEXT: linalg.copy(%[[SM1]], %[[SUBVIEW1]]) : memref<2x?xf32>, memref<2x?xf32, #[[$MAP1]]>
|
||||
// CHECK-NEXT: %[[RT1:.*]] = memref.tensor_load %[[M_COPY1]] : memref<?x?xf32>
|
||||
%t1 = subtensor_insert %st1 into %t[0, %i0][2, %i0][1, 2] : tensor<2x?xf32> into tensor<?x?xf32>
|
||||
%t1 = tensor.insert_slice %st1 into %t[0, %i0][2, %i0][1, 2] : tensor<2x?xf32> into tensor<?x?xf32>
|
||||
|
||||
// CHECK: return %[[RT0]], %[[RT1]]
|
||||
return %t0, %t1: tensor<?x?xf32>, tensor<?x?xf32>
|
||||
|
|
|
@ -648,15 +648,15 @@ func @keep_not_noop(%arg0 : tensor<?x?xf32>, %arg1 : tensor<?x?xf32>)
|
|||
|
||||
// -----
|
||||
|
||||
func @fold_init_tensor_with_subtensor
|
||||
func @fold_init_tensor_with_slice
|
||||
(%arg0 : index, %arg1 : index) -> tensor<5x?x20xf32>
|
||||
{
|
||||
%0 = linalg.init_tensor[%arg0, 10, 40] : tensor<?x10x40xf32>
|
||||
%1 = subtensor %0[0, 0, 0] [5, %arg1, 20] [1, 1, 1]
|
||||
%1 = tensor.extract_slice %0[0, 0, 0] [5, %arg1, 20] [1, 1, 1]
|
||||
: tensor<?x10x40xf32> to tensor<5x?x20xf32>
|
||||
return %1 : tensor<5x?x20xf32>
|
||||
}
|
||||
// CHECK: func @fold_init_tensor_with_subtensor
|
||||
// CHECK: func @fold_init_tensor_with_slice
|
||||
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: index
|
||||
// CHECK: %[[T0:.+]] = linalg.init_tensor [5, %[[ARG1]], 20]
|
||||
|
@ -723,13 +723,13 @@ func @propogate_casts(%arg0 : tensor<?x?xf32>, %arg1 : f32, %arg2 : index,
|
|||
%1 = linalg.fill(%0, %arg1) : tensor<?x?xf32>, f32 -> tensor<?x?xf32>
|
||||
%2 = memref.dim %arg0, %c0 : tensor<?x?xf32>
|
||||
%3 = memref.dim %arg0, %c1 : tensor<?x?xf32>
|
||||
%4 = subtensor_insert %arg0 into %1[%arg2, %arg3] [%2, %3] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
%4 = tensor.insert_slice %arg0 into %1[%arg2, %arg3] [%2, %3] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
return %4 : tensor<?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @propogate_casts
|
||||
// CHECK: %[[INIT:.+]] = linalg.init_tensor [21, 42]
|
||||
// CHECK: %[[FILL:.+]] = linalg.fill(%[[INIT]], %{{.+}})
|
||||
// CHECK: %[[INSERTED:.+]] = subtensor_insert %{{.+}} into %[[FILL]]
|
||||
// CHECK: %[[INSERTED:.+]] = tensor.insert_slice %{{.+}} into %[[FILL]]
|
||||
// CHECK: %[[RESULT:.+]] = tensor.cast %[[INSERTED]]
|
||||
// CHECK: return %[[RESULT]]
|
||||
|
||||
|
|
|
@ -6,43 +6,43 @@
|
|||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_fun
|
||||
func @subtensor_fun(%A : tensor<?xf32>, %B : tensor<?xf32> {linalg.inplaceable = true})
|
||||
// CHECK-LABEL: func @extract_slice_fun
|
||||
func @extract_slice_fun(%A : tensor<?xf32>, %B : tensor<?xf32> {linalg.inplaceable = true})
|
||||
-> (tensor<4xf32>, tensor<8xf32>)
|
||||
{
|
||||
// subtensor is not used in a write, it is not compelled to bufferize out of
|
||||
// place. Let callers decide whether they want to create aliasing subviews at
|
||||
// all call sites or whether they allocate.
|
||||
// tensor.extract_slice is not used in a write, it is not compelled to
|
||||
// bufferize out of place. Let callers decide whether they want to create
|
||||
// aliasing subviews at all call sites or whether they allocate.
|
||||
// This is true irrespective of whether the function argument is inplaceable.
|
||||
// CHECK: subtensor
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%r0 = subtensor %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
%r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
|
||||
// CHECK: subtensor
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%r1 = subtensor %B[0][8][1] : tensor<?xf32> to tensor<8xf32>
|
||||
%r1 = tensor.extract_slice %B[0][8][1] : tensor<?xf32> to tensor<8xf32>
|
||||
|
||||
return %r0, %r1: tensor<4xf32>, tensor<8xf32>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_insert_fun
|
||||
func @subtensor_insert_fun(
|
||||
// CHECK-LABEL: func @insert_slice_fun
|
||||
func @insert_slice_fun(
|
||||
%A : tensor<?xf32>,
|
||||
%B : tensor<?xf32> {linalg.inplaceable = true},
|
||||
%C : tensor<4xf32>)
|
||||
-> (tensor<?xf32>, tensor<?xf32>)
|
||||
{
|
||||
// must bufferize out of place.
|
||||
// CHECK: subtensor_insert
|
||||
// CHECK: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["false"]}
|
||||
%r0 = subtensor_insert %C into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r0 = tensor.insert_slice %C into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
// bufferizes inplace.
|
||||
// CHECK: subtensor_insert
|
||||
// CHECK: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%r1 = subtensor_insert %C into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r1 = tensor.insert_slice %C into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
return %r0, %r1: tensor<?xf32>, tensor<?xf32>
|
||||
}
|
||||
|
@ -85,34 +85,34 @@ func @conflict_on_B(
|
|||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_subtensor
|
||||
func @subtensor_subtensor(
|
||||
// CHECK-LABEL: func @extract_slice_extract_slice
|
||||
func @extract_slice_extract_slice(
|
||||
%A : tensor<?xf32> {linalg.inplaceable = true}, %B : tensor<?xf32>)
|
||||
-> (tensor<2xf32>, tensor<2xf32>)
|
||||
{
|
||||
// subtensor is not used in a write, it is not compelled to bufferize out of
|
||||
// place. Let callers decide whether they want to create aliasing subviews at
|
||||
// all call sites or whether they allocate.
|
||||
// tensor.extract_slice is not used in a write, it is not compelled to
|
||||
// bufferize out of place. Let callers decide whether they want to create
|
||||
// aliasing subviews at all call sites or whether they allocate.
|
||||
// This is true irrespective of whether the function argument is inplaceable.
|
||||
// CHECK: {__inplace_results_attr__ = ["true"]}
|
||||
%r0 = subtensor %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
%r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
|
||||
// CHECK: {__inplace_results_attr__ = ["true"]}
|
||||
%r1 = subtensor %r0[0][2][1] : tensor<4xf32> to tensor<2xf32>
|
||||
%r1 = tensor.extract_slice %r0[0][2][1] : tensor<4xf32> to tensor<2xf32>
|
||||
|
||||
// CHECK: {__inplace_results_attr__ = ["true"]}
|
||||
%r2 = subtensor %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
%r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
|
||||
// CHECK: {__inplace_results_attr__ = ["true"]}
|
||||
%r3 = subtensor %r2[0][2][1] : tensor<4xf32> to tensor<2xf32>
|
||||
%r3 = tensor.extract_slice %r2[0][2][1] : tensor<4xf32> to tensor<2xf32>
|
||||
|
||||
return %r1, %r3: tensor<2xf32>, tensor<2xf32>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_insert_subtensor_insert
|
||||
func @subtensor_insert_subtensor_insert(
|
||||
// CHECK-LABEL: func @insert_slice_insert_slice
|
||||
func @insert_slice_insert_slice(
|
||||
%A : tensor<?xf32> {linalg.inplaceable = true},
|
||||
%A2 : tensor<4xf32> {linalg.inplaceable = true},
|
||||
%A3 : tensor<2xf32> {linalg.inplaceable = true},
|
||||
|
@ -120,102 +120,106 @@ func @subtensor_insert_subtensor_insert(
|
|||
-> (tensor<?xf32>, tensor<?xf32>)
|
||||
{
|
||||
// CHECK: {__inplace_results_attr__ = ["true"]}
|
||||
%r0 = subtensor_insert %A3 into %A2[0][2][1] : tensor<2xf32> into tensor<4xf32>
|
||||
%r0 = tensor.insert_slice %A3 into %A2[0][2][1] : tensor<2xf32> into tensor<4xf32>
|
||||
|
||||
// CHECK: {__inplace_results_attr__ = ["true"]}
|
||||
%r1 = subtensor_insert %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
// CHECK: {__inplace_results_attr__ = ["false"]}
|
||||
%r2 = subtensor_insert %B3 into %B2[0][2][1] : tensor<2xf32> into tensor<4xf32>
|
||||
%r2 = tensor.insert_slice %B3 into %B2[0][2][1] : tensor<2xf32> into tensor<4xf32>
|
||||
|
||||
// CHECK: {__inplace_results_attr__ = ["false"]}
|
||||
%r3 = subtensor_insert %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
return %r1, %r3: tensor<?xf32>, tensor<?xf32>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_nonmatching_subtensor_insert
|
||||
func @subtensor_nonmatching_subtensor_insert(
|
||||
// CHECK-LABEL: func @extract_slice_nonmatching_insert_slice
|
||||
func @extract_slice_nonmatching_insert_slice(
|
||||
%A : tensor<?xf32> {linalg.inplaceable = true},
|
||||
%B : tensor<?xf32>, %idx: index)
|
||||
-> (tensor<?xf32>, tensor<?xf32>)
|
||||
{
|
||||
// %r1 bufferizes inplace because %A is inplaceable.
|
||||
// %r0 is an overlapping subtensor that does not match, it must be out of place.
|
||||
// CHECK: subtensor
|
||||
// %r0 is an overlapping tensor.extract_slice that does not match, it must be
|
||||
// out of place.
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["false"]}
|
||||
%r0 = subtensor %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
%r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
|
||||
// %r1 can bufferize inplace fine.
|
||||
// CHECK: subtensor_insert
|
||||
// CHECK: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%r1 = subtensor_insert %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
// %r3 does not bufferize inplace because %B is not inplaceable.
|
||||
// %r0 is an overlapping subtensor that does not match, but does not alias with
|
||||
// the buffer coming from %r3 so it can actually bufferize inplace.
|
||||
// CHECK: subtensor
|
||||
// %r0 is an overlapping tensor.extract_slice that does not match, but does
|
||||
// not alias with the buffer coming from %r3 so it can actually bufferize
|
||||
// inplace.
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%r2 = subtensor %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
%r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
|
||||
// %r3 cannot bufferize inplace since %B is not inplaceable.
|
||||
// CHECK: subtensor_insert
|
||||
// CHECK: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["false"]}
|
||||
%r3 = subtensor_insert %r2 into %B[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r3 = tensor.insert_slice %r2 into %B[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
return %r1, %r3: tensor<?xf32>, tensor<?xf32>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_matching_subtensor_insert
|
||||
func @subtensor_matching_subtensor_insert(
|
||||
// CHECK-LABEL: func @extract_slice_matching_insert_slice
|
||||
func @extract_slice_matching_insert_slice(
|
||||
%A : tensor<?xf32> {linalg.inplaceable = true},
|
||||
%B : tensor<?xf32>)
|
||||
-> (tensor<?xf32>, tensor<?xf32>)
|
||||
{
|
||||
// %r1 bufferizes inplace because %A is inplaceable.
|
||||
// %r0 is a subtensor that matches, it can also be bufferized inplace.
|
||||
// CHECK: subtensor
|
||||
// %r0 is a tensor.extract_slice that matches, it can also be bufferized
|
||||
// inplace.
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%r0 = subtensor %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
%r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
|
||||
// CHECK: subtensor_insert
|
||||
// CHECK: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%r1 = subtensor_insert %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
// %r2 is a subtensor that matches %r3, it can be bufferized inplace.
|
||||
// CHECK: subtensor
|
||||
// %r2 is a tensor.extract_slice that matches %r3, it can be bufferized
|
||||
// inplace.
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%r2 = subtensor %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
%r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
|
||||
// subtensor_insert cannot bufferize inplace.
|
||||
// tensor.insert_slice cannot bufferize inplace.
|
||||
// This should have been captured by a canonicalization pattern and it would
|
||||
// be unproductive to have special logic in bufferization to encode matching
|
||||
// subtensor_insert(subtensor(A), A).
|
||||
// CHECK: subtensor_insert
|
||||
// insert_slice(extract_slice(A), A).
|
||||
// CHECK: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["false"]}
|
||||
%r3 = subtensor_insert %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
return %r1, %r3: tensor<?xf32>, tensor<?xf32>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_linalg_readonly_use
|
||||
func @subtensor_linalg_readonly_use(
|
||||
// CHECK-LABEL: func @extract_slice_linalg_readonly_use
|
||||
func @extract_slice_linalg_readonly_use(
|
||||
%A : tensor<?x?xf32>,
|
||||
%B : tensor<4x4xf32>,
|
||||
%C : tensor<4x4xf32> {linalg.inplaceable = true})
|
||||
-> (tensor<4x4xf32>, tensor<4x4xf32>)
|
||||
{
|
||||
// subtensor is only used as a read, no interference irrespective of user's
|
||||
// inplace status.
|
||||
// CHECK: subtensor
|
||||
// tensor.extract_slice is only used as a read, no interference irrespective
|
||||
// of user's inplace status.
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%sA = subtensor %A[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
%sA = tensor.extract_slice %A[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
|
||||
// matmul output operand is not inplaceable at the function boundary.
|
||||
// CHECK: linalg.matmul
|
||||
|
@ -236,8 +240,8 @@ func @subtensor_linalg_readonly_use(
|
|||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_to_linalg_write_use
|
||||
func @subtensor_to_linalg_write_use(
|
||||
// CHECK-LABEL: func @extract_slice_to_linalg_write_use
|
||||
func @extract_slice_to_linalg_write_use(
|
||||
%A : tensor<4x4xf32>,
|
||||
%B : tensor<?x?xf32>,
|
||||
%C : tensor<?x?xf32> {linalg.inplaceable = true})
|
||||
|
@ -245,9 +249,9 @@ func @subtensor_to_linalg_write_use(
|
|||
{
|
||||
// Step 3. %sB forward propagates to a write in %D but it is not inplace.
|
||||
// So this is only ever read and can bufferize inplace.
|
||||
// CHECK: subtensor
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%sB = subtensor %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
%sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
|
||||
// Step 2. %sB has a read interference in %E, it does not bufferize inplace.
|
||||
// CHECK: linalg.matmul
|
||||
|
@ -259,12 +263,12 @@ func @subtensor_to_linalg_write_use(
|
|||
// Step 4. %sC forward propagates to an inplace write in %E.
|
||||
// %sC backward propagates to %C which is inplaceable.
|
||||
// As a consequence this is bufferized inplace.
|
||||
// CHECK: subtensor
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%sC = subtensor %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
%sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
|
||||
// Step 1. %sC backprops to the subtensor producer which is not considered an
|
||||
// interference. This bufferizes inplace.
|
||||
// Step 1. %sC backprops to the tensor.extract_slice producer which is not
|
||||
// considered an interference. This bufferizes inplace.
|
||||
// CHECK: linalg.matmul
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%E = linalg.matmul ins(%A, %sB: tensor<4x4xf32>, tensor<4x4xf32>)
|
||||
|
@ -280,8 +284,8 @@ func @subtensor_to_linalg_write_use(
|
|||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_to_linalg_write_use
|
||||
func @subtensor_to_linalg_write_use(
|
||||
// CHECK-LABEL: func @extract_slice_to_linalg_write_use
|
||||
func @extract_slice_to_linalg_write_use(
|
||||
%A : tensor<4x4xf32>,
|
||||
%B : tensor<?x?xf32>,
|
||||
%C : tensor<?x?xf32> {linalg.inplaceable = true})
|
||||
|
@ -290,12 +294,12 @@ func @subtensor_to_linalg_write_use(
|
|||
// Step 4. %sB forward propagates to an inplace write in %D.
|
||||
// %sB backward propagates to %B which is not inplaceable.
|
||||
// As a consequence this is bufferized out of place.
|
||||
// CHECK: subtensor
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["false"]}
|
||||
%sB = subtensor %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
%sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
|
||||
// Step 1. %sB backprops to the subtensor producer which is not considered an
|
||||
// interference. This bufferizes inplace.
|
||||
// Step 1. %sB backprops to the tensor.extract_slice producer which is not
|
||||
// considered an interference. This bufferizes inplace.
|
||||
// CHECK: linalg.matmul
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%D = linalg.matmul ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
|
||||
|
@ -305,12 +309,12 @@ func @subtensor_to_linalg_write_use(
|
|||
// Step 3. %sC forward propagates to an inplace write in %E.
|
||||
// %sC backward propagates to %C which is inplaceable.
|
||||
// As a consequence this is bufferized inplace.
|
||||
// CHECK: subtensor
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%sC = subtensor %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
%sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
|
||||
// Step 1. %sC backprops to the subtensor producer which is not considered an
|
||||
// interference. This bufferizes inplace.
|
||||
// Step 1. %sC backprops to the tensor.extract_slice producer which is not
|
||||
// considered an interference. This bufferizes inplace.
|
||||
// CHECK: linalg.matmul
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%E = linalg.matmul ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
|
||||
|
@ -322,8 +326,8 @@ func @subtensor_to_linalg_write_use(
|
|||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @nested_subtensor_and_insert
|
||||
func @nested_subtensor_and_insert(
|
||||
// CHECK-LABEL: func @nested_extract_slice_and_insert
|
||||
func @nested_extract_slice_and_insert(
|
||||
%A : tensor<?x?xf32>,
|
||||
%B : tensor<?x?xf32> {linalg.inplaceable = true},
|
||||
%C : tensor<?x?xf32> {linalg.inplaceable = true},
|
||||
|
@ -332,75 +336,78 @@ func @nested_subtensor_and_insert(
|
|||
{
|
||||
%f0 = constant 0.0 : f32
|
||||
|
||||
// 2-level matching subtensor / subtensor_insert into non inplaceable %A.
|
||||
// 2-level matching tensor.extract_slice / tensor.insert_slice into non
|
||||
// inplaceable %A.
|
||||
// - %rA is not inplaceable because %A is not inplaceable at function boundary.
|
||||
// - once %rA is deemed not inplaceable, nothing prevents %rsA from being inplaceable
|
||||
// - this propagates to %FA and %ssA being inplaceable.
|
||||
// - %sA would then bufferize to an inplace write (i.e. %FA) but %A is not
|
||||
// inplaceable and so %sA is not inplaceable.
|
||||
// CHECK: subtensor
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["false"]}
|
||||
// CHECK-NEXT: subtensor
|
||||
// CHECK-NEXT: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
// CHECK-NEXT: fill
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
// CHECK-NEXT: subtensor_insert
|
||||
// CHECK-NEXT: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
// CHECK-NEXT: subtensor_insert
|
||||
// CHECK-NEXT: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["false"]}
|
||||
%sA = subtensor %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%ssA = subtensor %sA[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
%sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
%FA = linalg.fill(%ssA, %f0) : tensor<4x4xf32>, f32 -> tensor<4x4xf32>
|
||||
%rsA = subtensor_insert %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
|
||||
%rA = subtensor_insert %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
%rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
|
||||
%rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
|
||||
// 3-level matching subtensor / subtensor_insert into inplaceable %B.
|
||||
// CHECK-NEXT: subtensor
|
||||
// 3-level matching tensor.extract_slice / tensor.insert_slice into
|
||||
// inplaceable %B.
|
||||
// CHECK-NEXT: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
// CHECK-NEXT: subtensor
|
||||
// Atm, this 2nd subtensor fails to bufferize inplace because clobbering
|
||||
// analysis conservatively test for equivalent buffers.
|
||||
// CHECK-NEXT: tensor.extract_slice
|
||||
// Atm, this 2nd tensor.extract_slice fails to bufferize inplace because
|
||||
// clobbering analysis conservatively tests for equivalent buffers.
|
||||
// TODO: This is currently too restrictive and misses clobberings.
|
||||
// When available, use container-containee analysis.
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["false"]}
|
||||
// CHECK-NEXT: subtensor
|
||||
// CHECK-NEXT: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
// CHECK-NEXT: fill
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
// CHECK-NEXT: subtensor_insert
|
||||
// CHECK-NEXT: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
// CHECK-NEXT: subtensor_insert
|
||||
// CHECK-NEXT: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
// CHECK-NEXT: subtensor_insert
|
||||
// CHECK-NEXT: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%sB = subtensor %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%ssB = subtensor %sB[0, 0][4, %idx][1, 1] : tensor<?x?xf32> to tensor<4x?xf32>
|
||||
%sssB = subtensor %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32>
|
||||
%sB = tensor.extract_slice %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%ssB = tensor.extract_slice %sB[0, 0][4, %idx][1, 1] : tensor<?x?xf32> to tensor<4x?xf32>
|
||||
%sssB = tensor.extract_slice %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32>
|
||||
%FB = linalg.fill(%sssB, %f0) : tensor<4x4xf32>, f32 -> tensor<4x4xf32>
|
||||
%rssB = subtensor_insert %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32>
|
||||
%rsB = subtensor_insert %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor<?x?xf32>
|
||||
%rB = subtensor_insert %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
%rssB = tensor.insert_slice %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32>
|
||||
%rsB = tensor.insert_slice %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor<?x?xf32>
|
||||
%rB = tensor.insert_slice %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
|
||||
// 2-level matching subtensor / subtensor_insert into inplaceable %C with a twist.
|
||||
// 2-level matching tensor.extract_slice / tensor.insert_slice into
|
||||
// inplaceable %C with a twist.
|
||||
// Throw a wrench in the system: %rsC production sizes do not match %ssC.
|
||||
// CHECK-NEXT: subtensor
|
||||
// CHECK-NEXT: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
// The subtensor_insert that would be candidate for matching does not actually
|
||||
// match. That subtensor_insert can still be bufferized inplace nonetheless
|
||||
// but this subtensor, which bufferizes to an inplace write, cannot.
|
||||
// CHECK-NEXT: subtensor
|
||||
// The tensor.insert_slice that would be candidate for matching does not actually
|
||||
// match. That tensor.insert_slice can still be bufferized inplace nonetheless
|
||||
// but this tensor.extract_slice, which bufferizes to an inplace write, cannot.
|
||||
// CHECK-NEXT: tensor.extract_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["false"]}
|
||||
// CHECK-NEXT: fill
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
// CHECK-NEXT: subtensor_insert
|
||||
// CHECK-NEXT: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
// CHECK-NEXT: subtensor_insert
|
||||
// CHECK-NEXT: tensor.insert_slice
|
||||
// CHECK-SAME: {__inplace_results_attr__ = ["true"]}
|
||||
%sC = subtensor %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%ssC = subtensor %sC[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
%sC = tensor.extract_slice %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%ssC = tensor.extract_slice %sC[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
|
||||
%FC = linalg.fill(%ssC, %f0) : tensor<4x4xf32>, f32 -> tensor<4x4xf32>
|
||||
%rsC = subtensor_insert %FC into %sC[0, 0][12345, 67890][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
|
||||
%rC = subtensor_insert %rsC into %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
%rsC = tensor.insert_slice %FC into %sC[0, 0][12345, 67890][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
|
||||
%rC = tensor.insert_slice %rsC into %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
|
||||
return %rA, %rB, %rC: tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>
|
||||
}
|
||||
|
|
|
@ -118,8 +118,8 @@ func @vec_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vec
|
|||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_insert_fun
|
||||
func @subtensor_insert_fun(%A0 : tensor<?xf32>, %A1 : tensor<?xf32> {linalg.inplaceable = true},
|
||||
// CHECK-LABEL: func @insert_slice_fun
|
||||
func @insert_slice_fun(%A0 : tensor<?xf32>, %A1 : tensor<?xf32> {linalg.inplaceable = true},
|
||||
%t0 : tensor<4xf32>, %t1 : tensor<4xf32> {linalg.inplaceable = true})
|
||||
-> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
|
||||
{
|
||||
|
@ -128,40 +128,40 @@ func @subtensor_insert_fun(%A0 : tensor<?xf32>, %A1 : tensor<?xf32> {linalg.inpl
|
|||
// CHECK: %[[BUFFER_CAST_t0:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
|
||||
// CHECK: %[[BUFFER_CAST_t1:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
|
||||
|
||||
// Alloc and copy the whole result tensor. Copy the subtensor.
|
||||
// Alloc and copy the whole result tensor. Copy the inserted slice.
|
||||
// CHECK: %[[REALLOC_A0:.*]] = memref.alloc
|
||||
// CHECK: linalg.copy(%[[BUFFER_CAST_A0]]
|
||||
// CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC_A0]]
|
||||
// CHECK: linalg.copy(%[[BUFFER_CAST_t0]], %[[SV_A0]])
|
||||
%r0 = subtensor_insert %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
// Alloc and copy the whole result tensor. Copy the subtensor.
|
||||
// Alloc and copy the whole result tensor. Copy the inserted slice.
|
||||
// CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc
|
||||
// CHECK: linalg.copy(%[[BUFFER_CAST_A0]]
|
||||
// CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC_A0_2]]
|
||||
// CHECK: linalg.copy(%[[BUFFER_CAST_t1]], %[[SV_A0_2]])
|
||||
%r1 = subtensor_insert %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
// Still alloc the large tensor because %A1 is read after. Copy the subtensor.
|
||||
// Still alloc the large tensor because %A1 is read after. Copy the inserted slice.
|
||||
// CHECK: %[[REALLOC_A1:.*]] = memref.alloc
|
||||
// CHECK: linalg.copy(%[[BUFFER_CAST_A1]]
|
||||
// CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC_A1]]
|
||||
// CHECK: linalg.copy(%[[BUFFER_CAST_t0]], %[[SV_A1]])
|
||||
%r2 = subtensor_insert %t0 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r2 = tensor.insert_slice %t0 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
// Do not realloc the large tensor. Copy the subtensor.
|
||||
// Do not realloc the large tensor. Copy the inserted slice.
|
||||
// CHECK-NOT: alloc
|
||||
// CHECK: %[[SV_A1_2:.*]] = memref.subview %[[BUFFER_CAST_A1]]
|
||||
// CHECK: linalg.copy(%[[BUFFER_CAST_t1]], %[[SV_A1_2]])
|
||||
%r3 = subtensor_insert %t1 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r3 = tensor.insert_slice %t1 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
return %r0, %r1, %r2, %r3: tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_insert_fun
|
||||
func @subtensor_insert_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
|
||||
// CHECK-LABEL: func @insert_slice_fun
|
||||
func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
|
||||
-> tensor<?xf32>
|
||||
{
|
||||
%f0 = constant 0.0 : f32
|
||||
|
@ -172,7 +172,7 @@ func @subtensor_insert_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t :
|
|||
// CHECK-NOT: alloc
|
||||
// CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]]
|
||||
// CHECK: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]])
|
||||
%r0 = subtensor_insert %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
/// Overwrite BUFFER_CAST_A inplace.
|
||||
// CHECK: linalg.fill(%[[BUFFER_CAST_A]]
|
||||
|
@ -182,8 +182,8 @@ func @subtensor_insert_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t :
|
|||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_insert_fun
|
||||
func @subtensor_insert_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
|
||||
// CHECK-LABEL: func @insert_slice_fun
|
||||
func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
|
||||
-> tensor<?xf32>
|
||||
{
|
||||
%f0 = constant 0.0 : f32
|
||||
|
@ -198,15 +198,15 @@ func @subtensor_insert_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t :
|
|||
// CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]]
|
||||
/// Overwrite BUFFER_CAST_A inplace by copying into the subview.
|
||||
// CHECK: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]])
|
||||
%r1 = subtensor_insert %t into %r0[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r1 = tensor.insert_slice %t into %r0[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
return %r1: tensor<?xf32>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_insert_fun_not_inplace
|
||||
func @subtensor_insert_fun_not_inplace(%A : tensor<?xf32>, %t : tensor<4xf32>)
|
||||
// CHECK-LABEL: func @insert_slice_fun_not_inplace
|
||||
func @insert_slice_fun_not_inplace(%A : tensor<?xf32>, %t : tensor<4xf32>)
|
||||
-> tensor<?xf32>
|
||||
{
|
||||
// CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
|
||||
|
@ -217,14 +217,14 @@ func @subtensor_insert_fun_not_inplace(%A : tensor<?xf32>, %t : tensor<4xf32>)
|
|||
// CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32>
|
||||
// CHECK: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]]) : memref<4xf32, #map>, memref<4xf32>
|
||||
// CHECK: memref.dealloc %[[ALLOC]] : memref<?xf32>
|
||||
%r0 = subtensor_insert %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
return %r0: tensor<?xf32>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_insert_fun_not_inplace
|
||||
func @subtensor_insert_fun_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
|
||||
// CHECK-LABEL: func @insert_slice_fun_not_inplace
|
||||
func @insert_slice_fun_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
|
||||
-> (tensor<?xf32>, tensor<?xf32>)
|
||||
{
|
||||
%f0 = constant 0.0 : f32
|
||||
|
@ -232,10 +232,10 @@ func @subtensor_insert_fun_not_inplace(%A : tensor<?xf32> {linalg.inplaceable =
|
|||
// CHECK-DAG: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32{{.*}}
|
||||
// CHECK-DAG: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32{{.*}}
|
||||
|
||||
// subtensor_insert is bufferized first, %A is inplaceable so we can make this inplace
|
||||
// tensor.insert_slice is bufferized first, %A is inplaceable so we can make this inplace
|
||||
// CHECK-DAG: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]][0] [4] [1] : memref<?xf32, {{.*}}> to memref<4xf32, {{.*}}>
|
||||
// CHECK-DAG: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]]) : memref<4xf32, {{.*}}>, memref<4xf32, {{.*}}>
|
||||
%r0 = subtensor_insert %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
%r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
|
||||
|
||||
// fill would interfere with %r0 that is also being returned.
|
||||
// So we need to bufferize it out of place and make a new alloc.
|
||||
|
@ -253,8 +253,8 @@ func @subtensor_insert_fun_not_inplace(%A : tensor<?xf32> {linalg.inplaceable =
|
|||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor_fun
|
||||
func @subtensor_fun(%A : tensor<?xf32> {linalg.inplaceable = true})
|
||||
// CHECK-LABEL: func @extract_slice_fun
|
||||
func @extract_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true})
|
||||
-> tensor<4xf32>
|
||||
{
|
||||
// This bufferizes to a pattern that the cross-function boundary pass needs to
|
||||
|
@ -268,9 +268,8 @@ func @subtensor_fun(%A : tensor<?xf32> {linalg.inplaceable = true})
|
|||
// CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
|
||||
// CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]][0] [4] [1]
|
||||
// CHECK: %[[RES:.*]] = memref.tensor_load %[[SV]]
|
||||
%r0 = subtensor %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
%r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
|
||||
|
||||
// CHECK: return %[[RES]]
|
||||
return %r0: tensor<4xf32>
|
||||
}
|
||||
|
||||
|
|
|
@ -299,28 +299,28 @@ func @fold_unit_dim_for_init_tensor(%input: tensor<1x1000xf32>) -> tensor<1xf32>
|
|||
|
||||
// -----
|
||||
|
||||
func @fold_subtensor(
|
||||
func @fold_slice(
|
||||
%arg0 : tensor<1x?x?x1x?x1x1xf32>, %arg1 : tensor<1x?x?x?x?x1x1xf32>,
|
||||
%arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index,
|
||||
%arg6 : index, %arg7 : index) -> (tensor<1x?x?x1x?x1x1xf32>, tensor<1x?x?x1x?x1x1xf32>) {
|
||||
%0 = subtensor %arg0[0, %arg2, %arg3, 0, %arg4, 0, 0]
|
||||
[1, %arg5, %arg6, 1, %arg7, 1, 1] [1, 1, 1, 1, 1, 1, 1] :
|
||||
%0 = tensor.extract_slice %arg0[0, %arg2, %arg3, 0, %arg4, 0, 0]
|
||||
[1, %arg5, %arg6, 1, %arg7, 1, 1] [1, 1, 1, 1, 1, 1, 1] :
|
||||
tensor<1x?x?x1x?x1x1xf32> to tensor<1x?x?x1x?x1x1xf32>
|
||||
%1 = subtensor %arg1[%arg2, 0, %arg3, 0, 0, %arg4, 0]
|
||||
[1, %arg5, %arg6, 1, %arg7, 1, 1] [1, 1, 1, 1, 1, 1, 1] :
|
||||
%1 = tensor.extract_slice %arg1[%arg2, 0, %arg3, 0, 0, %arg4, 0]
|
||||
[1, %arg5, %arg6, 1, %arg7, 1, 1] [1, 1, 1, 1, 1, 1, 1] :
|
||||
tensor<1x?x?x?x?x1x1xf32> to tensor<1x?x?x1x?x1x1xf32>
|
||||
return %0, %1 : tensor<1x?x?x1x?x1x1xf32>, tensor<1x?x?x1x?x1x1xf32>
|
||||
}
|
||||
// CHECK: func @fold_subtensor
|
||||
// CHECK: func @fold_slice
|
||||
// CHECK-SAME: %[[ARG0:.+]]: tensor<1x?x?x1x?x1x1xf32>
|
||||
// CHECK-SAME: %[[ARG1:.+]]: tensor<1x?x?x?x?x1x1xf32>
|
||||
// CHECK: %[[SUBTENSOR1:.+]] = subtensor %[[ARG0]]
|
||||
// CHECK: %[[SLICE1:.+]] = tensor.extract_slice %[[ARG0]]
|
||||
// CHECK-SAME: to tensor<?x?x?xf32>
|
||||
// CHECK: %[[RESULT1:.+]] = linalg.tensor_expand_shape %[[SUBTENSOR1]]
|
||||
// CHECK: %[[RESULT1:.+]] = linalg.tensor_expand_shape %[[SLICE1]]
|
||||
// CHECK-SAME: [0, 1], [2], [3, 4, 5, 6]
|
||||
// CHECK: %[[SUBTENSOR2:.+]] = subtensor %[[ARG1]]
|
||||
// CHECK: %[[SLICE2:.+]] = tensor.extract_slice %[[ARG1]]
|
||||
// CHECK-SAME: to tensor<?x?x?xf32>
|
||||
// CHECK: %[[RESULT2:.+]] = linalg.tensor_expand_shape %[[SUBTENSOR2]]
|
||||
// CHECK: %[[RESULT2:.+]] = linalg.tensor_expand_shape %[[SLICE2]]
|
||||
// CHECK-SAME: [0, 1], [2], [3, 4, 5, 6]
|
||||
// CHECK: return %[[RESULT1]], %[[RESULT2]]
|
||||
|
||||
|
@ -430,25 +430,25 @@ func @unit_dim_for_reduction_inner(%arg0: tensor<?x1x?x1xf32>) -> tensor<?x1xf32
|
|||
|
||||
// -----
|
||||
|
||||
func @subtensor_unit_dims(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> {
|
||||
%0 = subtensor %arg0[0, 2] [1, 1] [1, 1] : tensor<1x3xf32> to tensor<1x1xf32>
|
||||
func @slice_unit_dims(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> {
|
||||
%0 = tensor.extract_slice %arg0[0, 2] [1, 1] [1, 1] : tensor<1x3xf32> to tensor<1x1xf32>
|
||||
return %0 : tensor<1x1xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @subtensor_unit_dims
|
||||
// CHECK: %[[SUBTENSOR:.+]] = subtensor
|
||||
// CHECK-LABEL: func @slice_unit_dims
|
||||
// CHECK: %[[SLICE:.+]] = tensor.extract_slice
|
||||
// CHECK-SAME: tensor<1x3xf32> to tensor<f32>
|
||||
// CHECK: %[[RESULT:.+]] = linalg.tensor_expand_shape %[[SUBTENSOR]] []
|
||||
// CHECK: %[[RESULT:.+]] = linalg.tensor_expand_shape %[[SLICE]] []
|
||||
// CHECK: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
func @subtensor_insert_unit_dims(%arg0: tensor<1x3xf32>, %arg1: tensor<1x1xf32>) -> tensor<1x3xf32> {
|
||||
%0 = subtensor_insert %arg1 into %arg0[0, 2] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x3xf32>
|
||||
func @insert_slice_unit_dims(%arg0: tensor<1x3xf32>, %arg1: tensor<1x1xf32>) -> tensor<1x3xf32> {
|
||||
%0 = tensor.insert_slice %arg1 into %arg0[0, 2] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x3xf32>
|
||||
return %0 : tensor<1x3xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @subtensor_insert_unit_dims
|
||||
// CHECK-LABEL: func @insert_slice_unit_dims
|
||||
// CHECK: %[[RESHAPE:.+]] = linalg.tensor_collapse_shape %{{.+}} []
|
||||
// CHECK: %[[RESULT:.+]] = subtensor_insert %[[RESHAPE]]
|
||||
// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[RESHAPE]]
|
||||
// CHECK-SAME: tensor<f32> into tensor<1x3xf32>
|
||||
// CHECK: return %[[RESULT]]
|
||||
|
||||
|
|
|
@ -175,18 +175,18 @@ module {
|
|||
// CHECK: %[[INIT:.+]] = linalg.init_tensor
|
||||
// CHECK: %[[R0:.+]] = scf.for %{{.+}} to %{{.+}} step %{{.+}} iter_args(%[[ARG5:.+]] = %[[INIT]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK: %[[R1:.+]] = scf.for %{{.+}} to %{{.+}} step %{{.+}} iter_args(%[[ARG7:.+]] = %[[ARG5]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK-DAG: %[[STARG3:.+]] = subtensor %[[ARG3]]
|
||||
// CHECK-DAG: %[[STARG7:.+]] = subtensor %[[ARG7]]
|
||||
// CHECK-DAG: %[[STARG0:.+]] = subtensor %[[ARG0]]
|
||||
// CHECK-DAG: %[[STARG1:.+]] = subtensor %[[ARG1]]
|
||||
// CHECK-DAG: %[[STARG2:.+]] = subtensor %[[ARG2]]
|
||||
// CHECK-DAG: %[[STARG3:.+]] = tensor.extract_slice %[[ARG3]]
|
||||
// CHECK-DAG: %[[STARG7:.+]] = tensor.extract_slice %[[ARG7]]
|
||||
// CHECK-DAG: %[[STARG0:.+]] = tensor.extract_slice %[[ARG0]]
|
||||
// CHECK-DAG: %[[STARG1:.+]] = tensor.extract_slice %[[ARG1]]
|
||||
// CHECK-DAG: %[[STARG2:.+]] = tensor.extract_slice %[[ARG2]]
|
||||
// CHECK: %[[T0:.+]] = linalg.matmul
|
||||
// CHECK-SAME: ins(%[[STARG0]], %[[STARG1]] : tensor<?x?xf32>, tensor<?x?xf32>)
|
||||
// CHECK-SAME: outs(%[[STARG2]] : tensor<?x?xf32>) -> tensor<?x?xf32>
|
||||
// CHECK: %[[T1:.+]] = linalg.generic
|
||||
// CHECK-SAME: ins(%[[T0:.+]], %[[STARG3]] : tensor<?x?xf32>, tensor<?xf32>)
|
||||
// CHECK-SAME: outs(%[[STARG7]] : tensor<?x?xf32>)
|
||||
// CHECK: %[[RESULT:.+]] = subtensor_insert %[[T1]] into %[[ARG7]]
|
||||
// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[T1]] into %[[ARG7]]
|
||||
// CHECK: scf.yield %[[RESULT]]
|
||||
// CHECK: }
|
||||
// CHECK: scf.yield %[[R1]]
|
||||
|
@ -229,21 +229,21 @@ module {
|
|||
// CHECK: %[[M_1:.+]] = memref.dim %[[ARG8]], %[[C0]]
|
||||
// CHECK: %[[TILE_M_1:.+]] = affine.min #[[MAP0]](%[[M_1]], %[[IV0]])
|
||||
// CHECK: %[[N3:.+]] = memref.dim %[[ARG8]], %[[C1]]
|
||||
// CHECK: %[[STARG6:.+]] = subtensor %[[ARG8]][%[[IV0]], 0]
|
||||
// CHECK: %[[STARG6:.+]] = tensor.extract_slice %[[ARG8]][%[[IV0]], 0]
|
||||
// CHECK-SAME: [%[[TILE_M_1]], %[[N3]]]
|
||||
// CHECK: %[[M_2:.+]] = memref.dim %[[ARG4]], %[[C0]]
|
||||
// CHECK: %[[TILE_M_2:.+]] = affine.min #[[MAP1]](%[[IV0]])[%[[M_2]], %[[M]]]
|
||||
// CHECK: %[[N2:.+]] = memref.dim %[[ARG4]], %[[C1]]
|
||||
// CHECK: %[[STARG4:.+]] = subtensor %[[ARG4]][%[[IV0]], 0]
|
||||
// CHECK: %[[STARG4:.+]] = tensor.extract_slice %[[ARG4]][%[[IV0]], 0]
|
||||
// CHECK-SAME: [%[[TILE_M_2]], %[[N2]]]
|
||||
// CHECK: %[[TILE_M_3:.+]] = affine.min #[[MAP1]](%[[IV0]])[%[[M]], %[[M]]]
|
||||
// CHECK: %[[N0:.+]] = memref.dim %[[ARG0]], %[[C1]]
|
||||
// CHECK: %[[STARG0:.+]] = subtensor %[[ARG0]][%[[IV0]], 0]
|
||||
// CHECK: %[[STARG0:.+]] = tensor.extract_slice %[[ARG0]][%[[IV0]], 0]
|
||||
// CHECK-SAME: [%[[TILE_M_3]], %[[N0]]]
|
||||
// CHECK: %[[M_3:.+]] = memref.dim %[[ARG2]], %[[C0]]
|
||||
// CHECK: %[[TILE_M_4:.+]] = affine.min #[[MAP1]](%[[IV0]])[%[[M_3]], %[[M]]]
|
||||
// CHECK: %[[N1:.+]] = memref.dim %[[ARG2]], %[[C1]]
|
||||
// CHECK: %[[STARG2:.+]] = subtensor %[[ARG2]][%[[IV0]], 0]
|
||||
// CHECK: %[[STARG2:.+]] = tensor.extract_slice %[[ARG2]][%[[IV0]], 0]
|
||||
// CHECK-SAME: [%[[TILE_M_4]], %[[N1]]]
|
||||
// CHECK: %[[T0:.+]] = linalg.matmul
|
||||
// CHECK-SAME: ins(%[[STARG0]], %[[ARG1]] : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
|
@ -254,7 +254,7 @@ module {
|
|||
// CHECK: %[[T2:.+]] = linalg.matmul
|
||||
// CHECK-SAME: ins(%[[T1]], %arg5 : tensor<?x?xf32>, tensor<?x?xf32>
|
||||
// CHECK-SAME: ) outs(%[[STARG6]] : tensor<?x?xf32>)
|
||||
// CHECK: %[[R1:.+]] = subtensor_insert %[[T2]]
|
||||
// CHECK: %[[R1:.+]] = tensor.insert_slice %[[T2]]
|
||||
// CHECK-SAME: into %[[ARG8]][%[[IV0]], 0] [%[[TILE_M_1]], %[[N3]]]
|
||||
// CHECK: scf.yield %[[R1]] : tensor<?x?xf32>
|
||||
// CHECK: }
|
||||
|
|
|
@ -38,16 +38,16 @@ module {
|
|||
// CHECK: %[[M_2:.+]] = memref.dim %[[ARG6]], %[[C0]]
|
||||
// CHECK: %[[TILE_M_2:.+]] = affine.min #[[MAP1]](%[[M_2]], %[[IV0]])
|
||||
// CHECK: %[[N3:.+]] = memref.dim %[[ARG6]], %[[C1]]
|
||||
// CHECK: %[[ST_ARG6:.+]] = subtensor %[[ARG6]][%[[IV0]], 0]
|
||||
// CHECK: %[[ST_ARG6:.+]] = tensor.extract_slice %[[ARG6]][%[[IV0]], 0]
|
||||
// CHECK-SAME: [%[[TILE_M_2]], %[[N3]]]
|
||||
// CHECK: %[[TILE_M_3:.+]] = affine.min #[[MAP5]](%[[IV0]])[%[[M]], %[[M]]]
|
||||
// CHECK: %[[N1:.+]] = memref.dim %[[ARG0]], %[[C1]]
|
||||
// CHECK: %[[ST_ARG0:.+]] = subtensor %[[ARG0]][%[[IV0]], 0]
|
||||
// CHECK: %[[ST_ARG0:.+]] = tensor.extract_slice %[[ARG0]][%[[IV0]], 0]
|
||||
// CHECK-SAME: [%[[TILE_M_3]], %[[N1]]]
|
||||
// CHECK: %[[M_3:.+]] = memref.dim %[[ARG2]], %[[C0]]
|
||||
// CHECK: %[[TILE_M_4:.+]] = affine.min #[[MAP5]](%[[IV0]])[%[[M_3]], %[[M]]]
|
||||
// CHECK: %[[N2_2:.+]] = memref.dim %[[ARG2]], %[[C1]]
|
||||
// CHECK: %[[ST_ARG2:.+]] = subtensor %[[ARG2]][%[[IV0]], 0]
|
||||
// CHECK: %[[ST_ARG2:.+]] = tensor.extract_slice %[[ARG2]][%[[IV0]], 0]
|
||||
// CHECK-SAME: [%[[TILE_M_4]], %[[N2_2]]]
|
||||
// CHECK: %[[LHS:.+]] = linalg.matmul
|
||||
// CHECK-SAME: __internal_linalg_transform__ = "after_lhs_fusion_producer"
|
||||
|
@ -62,30 +62,30 @@ module {
|
|||
// CHECK-SAME: %[[C0]] to %[[N2]] step %[[C16]]
|
||||
// CHECK-SAME: iter_args(%[[ARG10:.+]] = %[[ARG8]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK: %[[TILE_N2:.+]] = affine.min #[[MAP2]](%[[IV2]])[%[[N2]]]
|
||||
// CHECK: %[[ST_LHS:.+]] = subtensor %[[LHS]][0, %[[IV2]]]
|
||||
// CHECK: %[[ST_LHS:.+]] = tensor.extract_slice %[[LHS]][0, %[[IV2]]]
|
||||
// CHECK-SAME: [%[[TILE_M_3]], %[[TILE_N2]]]
|
||||
// CHECK: %[[N2_3:.+]] = memref.dim %[[ARG3]], %[[C0]]
|
||||
// CHECK: %[[TILE_N2_2:.+]] = affine.min #[[MAP2]](%[[IV2]])[%[[N2_3]]]
|
||||
// CHECK: %[[TILE_N3:.+]] = affine.min #[[MAP3]](%[[IV1]])[%[[N3_2]]]
|
||||
// CHECK: %[[ST_ARG3:.+]] = subtensor %[[ARG3]][%[[IV2]], %[[IV1]]]
|
||||
// CHECK: %[[ST_ARG3:.+]] = tensor.extract_slice %[[ARG3]][%[[IV2]], %[[IV1]]]
|
||||
// CHECK-SAME: [%[[TILE_N2_2]], %[[TILE_N3]]]
|
||||
// CHECK: %[[M_4:.+]] = memref.dim %[[ARG10]], %[[C0]]
|
||||
// CHECK: %[[N3_3:.+]] = memref.dim %[[ARG10]], %[[C1]]
|
||||
// CHECK: %[[TILE_N3_2:.+]] = affine.min #[[MAP4]](%[[N3_3]], %[[IV1]])
|
||||
// CHECK: %[[ST_ARG4:.+]] = subtensor %[[ARG10]][0, %[[IV1]]]
|
||||
// CHECK: %[[ST_ARG4:.+]] = tensor.extract_slice %[[ARG10]][0, %[[IV1]]]
|
||||
// CHECK-SAME: [%[[M_4]], %[[TILE_N3_2]]]
|
||||
// CHECK: %[[ST_RESULT:.+]] = linalg.matmul
|
||||
// CHECK-SAME: __internal_linalg_transform__ = "after_lhs_fusion"
|
||||
// CHECK-SAME: ins(%[[ST_LHS]], %[[ST_ARG3]]
|
||||
// CHECK-SAME: : tensor<?x?xf32>, tensor<?x?xf32>)
|
||||
// CHECK-SAME: outs(%[[ST_ARG4]] : tensor<?x?xf32>)
|
||||
// CHECK: %[[UPDATE1:.+]] = subtensor_insert %[[ST_RESULT]]
|
||||
// CHECK: %[[UPDATE1:.+]] = tensor.insert_slice %[[ST_RESULT]]
|
||||
// CHECK-SAME: into %[[ARG10]][0, %[[IV1]]] [%[[M_4]], %[[TILE_N3_2]]]
|
||||
// CHECK: scf.yield %[[UPDATE1]]
|
||||
// CHECK: }
|
||||
// CHECK: scf.yield %[[YIELD1]]
|
||||
// CHECK: }
|
||||
// CHECK: %[[UPDATE0:.+]] = subtensor_insert %[[YIELD0]] into
|
||||
// CHECK: %[[UPDATE0:.+]] = tensor.insert_slice %[[YIELD0]] into
|
||||
// CHECK-SAME: %[[ARG6]][%[[IV0]], 0] [%[[TILE_M_2]], %[[N3]]]
|
||||
// CHECK: scf.yield %[[UPDATE0]]
|
||||
// CHECK: }
|
||||
|
@ -114,9 +114,9 @@ module {
|
|||
// TLOOP-SAME: %[[AB_INIT_:.*]] = %[[AB_INIT]]: tensor<?x?xf32>)
|
||||
// TLOOP-SAME: outs (%[[ABC_INIT_:.*]] = %[[ABC_INIT]]: tensor<?x?xf32>) {
|
||||
|
||||
// TLOOP: %[[ABC_INIT_SUB:.*]] = subtensor %[[ABC_INIT_]][%[[IV0]], 0]
|
||||
// TLOOP: %[[A_SUB:.*]] = subtensor %[[A_]][%[[IV0]], 0]
|
||||
// TLOOP: %[[AB_INIT_SUB:.*]] = subtensor %[[AB_INIT_]][%[[IV0]], 0]
|
||||
// TLOOP: %[[ABC_INIT_SUB:.*]] = tensor.extract_slice %[[ABC_INIT_]][%[[IV0]], 0]
|
||||
// TLOOP: %[[A_SUB:.*]] = tensor.extract_slice %[[A_]][%[[IV0]], 0]
|
||||
// TLOOP: %[[AB_INIT_SUB:.*]] = tensor.extract_slice %[[AB_INIT_]][%[[IV0]], 0]
|
||||
|
||||
// TLOOP: %[[AB_SUB:.*]] = linalg.matmul
|
||||
// TLOOP-SAME: ins(%[[A_SUB]], %[[B_]] : {{.*}}) outs(%[[AB_INIT_SUB]]
|
||||
|
@ -132,19 +132,19 @@ module {
|
|||
// TLOOP-SAME: outs (%[[ABC_INIT_SUB_:.*]] = %[[ABC_INIT_SUB]]: [[TY]])
|
||||
// TLOOP-SAME: iterators["parallel", "reduction"] {
|
||||
|
||||
// TLOOP: %[[AB_SUB_SUB:.*]] = subtensor %[[AB_SUB_]][0, %[[IV2]]]
|
||||
// TLOOP: %[[C__SUB:.*]] = subtensor %[[C__]][%[[IV2]], %[[IV1]]]
|
||||
// TLOOP: %[[ABS_INIT_SUB_SUB:.*]] = subtensor %[[ABC_INIT_SUB_]][0, %[[IV1]]]
|
||||
// TLOOP: %[[AB_SUB_SUB:.*]] = tensor.extract_slice %[[AB_SUB_]][0, %[[IV2]]]
|
||||
// TLOOP: %[[C__SUB:.*]] = tensor.extract_slice %[[C__]][%[[IV2]], %[[IV1]]]
|
||||
// TLOOP: %[[ABS_INIT_SUB_SUB:.*]] = tensor.extract_slice %[[ABC_INIT_SUB_]][0, %[[IV1]]]
|
||||
|
||||
// TLOOP: %[[ABC_SUB_SUB:.*]] = linalg.matmul
|
||||
// TLOOP-SAME: ins(%[[AB_SUB_SUB]], %[[C__SUB]] : [[TY]], [[TY]])
|
||||
// TLOOP-SAME: outs(%[[ABS_INIT_SUB_SUB]] : [[TY]]) -> [[TY]]
|
||||
|
||||
// TLOOP: %[[RES0:.*]] = subtensor_insert %[[ABC_SUB_SUB]]
|
||||
// TLOOP: %[[RES0:.*]] = tensor.insert_slice %[[ABC_SUB_SUB]]
|
||||
// TLOOP-SAME: into %[[ABC_INIT_SUB_]][0, %[[IV1]]]
|
||||
// TLOOP: linalg.yield %[[RES0]] : [[TY]]
|
||||
// TLOOP: }
|
||||
// TLOOP: %[[RES1:.*]] = subtensor_insert %[[ABC_SUB_]] into %[[ABC_INIT_]][%[[IV0]], 0]
|
||||
// TLOOP: %[[RES1:.*]] = tensor.insert_slice %[[ABC_SUB_]] into %[[ABC_INIT_]][%[[IV0]], 0]
|
||||
// TLOOP: linalg.yield %[[RES1]] : [[TY]]
|
||||
// TLOOP: }
|
||||
// TLOOP: return %[[ABC]] : [[TY]]
|
||||
|
@ -186,10 +186,10 @@ module {
|
|||
// CHECK-SAME: iter_args(%[[ARG4:.+]] = %{{[a-zA-Z0-9_]+}})
|
||||
// CHECK: %[[YIELD:.+]] = scf.for %[[IV1:[a-zA-Z0-9_]+]]
|
||||
// CHECK-SAME: iter_args(%[[ARG6:.+]] = %[[ARG4]])
|
||||
// CHECK: %[[ST_ARG6:.+]] = subtensor %[[ARG6]][%[[IV0]], %[[IV1]]]
|
||||
// CHECK: %[[ST_ARG0:.+]] = subtensor %[[ARG0]][%[[IV0]], 0]
|
||||
// CHECK: %[[ST_ARG1:.+]] = subtensor %[[ARG1]][0, %[[IV1]]]
|
||||
// CHECK: %[[ST_ARG2:.+]] = subtensor %[[ARG2]][%[[IV0]], %[[IV1]]]
|
||||
// CHECK: %[[ST_ARG6:.+]] = tensor.extract_slice %[[ARG6]][%[[IV0]], %[[IV1]]]
|
||||
// CHECK: %[[ST_ARG0:.+]] = tensor.extract_slice %[[ARG0]][%[[IV0]], 0]
|
||||
// CHECK: %[[ST_ARG1:.+]] = tensor.extract_slice %[[ARG1]][0, %[[IV1]]]
|
||||
// CHECK: %[[ST_ARG2:.+]] = tensor.extract_slice %[[ARG2]][%[[IV0]], %[[IV1]]]
|
||||
// CHECK: %[[LHS:.+]] = linalg.matmul
|
||||
// CHECK-SAME: ins(%[[ST_ARG0]], %[[ST_ARG1]]
|
||||
// CHECK-SAME: : tensor<?x?xf32>, tensor<?x?xf32>)
|
||||
|
@ -197,7 +197,7 @@ module {
|
|||
// CHECK: %[[ST_RESULT:.+]] = linalg.generic
|
||||
// CHECK-SAME: ins(%[[LHS]] : tensor<?x?xf32>)
|
||||
// CHECK-SAME: outs(%[[ST_ARG6]] : tensor<?x?xf32>)
|
||||
// CHECK: %[[UPDATE:.+]] = subtensor_insert %[[ST_RESULT]]
|
||||
// CHECK: %[[UPDATE:.+]] = tensor.insert_slice %[[ST_RESULT]]
|
||||
// CHECK-SAME: into %[[ARG6]][%[[IV0]], %[[IV1]]]
|
||||
// CHECK: scf.yield %[[UPDATE]]
|
||||
// CHECK: scf.yield %[[YIELD]]
|
||||
|
@ -226,10 +226,10 @@ module {
|
|||
// TLOOP-SAME: %[[AB_:.*]] = %[[AB]]: [[TY]])
|
||||
// TLOOP-SAME: outs (%[[INIT_:.*]] = %[[INIT]]: [[TY]]) {
|
||||
|
||||
// TLOOP: %[[INIT_SUB:.*]] = subtensor %[[INIT_]][%[[IV0]], %[[IV1]]]
|
||||
// TLOOP: %[[A_SUB:.*]] = subtensor %[[A_]][%[[IV0]], 0]
|
||||
// TLOOP: %[[B_SUB:.*]] = subtensor %[[B_]][0, %[[IV1]]]
|
||||
// TLOOP: %[[AB_SUB_INIT:.*]] = subtensor %[[AB_]][%[[IV0]], %[[IV1]]]
|
||||
// TLOOP: %[[INIT_SUB:.*]] = tensor.extract_slice %[[INIT_]][%[[IV0]], %[[IV1]]]
|
||||
// TLOOP: %[[A_SUB:.*]] = tensor.extract_slice %[[A_]][%[[IV0]], 0]
|
||||
// TLOOP: %[[B_SUB:.*]] = tensor.extract_slice %[[B_]][0, %[[IV1]]]
|
||||
// TLOOP: %[[AB_SUB_INIT:.*]] = tensor.extract_slice %[[AB_]][%[[IV0]], %[[IV1]]]
|
||||
|
||||
// TLOOP: %[[AB_SUB:.*]] = linalg.matmul
|
||||
// TLOOP-SAME: ins(%[[A_SUB]], %[[B_SUB]] : [[TY]], [[TY]])
|
||||
|
@ -238,7 +238,7 @@ module {
|
|||
// TLOOP: %[[DOUBLE_AB:.*]] = linalg.generic
|
||||
// TLOOP-SAME: ins(%[[AB_SUB]] : [[TY]]) outs(%[[INIT_SUB]] : [[TY]])
|
||||
|
||||
// TLOOP: %[[RESULT_SUB:.*]] = subtensor_insert
|
||||
// TLOOP: %[[RESULT_SUB:.*]] = tensor.insert_slice
|
||||
// TLOOP-SAME: %[[DOUBLE_AB:.*]] into %[[INIT_]][%[[IV0]], %[[IV1]]]
|
||||
|
||||
// TLOOP: linalg.yield %[[RESULT_SUB]] : [[TY]]
|
||||
|
@ -267,13 +267,13 @@ module {
|
|||
// CHECK-NOT: fill
|
||||
// CHECK: scf.for %[[I:.*]]{{.*}}iter_args(%{{.*}} = %[[ARG0]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK: scf.for %[[J:.*]]
|
||||
// CHECK: %[[ST:.*]] = subtensor %[[ARG0]]
|
||||
// CHECK: %[[ST:.*]] = tensor.extract_slice %[[ARG0]]
|
||||
// CHECK: %[[ST_FILL:.*]] = linalg.fill(%[[ST]], %[[C0]]) {__internal_linalg_transform__ = "after_out_fusion_producer"} : tensor<?x?xf32>, f32 -> tensor<?x?xf32>
|
||||
// CHECK: %[[ST_MM_RES:.*]] = scf.for %[[K:.*]]{{.*}}iter_args(%[[BB:.*]] = %[[ST_FILL]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK-NOT: fill
|
||||
// CHECK: %[[ST_MM:.*]] = linalg.matmul {__internal_linalg_transform__ = "after_out_fusion"} ins(%{{.*}}, %{{.*}} : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[BB]] : tensor<?x?xf32>) -> tensor<?x?xf32>
|
||||
// CHECK: scf.yield %[[ST_MM]] : tensor<?x?xf32>
|
||||
// CHECK: %[[MM:.*]] = subtensor_insert %[[ST_MM_RES]] into {{.*}}
|
||||
// CHECK: %[[MM:.*]] = tensor.insert_slice %[[ST_MM_RES]] into {{.*}}
|
||||
// CHECK: scf.yield %[[MM]] : tensor<?x?xf32>
|
||||
|
||||
|
||||
|
@ -300,9 +300,9 @@ module {
|
|||
// TLOOP-SAME: outs (%[[OUT_:.*]] = %[[OUT]]: [[TY]]) {
|
||||
|
||||
// TLOOP: %[[DIM_A__1:.*]] = memref.dim %[[A_]], %[[C1]] : [[TY]]
|
||||
// TLOOP: %[[A_SUB:.*]] = subtensor %[[A_]][%[[I]], 0]
|
||||
// TLOOP: %[[B_SUB:.*]] = subtensor %[[B_]][0, %[[J]]]
|
||||
// TLOOP: %[[OUT_SUB:.*]] = subtensor %[[OUT_]][%[[I]], %[[J]]]
|
||||
// TLOOP: %[[A_SUB:.*]] = tensor.extract_slice %[[A_]][%[[I]], 0]
|
||||
// TLOOP: %[[B_SUB:.*]] = tensor.extract_slice %[[B_]][0, %[[J]]]
|
||||
// TLOOP: %[[OUT_SUB:.*]] = tensor.extract_slice %[[OUT_]][%[[I]], %[[J]]]
|
||||
// TLOOP: %[[INIT_SUB:.*]] = linalg.fill(%[[OUT_SUB]], %[[C0_F32]])
|
||||
|
||||
// TLOOP: %[[AB_SUB:.*]] = linalg.tiled_loop (%[[K:.*]]) = (%[[C0]])
|
||||
|
@ -312,15 +312,15 @@ module {
|
|||
// TLOOP-SAME: outs (%[[INIT_SUB_:.*]] = %[[INIT_SUB]]: [[TY]])
|
||||
// TLOOP-SAME: iterators["reduction"] {
|
||||
|
||||
// TLOOP: %[[A_SUB_SUB:.*]] = subtensor %[[A_SUB_]][0, %[[K]]]
|
||||
// TLOOP: %[[B_SUB_SUB:.*]] = subtensor %[[B_SUB_]][%[[K]], 0]
|
||||
// TLOOP: %[[A_SUB_SUB:.*]] = tensor.extract_slice %[[A_SUB_]][0, %[[K]]]
|
||||
// TLOOP: %[[B_SUB_SUB:.*]] = tensor.extract_slice %[[B_SUB_]][%[[K]], 0]
|
||||
|
||||
// TLOOP: %[[AB_SUB_SUB:.*]] = linalg.matmul
|
||||
// TLOOP-SAME: ins(%[[A_SUB_SUB]], %[[B_SUB_SUB]] : [[TY]], [[TY]])
|
||||
// TLOOP-SAME: outs(%[[INIT_SUB_]] : [[TY]]) -> [[TY]]
|
||||
// TLOOP: linalg.yield %[[AB_SUB_SUB]] : [[TY]]
|
||||
// TLOOP: }
|
||||
// TLOOP: %[[SUB_RESULT:.*]] = subtensor_insert %[[AB_SUB]]
|
||||
// TLOOP: %[[SUB_RESULT:.*]] = tensor.insert_slice %[[AB_SUB]]
|
||||
// TLOOP-SAME: into %[[OUT_]][%[[I]], %[[J]]]
|
||||
// TLOOP: linalg.yield %[[SUB_RESULT]] : [[TY]]
|
||||
// TLOOP: }
|
||||
|
@ -371,9 +371,9 @@ module {
|
|||
// TLOOP-SAME: outs (%[[OUT_:.*]] = %[[OUT]]: [[TY]]) {
|
||||
|
||||
// TLOOP: %[[DIM_A__1:.*]] = memref.dim %[[A_]], %[[C1]] : [[TY]]
|
||||
// TLOOP: %[[A_SUB:.*]] = subtensor %[[A_]][%[[I]], 0]
|
||||
// TLOOP: %[[B_SUB:.*]] = subtensor %[[B_]][0, %[[J]]]
|
||||
// TLOOP: %[[OUT_SUB:.*]] = subtensor %[[OUT_]][%[[I]], %[[J]]]
|
||||
// TLOOP: %[[A_SUB:.*]] = tensor.extract_slice %[[A_]][%[[I]], 0]
|
||||
// TLOOP: %[[B_SUB:.*]] = tensor.extract_slice %[[B_]][0, %[[J]]]
|
||||
// TLOOP: %[[OUT_SUB:.*]] = tensor.extract_slice %[[OUT_]][%[[I]], %[[J]]]
|
||||
// TLOOP: %[[INIT_SUB:.*]] = linalg.generic
|
||||
// TLOOP-SAME: ins(%[[C0_F32_]]
|
||||
// TLOOP-SAME: outs(%[[OUT_SUB]]
|
||||
|
@ -385,15 +385,15 @@ module {
|
|||
// TLOOP-SAME: outs (%[[INIT_SUB_:.*]] = %[[INIT_SUB]]: [[TY]])
|
||||
// TLOOP-SAME: iterators["reduction"] {
|
||||
|
||||
// TLOOP: %[[A_SUB_SUB:.*]] = subtensor %[[A_SUB_]][0, %[[K]]]
|
||||
// TLOOP: %[[B_SUB_SUB:.*]] = subtensor %[[B_SUB_]][%[[K]], 0]
|
||||
// TLOOP: %[[A_SUB_SUB:.*]] = tensor.extract_slice %[[A_SUB_]][0, %[[K]]]
|
||||
// TLOOP: %[[B_SUB_SUB:.*]] = tensor.extract_slice %[[B_SUB_]][%[[K]], 0]
|
||||
|
||||
// TLOOP: %[[AB_SUB_SUB:.*]] = linalg.matmul
|
||||
// TLOOP-SAME: ins(%[[A_SUB_SUB]], %[[B_SUB_SUB]] : [[TY]], [[TY]])
|
||||
// TLOOP-SAME: outs(%[[INIT_SUB_]] : [[TY]]) -> [[TY]]
|
||||
// TLOOP: linalg.yield %[[AB_SUB_SUB]] : [[TY]]
|
||||
// TLOOP: }
|
||||
// TLOOP: %[[SUB_RESULT:.*]] = subtensor_insert %[[AB_SUB]]
|
||||
// TLOOP: %[[SUB_RESULT:.*]] = tensor.insert_slice %[[AB_SUB]]
|
||||
// TLOOP-SAME: into %[[OUT_]][%[[I]], %[[J]]]
|
||||
// TLOOP: linalg.yield %[[SUB_RESULT]] : [[TY]]
|
||||
// TLOOP: }
|
||||
|
|
|
@ -53,10 +53,10 @@ func @matmul_tensors(
|
|||
// CHECK: %[[A:.*]] = scf.for %[[J1:[0-9a-z]+]] =
|
||||
// Iteration count along J1
|
||||
// CHECK: %[[IDXpad0_K:[0-9]+]] = affine.apply #[[$DIV4]](%[[J1]])
|
||||
// CHECK: subtensor %{{.*}} [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: tensor.extract_slice %{{.*}} [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: linalg.pad_tensor %{{.*}}
|
||||
// CHECK: : tensor<?x?xf32> to tensor<2x4xf32>
|
||||
// CHECK: subtensor_insert %{{.*}} into %{{.*}}[%[[IDXpad0_K]], 0, 0]
|
||||
// CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%[[IDXpad0_K]], 0, 0]
|
||||
// CHECK-SAME: [1, 2, 4] [1, 1, 1] : tensor<2x4xf32> into tensor<?x2x4xf32>
|
||||
// Second tensor is KxN but loop order is (M, N, K) so padded tensor is NxKx4x3
|
||||
// CHECK: %[[SZpad1_N:[0-9]+]] = affine.apply #[[$DIVS3]]()[%[[dN]]]
|
||||
|
@ -69,23 +69,23 @@ func @matmul_tensors(
|
|||
// CHECK: scf.for %[[J2:[0-9a-z]+]] =
|
||||
// Iteration count along J2
|
||||
// CHECK: %[[IDXpad1_N:[0-9]+]] = affine.apply #[[$DIV4]](%[[J2]])
|
||||
// CHECK: subtensor %{{.*}} [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: tensor.extract_slice %{{.*}} [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: linalg.pad_tensor %{{.*}}
|
||||
// CHECK: : tensor<?x?xf32> to tensor<4x3xf32>
|
||||
// CHECK: subtensor_insert %{{.*}} into %{{.*}}[%[[IDXpad1_K]], %[[IDXpad1_N]], 0, 0]
|
||||
// CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%[[IDXpad1_K]], %[[IDXpad1_N]], 0, 0]
|
||||
// CHECK-SAME: [1, 1, 4, 3] [1, 1, 1, 1] : tensor<4x3xf32> into tensor<?x?x4x3xf32>
|
||||
// 2-D loop
|
||||
// CHECK: scf.for %[[J:[0-9a-zA-Z]+]]
|
||||
// CHECK: scf.for %[[K:[0-9a-zA-Z]+]]
|
||||
// Iteration count along K
|
||||
// CHECK: %[[IDXpad0_K:[0-9]+]] = affine.apply #[[$DIV4]](%[[K]])
|
||||
// CHECK: %[[stA:.*]] = subtensor %[[A]][%[[IDXpad0_K]], 0, 0] [1, 2, 4] [1, 1, 1] :
|
||||
// CHECK: %[[stA:.*]] = tensor.extract_slice %[[A]][%[[IDXpad0_K]], 0, 0] [1, 2, 4] [1, 1, 1] :
|
||||
// CHECK-SAME: tensor<?x2x4xf32> to tensor<2x4xf32>
|
||||
// Iteration count along J
|
||||
// CHECK: %[[IDXpad1_N:[0-9]+]] = affine.apply #[[$DIV3]](%[[J]])
|
||||
// Iteration count along K
|
||||
// CHECK: %[[IDXpad1_K:[0-9]+]] = affine.apply #[[$DIV4]](%[[K]])
|
||||
// CHECK: %[[stB:.*]] = subtensor %[[B]][%[[IDXpad1_N]], %[[IDXpad1_K]], 0, 0] [1, 1, 4, 3] [1, 1, 1, 1] :
|
||||
// CHECK: %[[stB:.*]] = tensor.extract_slice %[[B]][%[[IDXpad1_N]], %[[IDXpad1_K]], 0, 0] [1, 1, 4, 3] [1, 1, 1, 1] :
|
||||
// CHECK-SAME: tensor<?x?x4x3xf32> to tensor<4x3xf32>
|
||||
// CHECK: %[[stC:.*]] = linalg.pad_tensor %{{.*}}
|
||||
// CHECK: : tensor<?x?xf32> to tensor<2x3xf32>
|
||||
|
@ -98,17 +98,17 @@ func @matmul_tensors(
|
|||
%7 = affine.min #map0(%arg3)[%6]
|
||||
%8 = memref.dim %arg0, %c1 : tensor<?x?xf32>
|
||||
%9 = affine.min #map1(%arg7)[%8]
|
||||
%10 = subtensor %arg0[%arg3, %arg7] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%10 = tensor.extract_slice %arg0[%arg3, %arg7] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%11 = memref.dim %arg1, %c0 : tensor<?x?xf32>
|
||||
%12 = affine.min #map1(%arg7)[%11]
|
||||
%13 = memref.dim %arg1, %c1 : tensor<?x?xf32>
|
||||
%14 = affine.min #map2(%arg5)[%13]
|
||||
%15 = subtensor %arg1[%arg7, %arg5] [%12, %14] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%15 = tensor.extract_slice %arg1[%arg7, %arg5] [%12, %14] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%16 = memref.dim %arg8, %c0 : tensor<?x?xf32>
|
||||
%17 = affine.min #map3(%16, %arg3)
|
||||
%18 = memref.dim %arg8, %c1 : tensor<?x?xf32>
|
||||
%19 = affine.min #map4(%18, %arg5)
|
||||
%20 = subtensor %arg8[%arg3, %arg5] [%17, %19] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%20 = tensor.extract_slice %arg8[%arg3, %arg5] [%17, %19] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%21 = subi %c2, %7 : index
|
||||
%22 = subi %c4, %9 : index
|
||||
%23 = linalg.pad_tensor %10 low[%c0, %c0] high[%21, %22] {
|
||||
|
@ -128,8 +128,8 @@ func @matmul_tensors(
|
|||
linalg.yield %cst : f32
|
||||
} : tensor<?x?xf32> to tensor<2x3xf32>
|
||||
%30 = linalg.matmul ins(%23, %26 : tensor<2x4xf32>, tensor<4x3xf32>) outs(%29 : tensor<2x3xf32>) -> tensor<2x3xf32>
|
||||
%31 = subtensor %30[0, 0] [%7, %14] [1, 1] : tensor<2x3xf32> to tensor<?x?xf32>
|
||||
%32 = subtensor_insert %31 into %arg8[%arg3, %arg5] [%17, %19] [%c1, %c1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
%31 = tensor.extract_slice %30[0, 0] [%7, %14] [1, 1] : tensor<2x3xf32> to tensor<?x?xf32>
|
||||
%32 = tensor.insert_slice %31 into %arg8[%arg3, %arg5] [%17, %19] [%c1, %c1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
scf.yield %32 : tensor<?x?xf32>
|
||||
}
|
||||
scf.yield %5 : tensor<?x?xf32>
|
||||
|
@ -173,7 +173,7 @@ func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
|
|||
// CHECK: %[[INIT_PACKED_A:.*]] = linalg.init_tensor [%[[D0]], %[[D1]], 2] : tensor<?x?x2xf32>
|
||||
// CHECK: %[[PACKED_A:.*]] = scf.for %[[II:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_A]]) -> (tensor<?x?x2xf32>) {
|
||||
// CHECK: scf.for %[[III:[0-9a-z]+]] =
|
||||
// CHECK: subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
|
||||
// CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
|
||||
//
|
||||
// CHECK: %[[D0_2:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
|
||||
// CHECK: %[[MM4_2:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
|
||||
|
@ -182,33 +182,33 @@ func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
|
|||
// CHECK: %[[INIT_PACKED_B:.*]] = linalg.init_tensor [%[[D0_2]], %[[D1_2]], 2] : tensor<?x?x2xf32>
|
||||
// CHECK: %[[PACKED_B:.*]] = scf.for %[[II_2:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_B]]) -> (tensor<?x?x2xf32>) {
|
||||
// CHECK: scf.for %[[III_2:[0-9a-z]+]] =
|
||||
// CHECK: subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
|
||||
// CHECK: tensor.insert_slice %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
|
||||
// Compute.
|
||||
// CHECK: scf.for %[[II_3:[0-9a-z]+]] =
|
||||
// CHECK: scf.for %[[III_3:[0-9a-z]+]] = {{.*}} iter_args(%[[C:.*]] = %{{.*}}) -> (tensor<f32>) {
|
||||
// CHECK: %[[IDX0:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
|
||||
// CHECK: %[[IDX1:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
|
||||
// CHECK: %[[A:.*]] = subtensor %[[PACKED_A]][%[[IDX0]], %[[IDX1]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
|
||||
// CHECK: %[[A:.*]] = tensor.extract_slice %[[PACKED_A]][%[[IDX0]], %[[IDX1]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
|
||||
// CHECK: %[[IDX0_2:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
|
||||
// CHECK: %[[IDX1_2:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
|
||||
// CHECK: %[[B:.*]] = subtensor %[[PACKED_B]][%[[IDX0_2]], %[[IDX1_2]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
|
||||
// CHECK: %[[B:.*]] = tensor.extract_slice %[[PACKED_B]][%[[IDX0_2]], %[[IDX1_2]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
|
||||
// CHECK: linalg.dot ins(%[[A]], %[[B]] : tensor<2xf32>, tensor<2xf32>) outs(%[[C]] : tensor<f32>) -> tensor<f32>
|
||||
|
||||
%4 = scf.for %arg3 = %c0 to %1 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
|
||||
%5 = affine.min #map0(%arg3)[%2]
|
||||
%6 = subtensor %arg0[%arg3] [%5] [1] : tensor<?xf32> to tensor<?xf32>
|
||||
%6 = tensor.extract_slice %arg0[%arg3] [%5] [1] : tensor<?xf32> to tensor<?xf32>
|
||||
%7 = affine.min #map0(%arg3)[%3]
|
||||
%8 = subtensor %arg1[%arg3] [%7] [1] : tensor<?xf32> to tensor<?xf32>
|
||||
%8 = tensor.extract_slice %arg1[%arg3] [%7] [1] : tensor<?xf32> to tensor<?xf32>
|
||||
%9 = scf.for %arg5 = %c0 to %5 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
|
||||
%10 = affine.min #map1(%5, %arg5)
|
||||
%11 = subtensor %6[%arg5] [%10] [1] : tensor<?xf32> to tensor<?xf32>
|
||||
%11 = tensor.extract_slice %6[%arg5] [%10] [1] : tensor<?xf32> to tensor<?xf32>
|
||||
%12 = affine.min #map1(%7, %arg5)
|
||||
%13 = subtensor %8[%arg5] [%12] [1] : tensor<?xf32> to tensor<?xf32>
|
||||
%13 = tensor.extract_slice %8[%arg5] [%12] [1] : tensor<?xf32> to tensor<?xf32>
|
||||
%14 = scf.for %arg7 = %c0 to %10 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
|
||||
%15 = affine.min #map2(%10, %arg7)
|
||||
%16 = subtensor %11[%arg7] [%15] [1] : tensor<?xf32> to tensor<?xf32>
|
||||
%16 = tensor.extract_slice %11[%arg7] [%15] [1] : tensor<?xf32> to tensor<?xf32>
|
||||
%17 = affine.min #map2(%12, %arg7)
|
||||
%18 = subtensor %13[%arg7] [%17] [1] : tensor<?xf32> to tensor<?xf32>
|
||||
%18 = tensor.extract_slice %13[%arg7] [%17] [1] : tensor<?xf32> to tensor<?xf32>
|
||||
%19 = subi %c2, %15 : index
|
||||
%20 = linalg.pad_tensor %16 low[%c0] high[%19] {
|
||||
^bb0(%arg9: index): // no predecessors
|
||||
|
@ -245,17 +245,17 @@ func @matmul_2d_tiling(%arg0: tensor<32x128xf32>, %arg1: tensor<128x64xf32>, %ar
|
|||
%1 = scf.for %arg3 = %c0 to %c32 step %c16 iter_args(%arg4 = %arg2) -> (tensor<32x64xf32>) {
|
||||
%2 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<32x64xf32>) {
|
||||
%3 = scf.for %arg7 = %c0 to %c128 step %c32 iter_args(%arg8 = %arg6) -> (tensor<32x64xf32>) {
|
||||
%4 = subtensor %arg0[%arg3, %arg7] [16, 32] [1, 1] : tensor<32x128xf32> to tensor<16x32xf32>
|
||||
%5 = subtensor %arg1[%arg7, %arg5] [32, 32] [1, 1] : tensor<128x64xf32> to tensor<32x32xf32>
|
||||
%6 = subtensor %arg8[%arg3, %arg5] [16, 32] [1, 1] : tensor<32x64xf32> to tensor<16x32xf32>
|
||||
%4 = tensor.extract_slice %arg0[%arg3, %arg7] [16, 32] [1, 1] : tensor<32x128xf32> to tensor<16x32xf32>
|
||||
%5 = tensor.extract_slice %arg1[%arg7, %arg5] [32, 32] [1, 1] : tensor<128x64xf32> to tensor<32x32xf32>
|
||||
%6 = tensor.extract_slice %arg8[%arg3, %arg5] [16, 32] [1, 1] : tensor<32x64xf32> to tensor<16x32xf32>
|
||||
%7 = scf.for %arg9 = %c0 to %c16 step %c2 iter_args(%arg10 = %6) -> (tensor<16x32xf32>) {
|
||||
%10 = scf.for %arg11 = %c0 to %c32 step %c4 iter_args(%arg12 = %arg10) -> (tensor<16x32xf32>) {
|
||||
%11 = scf.for %arg13 = %c0 to %c32 step %c16 iter_args(%arg14 = %arg12) -> (tensor<16x32xf32>) {
|
||||
%12 = subtensor %4[%arg9, %arg13] [2, 16] [1, 1] : tensor<16x32xf32> to tensor<2x16xf32>
|
||||
%12 = tensor.extract_slice %4[%arg9, %arg13] [2, 16] [1, 1] : tensor<16x32xf32> to tensor<2x16xf32>
|
||||
%13 = tensor.cast %12 : tensor<2x16xf32> to tensor<?x?xf32>
|
||||
%14 = subtensor %5[%arg13, %arg11] [16, 4] [1, 1] : tensor<32x32xf32> to tensor<16x4xf32>
|
||||
%14 = tensor.extract_slice %5[%arg13, %arg11] [16, 4] [1, 1] : tensor<32x32xf32> to tensor<16x4xf32>
|
||||
%15 = tensor.cast %14 : tensor<16x4xf32> to tensor<?x?xf32>
|
||||
%16 = subtensor %arg14[%arg9, %arg11] [2, 4] [1, 1] : tensor<16x32xf32> to tensor<2x4xf32>
|
||||
%16 = tensor.extract_slice %arg14[%arg9, %arg11] [2, 4] [1, 1] : tensor<16x32xf32> to tensor<2x4xf32>
|
||||
%17 = tensor.cast %16 : tensor<2x4xf32> to tensor<?x?xf32>
|
||||
%18 = linalg.pad_tensor %13 low[%c0, %c0] high[%c0, %c0] {
|
||||
^bb0(%arg15: index, %arg16: index): // no predecessors
|
||||
|
@ -271,7 +271,7 @@ func @matmul_2d_tiling(%arg0: tensor<32x128xf32>, %arg1: tensor<128x64xf32>, %ar
|
|||
} : tensor<?x?xf32> to tensor<2x4xf32>
|
||||
%21 = linalg.matmul ins(%18, %19 : tensor<2x16xf32>, tensor<16x4xf32>) outs(%20 : tensor<2x4xf32>) -> tensor<2x4xf32>
|
||||
%22 = tensor.cast %21 : tensor<2x4xf32> to tensor<?x?xf32>
|
||||
%23 = subtensor_insert %22 into %arg14[%arg9, %arg11] [%c2, %c4] [1, 1] : tensor<?x?xf32> into tensor<16x32xf32>
|
||||
%23 = tensor.insert_slice %22 into %arg14[%arg9, %arg11] [%c2, %c4] [1, 1] : tensor<?x?xf32> into tensor<16x32xf32>
|
||||
scf.yield %23 : tensor<16x32xf32>
|
||||
}
|
||||
scf.yield %11 : tensor<16x32xf32>
|
||||
|
@ -279,7 +279,7 @@ func @matmul_2d_tiling(%arg0: tensor<32x128xf32>, %arg1: tensor<128x64xf32>, %ar
|
|||
scf.yield %10 : tensor<16x32xf32>
|
||||
}
|
||||
%8 = tensor.cast %7 : tensor<16x32xf32> to tensor<?x?xf32>
|
||||
%9 = subtensor_insert %8 into %arg8[%arg3, %arg5] [%c16, %c32] [1, 1] : tensor<?x?xf32> into tensor<32x64xf32>
|
||||
%9 = tensor.insert_slice %8 into %arg8[%arg3, %arg5] [%c16, %c32] [1, 1] : tensor<?x?xf32> into tensor<32x64xf32>
|
||||
scf.yield %9 : tensor<32x64xf32>
|
||||
}
|
||||
scf.yield %3 : tensor<32x64xf32>
|
||||
|
|
|
@ -321,14 +321,14 @@ func @hoist_vector_transfer_pairs_disjoint_tensor(
|
|||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @hoist_vector_transfer_pairs_tensor_and_subtensors
|
||||
// CHECK-LABEL: func @hoist_vector_transfer_pairs_tensor_and_slices
|
||||
// CHECK-SAME: %[[TENSOR0:[a-zA-Z0-9]*]]: tensor<?x?xf32>,
|
||||
// CHECK-SAME: %[[TENSOR1:[a-zA-Z0-9]*]]: tensor<?x?xf32>,
|
||||
// CHECK-SAME: %[[TENSOR2:[a-zA-Z0-9]*]]: tensor<?x?xf32>,
|
||||
// CHECK-SAME: %[[TENSOR3:[a-zA-Z0-9]*]]: tensor<?x?xf32>,
|
||||
// CHECK-SAME: %[[TENSOR4:[a-zA-Z0-9]*]]: tensor<?x?xf32>,
|
||||
// CHECK-SAME: %[[TENSOR5:[a-zA-Z0-9]*]]: tensor<?x?xf32>
|
||||
func @hoist_vector_transfer_pairs_tensor_and_subtensors(
|
||||
func @hoist_vector_transfer_pairs_tensor_and_slices(
|
||||
%tensor0: tensor<?x?xf32>, %tensor1: tensor<?x?xf32>, %tensor2: tensor<?x?xf32>,
|
||||
%tensor3: tensor<?x?xf32>, %tensor4: tensor<?x?xf32>, %tensor5: tensor<?x?xf32>,
|
||||
%val: index, %lb : index, %ub : index, %step: index) ->
|
||||
|
@ -349,7 +349,7 @@ func @hoist_vector_transfer_pairs_tensor_and_subtensors(
|
|||
-> (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
|
||||
// Hoisted
|
||||
// CHECK: %[[ST0:.*]] = subtensor %[[TENSOR0_ARG]][%[[I]], %[[I]]]{{.*}}: tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[ST0:.*]] = tensor.extract_slice %[[TENSOR0_ARG]][%[[I]], %[[I]]]{{.*}}: tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[V0:.*]] = vector.transfer_read %[[ST0]]{{.*}} : tensor<?x?xf32>, vector<1xf32>
|
||||
|
||||
// CHECK: %[[R:.*]]:3 = scf.for %[[J:.*]] = {{.*}} iter_args(
|
||||
|
@ -362,19 +362,19 @@ func @hoist_vector_transfer_pairs_tensor_and_subtensors(
|
|||
iter_args(%arg6 = %arg0, %arg7 = %arg1, %arg8 = %arg2)
|
||||
-> (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) {
|
||||
// Hoists.
|
||||
%st0 = subtensor %arg6[%i, %i][%step, %step][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%st0 = tensor.extract_slice %arg6[%i, %i][%step, %step][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%r0 = vector.transfer_read %st0[%c0, %c0], %cst: tensor<?x?xf32>, vector<1xf32>
|
||||
|
||||
// CHECK: %[[ST1:.*]] = subtensor %[[TENSOR1_ARG_L2]][%[[J]],{{.*}}: tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[ST1:.*]] = tensor.extract_slice %[[TENSOR1_ARG_L2]][%[[J]],{{.*}}: tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[V1:.*]] = vector.transfer_read %[[ST1]]{{.*}} : tensor<?x?xf32>, vector<2xf32>
|
||||
// Does not hoist (subtensor depends on %j)
|
||||
%st1 = subtensor %arg7[%j, %c0][%step, %step][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// Does not hoist (slice depends on %j)
|
||||
%st1 = tensor.extract_slice %arg7[%j, %c0][%step, %step][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%r1 = vector.transfer_read %st1[%c0, %c0], %cst: tensor<?x?xf32>, vector<2xf32>
|
||||
|
||||
// CHECK: %[[ST2:.*]] = subtensor %[[TENSOR2_ARG_L2]][%[[I]],{{.*}}: tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[ST2:.*]] = tensor.extract_slice %[[TENSOR2_ARG_L2]][%[[I]],{{.*}}: tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[V2:.*]] = vector.transfer_read %[[ST2]]{{.*}} : tensor<?x?xf32>, vector<3xf32>
|
||||
// Does not hoist, 2 subtensor %arg8.
|
||||
%st2 = subtensor %arg8[%i, %c0][%step, %step][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// Does not hoist, 2 extract_slice / insert_slice for %arg8.
|
||||
%st2 = tensor.extract_slice %arg8[%i, %c0][%step, %step][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%r2 = vector.transfer_read %st2[%c0, %c0], %cst: tensor<?x?xf32>, vector<3xf32>
|
||||
|
||||
// CHECK: %[[U0:.*]] = "some_use"(%[[V0_ARG_L2]]) : (vector<1xf32>) -> vector<1xf32>
|
||||
|
@ -388,25 +388,25 @@ func @hoist_vector_transfer_pairs_tensor_and_subtensors(
|
|||
%w0 = vector.transfer_write %u0, %st0[%c0, %c0] : vector<1xf32>, tensor<?x?xf32>
|
||||
|
||||
// CHECK-DAG: %[[STI1:.*]] = vector.transfer_write %[[U1]], %{{.*}} : vector<2xf32>, tensor<?x?xf32>
|
||||
// Does not hoist (associated subtensor depends on %j).
|
||||
// Does not hoist (associated slice depends on %j).
|
||||
%w1 = vector.transfer_write %u1, %st1[%i, %i] : vector<2xf32>, tensor<?x?xf32>
|
||||
|
||||
// CHECK-DAG: %[[STI2:.*]] = vector.transfer_write %[[U2]], %{{.*}} : vector<3xf32>, tensor<?x?xf32>
|
||||
// Does not hoist, 2 subtensor / subtensor_insert for %arg8.
|
||||
// Does not hoist, 2 extract_slice / insert_slice for %arg8.
|
||||
%w2 = vector.transfer_write %u2, %st2[%c0, %c0] : vector<3xf32>, tensor<?x?xf32>
|
||||
|
||||
// Hoists.
|
||||
%sti0 = subtensor_insert %w0 into %arg6[%i, %i][%step, %step][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
%sti0 = tensor.insert_slice %w0 into %arg6[%i, %i][%step, %step][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
|
||||
// CHECK-DAG: subtensor_insert %[[STI1]] into %[[TENSOR1_ARG_L2]][%[[J]],{{.*}}: tensor<?x?xf32> into tensor<?x?xf32>
|
||||
// CHECK-DAG: tensor.insert_slice %[[STI1]] into %[[TENSOR1_ARG_L2]][%[[J]],{{.*}}: tensor<?x?xf32> into tensor<?x?xf32>
|
||||
// Does not hoist (depends on %j).
|
||||
%sti1 = subtensor_insert %w1 into %arg7[%j, %c0][%step, %step][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
%sti1 = tensor.insert_slice %w1 into %arg7[%j, %c0][%step, %step][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
|
||||
// CHECK-DAG: subtensor_insert %[[STI2]] into %[[TENSOR2_ARG_L2]][%[[I]],{{.*}}: tensor<?x?xf32> into tensor<?x?xf32>
|
||||
// Does not hoist, 2 subtensor / subtensor_insert for %arg8.
|
||||
%sti2 = subtensor_insert %w2 into %arg8[%i, %c0][%step, %step][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
%st22 = subtensor %sti2[%i, %c0][%step, %step][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%sti22 = subtensor_insert %st22 into %arg8[%i, %c0][%step, %step][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
// CHECK-DAG: tensor.insert_slice %[[STI2]] into %[[TENSOR2_ARG_L2]][%[[I]],{{.*}}: tensor<?x?xf32> into tensor<?x?xf32>
|
||||
// Does not hoist, 2 extract_slice / insert_slice for %arg8.
|
||||
%sti2 = tensor.insert_slice %w2 into %arg8[%i, %c0][%step, %step][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
%st22 = tensor.extract_slice %sti2[%i, %c0][%step, %step][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%sti22 = tensor.insert_slice %st22 into %arg8[%i, %c0][%step, %step][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
|
||||
// CHECK: scf.yield {{.*}} : tensor<?x?xf32>, tensor<?x?xf32>, vector<1xf32>
|
||||
// CHECK: }
|
||||
|
@ -416,7 +416,7 @@ func @hoist_vector_transfer_pairs_tensor_and_subtensors(
|
|||
|
||||
// Hoisted
|
||||
// CHECK: %[[STI0:.*]] = vector.transfer_write %[[R]]#2, %[[ST0]]{{.*}} : vector<1xf32>, tensor<?x?xf32>
|
||||
// CHECK: subtensor_insert %[[STI0]] into %[[TENSOR0_ARG]][%[[I]], %[[I]]]{{.*}} : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
// CHECK: tensor.insert_slice %[[STI0]] into %[[TENSOR0_ARG]][%[[I]], %[[I]]]{{.*}} : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
|
||||
// CHECK: scf.yield {{.*}} : tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>
|
||||
scf.yield %1#0, %1#1, %1#2 :
|
||||
|
|
|
@ -732,11 +732,11 @@ func @tiled_loop(%lhs: tensor<24x64xi8>, %rhs: tensor<24x64xi8>,
|
|||
%prod = linalg.tiled_loop (%i) = (%c0) to (%c24) step (%c4)
|
||||
ins(%lhs_ = %lhs: tensor<24x64xi8>, %rhs_ = %rhs: tensor<24x64xi8>)
|
||||
outs(%out_ = %out: tensor<24x64xi8>) {
|
||||
%lhs_sub = subtensor %lhs_[%i, 0] [%c4, %c64] [1, 1]
|
||||
%lhs_sub = tensor.extract_slice %lhs_[%i, 0] [%c4, %c64] [1, 1]
|
||||
: tensor<24x64xi8> to tensor<?x?xi8>
|
||||
%rhs_sub = subtensor %rhs_[%i, 0] [%c4, %c64] [1, 1]
|
||||
%rhs_sub = tensor.extract_slice %rhs_[%i, 0] [%c4, %c64] [1, 1]
|
||||
: tensor<24x64xi8> to tensor<?x?xi8>
|
||||
%out_sub = subtensor %out_[%i, 0] [%c4, %c64] [1, 1]
|
||||
%out_sub = tensor.extract_slice %out_[%i, 0] [%c4, %c64] [1, 1]
|
||||
: tensor<24x64xi8> to tensor<?x?xi8>
|
||||
|
||||
%sum = linalg.generic #trait_4
|
||||
|
@ -747,7 +747,7 @@ func @tiled_loop(%lhs: tensor<24x64xi8>, %rhs: tensor<24x64xi8>,
|
|||
linalg.yield %s : i8
|
||||
} -> tensor<?x?xi8>
|
||||
|
||||
%sum_sub = subtensor_insert %sum into %out_[%i, 0][%c4, %c64][1, 1]
|
||||
%sum_sub = tensor.insert_slice %sum into %out_[%i, 0][%c4, %c64][1, 1]
|
||||
: tensor<?x?xi8> into tensor<24x64xi8>
|
||||
linalg.yield %sum_sub : tensor<24x64xi8>
|
||||
}
|
||||
|
@ -792,13 +792,13 @@ func @tiled_loop_reduction(%input_3d: tensor<16x24x32xf32>,
|
|||
outs(%o_ = %output: tensor<24xf32>)
|
||||
iterators["reduction", "parallel", "reduction"]
|
||||
distribution["block_x", "block_y", "none"] {
|
||||
%sub_3d = subtensor %i3d_[%i, %j, %k][2, 4, 8][1, 1, 1]
|
||||
%sub_3d = tensor.extract_slice %i3d_[%i, %j, %k][2, 4, 8][1, 1, 1]
|
||||
: tensor<16x24x32xf32> to tensor<2x4x8xf32>
|
||||
%sub_2d = subtensor %i2d_[%i, %k][2, 8][1, 1]
|
||||
%sub_2d = tensor.extract_slice %i2d_[%i, %k][2, 8][1, 1]
|
||||
: tensor<16x32xf32> to tensor<2x8xf32>
|
||||
%sub_1d = subtensor %i1d_[%j] [4] [1]
|
||||
%sub_1d = tensor.extract_slice %i1d_[%j] [4] [1]
|
||||
: tensor<24xf32> to tensor<4xf32>
|
||||
%sub_out = subtensor %o_[%j] [4] [1]
|
||||
%sub_out = tensor.extract_slice %o_[%j] [4] [1]
|
||||
: tensor<24xf32> to tensor<4xf32>
|
||||
%acc = linalg.generic #trait_5
|
||||
ins(%sub_3d, %sub_2d, %sub_1d
|
||||
|
@ -810,7 +810,7 @@ func @tiled_loop_reduction(%input_3d: tensor<16x24x32xf32>,
|
|||
linalg.yield %1 : f32
|
||||
} -> tensor<4xf32>
|
||||
|
||||
%sum_sub = subtensor_insert %acc into %o_[%j][%c4][1]
|
||||
%sum_sub = tensor.insert_slice %acc into %o_[%j][%c4][1]
|
||||
: tensor<4xf32> into tensor<24xf32>
|
||||
linalg.yield %sum_sub : tensor<24xf32>
|
||||
}
|
||||
|
|
|
@ -2,7 +2,7 @@
|
|||
|
||||
// CHECK-LABEL: @static_data_only(
|
||||
// CHECK-SAME: %[[ARG0:.*]]: tensor<4x5xf32>
|
||||
// CHECK: %[[RESULT:.*]] = subtensor %[[ARG0]][1, 2] [2, 1] [1, 1] : tensor<4x5xf32> to tensor<2x1xf32>
|
||||
// CHECK: %[[RESULT:.*]] = tensor.extract_slice %[[ARG0]][1, 2] [2, 1] [1, 1] : tensor<4x5xf32> to tensor<2x1xf32>
|
||||
// CHECK: return %[[RESULT]]
|
||||
func @static_data_only(%arg0 : tensor<4x5xf32>, %pad : f32)
|
||||
-> tensor<2x1xf32> {
|
||||
|
@ -10,7 +10,7 @@ func @static_data_only(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
^bb0(%arg1: index, %arg2: index):
|
||||
linalg.yield %pad : f32
|
||||
} : tensor<4x5xf32> to tensor<11x13xf32>
|
||||
%1 = subtensor %0[1, 2] [2, 1] [1, 1] : tensor<11x13xf32> to tensor<2x1xf32>
|
||||
%1 = tensor.extract_slice %0[1, 2] [2, 1] [1, 1] : tensor<11x13xf32> to tensor<2x1xf32>
|
||||
return %1 : tensor<2x1xf32>
|
||||
}
|
||||
|
||||
|
@ -19,7 +19,7 @@ func @static_data_only(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
// CHECK-LABEL: @static_high_pad_only
|
||||
// CHECK-SAME: %[[ARG0:.*]]: tensor<4x5xf32>, %[[PAD:.*]]: f32
|
||||
// CHECK-NOT: linalg.pad_tensor
|
||||
// CHECK-NOT: subtensor
|
||||
// CHECK-NOT: tensor.extract_slice
|
||||
// CHECK: %[[RESULT:.*]] = tensor.generate
|
||||
// CHECK: tensor.yield %[[PAD]]
|
||||
// CHECK: return %[[RESULT]] : tensor<2x4xf32>
|
||||
|
@ -29,7 +29,7 @@ func @static_high_pad_only(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
^bb0(%arg1: index, %arg2: index):
|
||||
linalg.yield %pad : f32
|
||||
} : tensor<4x5xf32> to tensor<11x13xf32>
|
||||
%1 = subtensor %0[4, 5] [2, 4] [1, 1] : tensor<11x13xf32> to tensor<2x4xf32>
|
||||
%1 = tensor.extract_slice %0[4, 5] [2, 4] [1, 1] : tensor<11x13xf32> to tensor<2x4xf32>
|
||||
return %1 : tensor<2x4xf32>
|
||||
}
|
||||
|
||||
|
@ -38,7 +38,7 @@ func @static_high_pad_only(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
// CHECK-LABEL: @static_low_pad_only
|
||||
// CHECK-SAME: %[[ARG0:.*]]: tensor<4x5xf32>, %[[PAD:.*]]: f32
|
||||
// CHECK-NOT: linalg.pad_tensor
|
||||
// CHECK-NOT: subtensor
|
||||
// CHECK-NOT: tensor.extract_slice
|
||||
// CHECK: %[[RESULT:.*]] = tensor.generate
|
||||
// CHECK: tensor.yield %[[PAD]]
|
||||
// CHECK: return %[[RESULT]] : tensor<2x3xf32>
|
||||
|
@ -48,7 +48,7 @@ func @static_low_pad_only(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
^bb0(%arg1: index, %arg2: index):
|
||||
linalg.yield %pad : f32
|
||||
} : tensor<4x5xf32> to tensor<14x20xf32>
|
||||
%1 = subtensor %0[1, 3] [2, 3] [1, 1] : tensor<14x20xf32> to tensor<2x3xf32>
|
||||
%1 = tensor.extract_slice %0[1, 3] [2, 3] [1, 1] : tensor<14x20xf32> to tensor<2x3xf32>
|
||||
return %1 : tensor<2x3xf32>
|
||||
}
|
||||
|
||||
|
@ -57,7 +57,7 @@ func @static_low_pad_only(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
// CHECK-LABEL: @static_low_pad_only_2
|
||||
// CHECK-SAME: %[[ARG0:.*]]: tensor<4x5xf32>, %[[PAD:.*]]: f32
|
||||
// CHECK-NOT: linalg.pad_tensor
|
||||
// CHECK-NOT: subtensor
|
||||
// CHECK-NOT: tensor.extract_slice
|
||||
// CHECK: %[[RESULT:.*]] = tensor.generate
|
||||
// CHECK: tensor.yield %[[PAD]]
|
||||
// CHECK: return %[[RESULT]] : tensor<1x3xf32>
|
||||
|
@ -67,7 +67,7 @@ func @static_low_pad_only_2(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
^bb0(%arg1: index, %arg2: index):
|
||||
linalg.yield %pad : f32
|
||||
} : tensor<4x5xf32> to tensor<14x20xf32>
|
||||
%1 = subtensor %0[1, 3] [1, 3] [1, 1] : tensor<14x20xf32> to tensor<1x3xf32>
|
||||
%1 = tensor.extract_slice %0[1, 3] [1, 3] [1, 1] : tensor<14x20xf32> to tensor<1x3xf32>
|
||||
return %1 : tensor<1x3xf32>
|
||||
}
|
||||
|
||||
|
@ -76,7 +76,7 @@ func @static_low_pad_only_2(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
// CHECK-LABEL: @static_mixed_data_high_pad
|
||||
// CHECK-SAME: %[[ARG0:.*]]: tensor<4x5xf32>, %[[PAD:.*]]: f32
|
||||
// CHECK-NOT: linalg.pad_tensor
|
||||
// CHECK: %[[SUBTENSOR:.*]] = subtensor %[[ARG0]][2, 4] [2, 1] [1, 1] : tensor<4x5xf32> to tensor<2x1xf32>
|
||||
// CHECK: %[[SUBTENSOR:.*]] = tensor.extract_slice %[[ARG0]][2, 4] [2, 1] [1, 1] : tensor<4x5xf32> to tensor<2x1xf32>
|
||||
// CHECK: %[[RESULT:.*]] = linalg.pad_tensor %[[SUBTENSOR]] low[0, 0] high[1, 3]
|
||||
// CHECK: linalg.yield %[[PAD]]
|
||||
// CHECK: return %[[RESULT]] : tensor<3x4xf32>
|
||||
|
@ -86,7 +86,7 @@ func @static_mixed_data_high_pad(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
^bb0(%arg1: index, %arg2: index):
|
||||
linalg.yield %pad : f32
|
||||
} : tensor<4x5xf32> to tensor<11x13xf32>
|
||||
%1 = subtensor %0[2, 4] [3, 4] [1, 1] : tensor<11x13xf32> to tensor<3x4xf32>
|
||||
%1 = tensor.extract_slice %0[2, 4] [3, 4] [1, 1] : tensor<11x13xf32> to tensor<3x4xf32>
|
||||
return %1 : tensor<3x4xf32>
|
||||
}
|
||||
|
||||
|
@ -95,7 +95,7 @@ func @static_mixed_data_high_pad(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
// CHECK-LABEL: @static_mixed_data_low_pad
|
||||
// CHECK-SAME: %[[ARG0:.*]]: tensor<4x5xf32>, %[[PAD:.*]]: f32
|
||||
// CHECK-NOT: linalg.pad_tensor
|
||||
// CHECK: %[[SUBTENSOR:.*]] = subtensor %[[ARG0]][0, 0] [2, 1] [1, 1] : tensor<4x5xf32> to tensor<2x1xf32>
|
||||
// CHECK: %[[SUBTENSOR:.*]] = tensor.extract_slice %[[ARG0]][0, 0] [2, 1] [1, 1] : tensor<4x5xf32> to tensor<2x1xf32>
|
||||
// CHECK: %[[RESULT:.*]] = linalg.pad_tensor %[[SUBTENSOR]] low[1, 3] high[0, 0]
|
||||
// CHECK: linalg.yield %[[PAD]]
|
||||
// CHECK: return %[[RESULT]] : tensor<3x4xf32>
|
||||
|
@ -105,7 +105,7 @@ func @static_mixed_data_low_pad(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
^bb0(%arg1: index, %arg2: index):
|
||||
linalg.yield %pad : f32
|
||||
} : tensor<4x5xf32> to tensor<14x20xf32>
|
||||
%1 = subtensor %0[2, 4] [3, 4] [1, 1] : tensor<14x20xf32> to tensor<3x4xf32>
|
||||
%1 = tensor.extract_slice %0[2, 4] [3, 4] [1, 1] : tensor<14x20xf32> to tensor<3x4xf32>
|
||||
return %1 : tensor<3x4xf32>
|
||||
}
|
||||
|
||||
|
@ -123,7 +123,7 @@ func @static_mixed_data_low_high_pad(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
^bb0(%arg1: index, %arg2: index):
|
||||
linalg.yield %pad : f32
|
||||
} : tensor<4x5xf32> to tensor<13x16xf32>
|
||||
%1 = subtensor %0[1, 2] [7, 9] [1, 1] : tensor<13x16xf32> to tensor<7x9xf32>
|
||||
%1 = tensor.extract_slice %0[1, 2] [7, 9] [1, 1] : tensor<13x16xf32> to tensor<7x9xf32>
|
||||
return %1 : tensor<7x9xf32>
|
||||
}
|
||||
|
||||
|
@ -138,7 +138,7 @@ func @static_mixed_data_low_high_pad(%arg0 : tensor<4x5xf32>, %pad : f32)
|
|||
// CHECK: %[[GEN:.*]] = tensor.generate
|
||||
// CHECK: scf.yield %[[GEN]]
|
||||
// CHECK: } else {
|
||||
// CHECK: %[[SUBTENSOR:.*]] = subtensor %[[ARG0]][%{{.*}}, 4] [%{{.*}}, 1] [1, 1] : tensor<?x5xf32> to tensor<?x1xf32>
|
||||
// CHECK: %[[SUBTENSOR:.*]] = tensor.extract_slice %[[ARG0]][%{{.*}}, 4] [%{{.*}}, 1] [1, 1] : tensor<?x5xf32> to tensor<?x1xf32>
|
||||
// CHECK: %[[PADTENSOR:.*]] = linalg.pad_tensor %[[SUBTENSOR]] low[0, 0] high[%{{.*}}, 3]
|
||||
// CHECK: %[[CAST:.*]] = tensor.cast %[[PADTENSOR]] : tensor<?x4xf32> to tensor<3x4xf32>
|
||||
// CHECK: scf.yield %[[CAST]]
|
||||
|
@ -149,7 +149,7 @@ func @dynamic_high_pad(%arg0 : tensor<?x5xf32>, %h1: index, %pad : f32) -> tenso
|
|||
^bb0(%arg1: index, %arg2: index):
|
||||
linalg.yield %pad : f32
|
||||
} : tensor<?x5xf32> to tensor<?x13xf32>
|
||||
%1 = subtensor %0[2, 4] [3, 4] [1, 1] : tensor<?x13xf32> to tensor<3x4xf32>
|
||||
%1 = tensor.extract_slice %0[2, 4] [3, 4] [1, 1] : tensor<?x13xf32> to tensor<3x4xf32>
|
||||
return %1 : tensor<3x4xf32>
|
||||
}
|
||||
|
||||
|
|
|
@ -199,12 +199,12 @@ func @matmul_tensors(
|
|||
// CHECK: %[[STEPX:.+]] = affine.apply #[[MULMAP]]()[%[[NBLOCKSX]], %[[C8]]]
|
||||
// CHECK: %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK: %[[TD2:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC2:.*]] = %[[TC1]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK: %[[sTA:.*]] = subtensor %[[TA]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTB:.*]] = subtensor %[[TB]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTC:.*]] = subtensor %[[TC2]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTA:.*]] = tensor.extract_slice %[[TA]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTB:.*]] = tensor.extract_slice %[[TB]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTC:.*]] = tensor.extract_slice %[[TC2]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTD:.*]] = linalg.matmul ins(%[[sTA]], %[[sTB]] : tensor<?x?xf32>, tensor<?x?xf32>)
|
||||
// CHECK-SAME: outs(%[[sTC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
|
||||
// CHECK: %[[TD:.*]] = subtensor_insert %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
// CHECK: %[[TD:.*]] = tensor.insert_slice %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
// CHECK: scf.yield %[[TD]] : tensor<?x?xf32>
|
||||
// CHECK: scf.yield %[[TD2]] : tensor<?x?xf32>
|
||||
// CHECK: scf.yield %[[TD1]] : tensor<?x?xf32>
|
||||
|
|
|
@ -16,11 +16,11 @@ func @matmul_tensors(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tens
|
|||
%3 = scf.for %arg3 = %c0 to %0 step %c2 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) {
|
||||
%4 = scf.for %arg5 = %c0 to %2 step %c3 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
|
||||
%5 = scf.for %arg7 = %c0 to %1 step %c4 iter_args(%arg8 = %arg6) -> (tensor<?x?xf32>) {
|
||||
%6 = subtensor %t0[%arg3, %arg7][%c2, 4][1, 1] : tensor<?x?xf32> to tensor<?x4xf32>
|
||||
%7 = subtensor %arg1[%arg7, %arg5][4, %c3][1, 1] : tensor<?x?xf32> to tensor<4x?xf32>
|
||||
%8 = subtensor %arg8[%arg3, %arg5][%c2, %c3][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%6 = tensor.extract_slice %t0[%arg3, %arg7][%c2, 4][1, 1] : tensor<?x?xf32> to tensor<?x4xf32>
|
||||
%7 = tensor.extract_slice %arg1[%arg7, %arg5][4, %c3][1, 1] : tensor<?x?xf32> to tensor<4x?xf32>
|
||||
%8 = tensor.extract_slice %arg8[%arg3, %arg5][%c2, %c3][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
%9 = linalg.matmul ins(%6, %7 : tensor<?x4xf32>, tensor<4x?xf32>) outs(%8 : tensor<?x?xf32>) -> tensor<?x?xf32>
|
||||
%10 = subtensor_insert %9 into %arg8[%arg3, %arg5] [%c2, %c3] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
%10 = tensor.insert_slice %9 into %arg8[%arg3, %arg5] [%c2, %c3] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
scf.yield %10 : tensor<?x?xf32>
|
||||
}
|
||||
scf.yield %5 : tensor<?x?xf32>
|
||||
|
@ -48,22 +48,22 @@ func @matmul_tensors(%arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tens
|
|||
// CHECK-DAG: %[[dC1:.*]] = memref.dim %[[C]], %[[C1]] : tensor<?x?xf32>
|
||||
// CHECK: scf.for %[[I:[0-9a-z]*]]
|
||||
// CHECK: %[[sizeA0:.*]] = affine.min #[[BOUND2_MAP]](%[[I]])[%[[dA0]]]
|
||||
// CHECK: %[[stA:.*]] = subtensor %[[A]][%[[I]], 0] [%[[sizeA0]], %[[dA1]]] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[stA:.*]] = tensor.extract_slice %[[A]][%[[I]], 0] [%[[sizeA0]], %[[dA1]]] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sizeC0:.*]] = affine.min #[[BOUND2_MAP]](%[[I]])[%[[dC0]]]
|
||||
// CHECK-NEXT: scf.for %[[J:[0-9a-z]*]]
|
||||
// CHECK-NEXT: scf.for %[[K:[0-9a-z]*]] {{.*}} iter_args(%[[RES:[0-9a-z]*]]
|
||||
// CHECK-DAG: %[[stB1:.*]] = subtensor %[[B]][%[[K]], %[[J]]] [4, 3] [1, 1] : tensor<?x?xf32> to tensor<4x3xf32>
|
||||
// CHECK-DAG: %[[stF:.*]] = subtensor %[[RES]][%[[I]], %[[J]]] [2, 3] [1, 1] : tensor<?x?xf32> to tensor<2x3xf32>
|
||||
// CHECK-DAG: %[[stB1:.*]] = tensor.extract_slice %[[B]][%[[K]], %[[J]]] [4, 3] [1, 1] : tensor<?x?xf32> to tensor<4x3xf32>
|
||||
// CHECK-DAG: %[[stF:.*]] = tensor.extract_slice %[[RES]][%[[I]], %[[J]]] [2, 3] [1, 1] : tensor<?x?xf32> to tensor<2x3xf32>
|
||||
//
|
||||
// subtensors of the producing matmul.
|
||||
// slices of the producing matmul.
|
||||
// CHECK: %[[sizeB1:.*]] = affine.min #[[BOUND4_MAP]](%[[K]])[%[[dB1]]]
|
||||
// CHECK: %[[stB2:.*]] = subtensor %[[B]][0, %[[K]]] [%[[dB0]], %[[sizeB1]]] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[stB2:.*]] = tensor.extract_slice %[[B]][0, %[[K]]] [%[[dB0]], %[[sizeB1]]] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sizeC1:.*]] = affine.min #[[BOUND4_MAP]](%[[K]])[%[[dC1]]]
|
||||
// CHECK: %[[stC:.*]] = subtensor %[[C]][%[[I]], %[[K]]] [%[[sizeC0]], %[[sizeC1]]] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[stC:.*]] = tensor.extract_slice %[[C]][%[[I]], %[[K]]] [%[[sizeC0]], %[[sizeC1]]] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[stD:.*]] = linalg.matmul ins(%[[stA]], %[[stB2]] : tensor<?x?xf32>, tensor<?x?xf32>) outs(%[[stC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
|
||||
// CHECK: %[[CAST:.*]] = tensor.cast %[[stD]] : tensor<?x?xf32> to tensor<?x4xf32>
|
||||
// CHECK-NEXT: %[[stG:.*]] = linalg.matmul ins(%[[CAST]], %[[stB1]] : tensor<?x4xf32>, tensor<4x3xf32>) outs(%[[stF]] : tensor<2x3xf32>) -> tensor<2x3xf32>
|
||||
// CHECK-NEXT: subtensor_insert %[[stG]] into %[[RES]][%[[I]], %[[J]]]
|
||||
// CHECK-NEXT: tensor.insert_slice %[[stG]] into %[[RES]][%[[I]], %[[J]]]
|
||||
|
||||
// -----
|
||||
|
||||
|
@ -87,9 +87,9 @@ func @conv_tensors_static(%input: tensor<1x225x225x3xf32>, %filter: tensor<3x3x3
|
|||
%for0 = scf.for %iv0 = %c0 to %c112 step %c8 iter_args(%arg0 = %fill) -> tensor<1x112x112x32xf32> {
|
||||
%for1 = scf.for %iv1 = %c0 to %c112 step %c16 iter_args(%arg1 = %arg0) -> tensor<1x112x112x32xf32> {
|
||||
%for2 = scf.for %iv2 = %c0 to %c32 step %c4 iter_args(%arg2 = %arg1) -> tensor<1x112x112x32xf32> {
|
||||
%0 = subtensor %conv[0, %iv0, %iv1, %iv2][1, 8, 16, 4][1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x8x16x4xf32>
|
||||
%1 = subtensor %elementwise[0, %iv0, %iv1, %iv2][1, 8, 16, 4][1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x8x16x4xf32>
|
||||
%2 = subtensor %arg2[0, %iv0, %iv1, %iv2][1, 8, 16, 4][1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x8x16x4xf32>
|
||||
%0 = tensor.extract_slice %conv[0, %iv0, %iv1, %iv2][1, 8, 16, 4][1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x8x16x4xf32>
|
||||
%1 = tensor.extract_slice %elementwise[0, %iv0, %iv1, %iv2][1, 8, 16, 4][1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x8x16x4xf32>
|
||||
%2 = tensor.extract_slice %arg2[0, %iv0, %iv1, %iv2][1, 8, 16, 4][1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x8x16x4xf32>
|
||||
%add = linalg.generic
|
||||
{
|
||||
indexing_maps = [
|
||||
|
@ -104,7 +104,7 @@ func @conv_tensors_static(%input: tensor<1x225x225x3xf32>, %filter: tensor<3x3x3
|
|||
linalg.yield %result : f32
|
||||
} -> tensor<1x8x16x4xf32>
|
||||
|
||||
%insert = subtensor_insert %add into %arg2[0, %iv0, %iv1, %iv2] [1, 8, 16, 4] [1, 1, 1, 1] : tensor<1x8x16x4xf32> into tensor<1x112x112x32xf32>
|
||||
%insert = tensor.insert_slice %add into %arg2[0, %iv0, %iv1, %iv2] [1, 8, 16, 4] [1, 1, 1, 1] : tensor<1x8x16x4xf32> into tensor<1x112x112x32xf32>
|
||||
scf.yield %insert : tensor<1x112x112x32xf32>
|
||||
}
|
||||
scf.yield %for2 : tensor<1x112x112x32xf32>
|
||||
|
@ -127,19 +127,19 @@ func @conv_tensors_static(%input: tensor<1x225x225x3xf32>, %filter: tensor<3x3x3
|
|||
// CHECK-NEXT: %[[OFFSET_H:.+]] = affine.apply #[[MAP0]](%[[IV0]])
|
||||
// CHECK-NEXT: scf.for %[[IV1:.+]] = %{{.+}} to %{{.+}} step %{{.+}} iter_args(%[[ARG1:.+]] = %[[ARG0]])
|
||||
// CHECK-NEXT: %[[OFFSET_W:.+]] = affine.apply #[[MAP0]](%[[IV1]])
|
||||
// CHECK-NEXT: %[[ST_INPUT:.+]] = subtensor %arg0[0, %[[OFFSET_H]], %[[OFFSET_W]], 0] [1, 17, 33, 3] [1, 1, 1, 1] : tensor<1x225x225x3xf32> to tensor<1x17x33x3xf32>
|
||||
// CHECK-NEXT: %[[ST_INPUT:.+]] = tensor.extract_slice %arg0[0, %[[OFFSET_H]], %[[OFFSET_W]], 0] [1, 17, 33, 3] [1, 1, 1, 1] : tensor<1x225x225x3xf32> to tensor<1x17x33x3xf32>
|
||||
// CHECK-NEXT: scf.for %[[IV2:.+]] = %{{.+}} to %{{.+}} step %{{.+}} iter_args(%[[ARG2:.+]] = %[[ARG1]])
|
||||
// CHECK-NEXT: %[[ST_ELEM:.+]] = subtensor %[[ELEM]][0, %[[IV0]], %[[IV1]], %[[IV2]]] [1, 8, 16, 4] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x8x16x4xf32>
|
||||
// CHECK-NEXT: %[[ST_ARG2:.+]] = subtensor %[[ARG2]][0, %[[IV0]], %[[IV1]], %[[IV2]]] [1, 8, 16, 4] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x8x16x4xf32>
|
||||
// CHECK-NEXT: %[[ST_FILTER:.+]] = subtensor %[[FILTER]][0, 0, 0, %[[IV2]]] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
|
||||
// CHECK-NEXT: %[[ST_FILL:.+]] = subtensor %[[FILL]][0, %[[IV0]], %[[IV1]], %[[IV2]]] [1, 8, 16, 4] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x8x16x4xf32>
|
||||
// CHECK-NEXT: %[[ST_ELEM:.+]] = tensor.extract_slice %[[ELEM]][0, %[[IV0]], %[[IV1]], %[[IV2]]] [1, 8, 16, 4] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x8x16x4xf32>
|
||||
// CHECK-NEXT: %[[ST_ARG2:.+]] = tensor.extract_slice %[[ARG2]][0, %[[IV0]], %[[IV1]], %[[IV2]]] [1, 8, 16, 4] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x8x16x4xf32>
|
||||
// CHECK-NEXT: %[[ST_FILTER:.+]] = tensor.extract_slice %[[FILTER]][0, 0, 0, %[[IV2]]] [3, 3, 3, 4] [1, 1, 1, 1] : tensor<3x3x3x32xf32> to tensor<3x3x3x4xf32>
|
||||
// CHECK-NEXT: %[[ST_FILL:.+]] = tensor.extract_slice %[[FILL]][0, %[[IV0]], %[[IV1]], %[[IV2]]] [1, 8, 16, 4] [1, 1, 1, 1] : tensor<1x112x112x32xf32> to tensor<1x8x16x4xf32>
|
||||
// CHECK-NEXT: %[[ST_CONV:.+]] = linalg.conv_2d_input_nhwc_filter_hwcf
|
||||
// CHECK-SAME: ins(%[[ST_INPUT]], %[[ST_FILTER]] : tensor<1x17x33x3xf32>, tensor<3x3x3x4xf32>)
|
||||
// CHECK-SAME: outs(%[[ST_FILL]] : tensor<1x8x16x4xf32>)
|
||||
// CHECK-NEXT: %[[ADD:.+]] = linalg.generic
|
||||
// CHECK-SAME: ins(%[[ST_CONV]], %[[ST_ELEM]] : tensor<1x8x16x4xf32>, tensor<1x8x16x4xf32>)
|
||||
// CHECK-SAME: outs(%[[ST_ARG2]] : tensor<1x8x16x4xf32>)
|
||||
// CHECK: subtensor_insert %[[ADD]] into %[[ARG2]][0, %[[IV0]], %[[IV1]], %[[IV2]]] [1, 8, 16, 4]
|
||||
// CHECK: tensor.insert_slice %[[ADD]] into %[[ARG2]][0, %[[IV0]], %[[IV1]], %[[IV2]]] [1, 8, 16, 4]
|
||||
|
||||
// -----
|
||||
|
||||
|
@ -174,9 +174,9 @@ func @conv_tensors_dynamic(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?x?x
|
|||
%oh_size = affine.min affine_map<(d0)[s0] -> (16, -d0 + s0)>(%iv1)[%oh]
|
||||
%ow_size = affine.min affine_map<(d0)[s0] -> (4, -d0 + s0)>(%iv2)[%ow]
|
||||
%oc_size = affine.min affine_map<(d0)[s0] -> (2, -d0 + s0)>(%iv2)[%oc]
|
||||
%0 = subtensor %conv[%iv0, %iv1, %iv2, %iv3][%n_size, %oh_size, %ow_size, %oc_size][1, 1, 1, 1] : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32>
|
||||
%1 = subtensor %elementwise[%iv0, %iv1, %iv2, %iv3][%n_size, %oh_size, %ow_size, %oc_size][1, 1, 1, 1] : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32>
|
||||
%2 = subtensor %arg3[%iv0, %iv1, %iv2, %iv3][%n_size, %oh_size, %ow_size, %oc_size][1, 1, 1, 1] : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32>
|
||||
%0 = tensor.extract_slice %conv[%iv0, %iv1, %iv2, %iv3][%n_size, %oh_size, %ow_size, %oc_size][1, 1, 1, 1] : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32>
|
||||
%1 = tensor.extract_slice %elementwise[%iv0, %iv1, %iv2, %iv3][%n_size, %oh_size, %ow_size, %oc_size][1, 1, 1, 1] : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32>
|
||||
%2 = tensor.extract_slice %arg3[%iv0, %iv1, %iv2, %iv3][%n_size, %oh_size, %ow_size, %oc_size][1, 1, 1, 1] : tensor<?x?x?x?xf32> to tensor<?x?x?x?xf32>
|
||||
%add = linalg.generic
|
||||
{
|
||||
indexing_maps = [
|
||||
|
@ -191,7 +191,7 @@ func @conv_tensors_dynamic(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?x?x
|
|||
linalg.yield %result : f32
|
||||
} -> tensor<?x?x?x?xf32>
|
||||
|
||||
%insert = subtensor_insert %add into %arg3[%iv0, %iv1, %iv2, %iv3] [%n_size, %oh_size, %ow_size, %oc_size] [1, 1, 1, 1] : tensor<?x?x?x?xf32> into tensor<?x?x?x?xf32>
|
||||
%insert = tensor.insert_slice %add into %arg3[%iv0, %iv1, %iv2, %iv3] [%n_size, %oh_size, %ow_size, %oc_size] [1, 1, 1, 1] : tensor<?x?x?x?xf32> into tensor<?x?x?x?xf32>
|
||||
scf.yield %insert : tensor<?x?x?x?xf32>
|
||||
}
|
||||
scf.yield %for3 : tensor<?x?x?x?xf32>
|
||||
|
@ -257,19 +257,19 @@ func @conv_tensors_dynamic(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?x?x
|
|||
// CHECK-NEXT: %[[SIZE_ELEM_OC:.+]] = affine.min #[[BOUND2_MAP]](%[[IV2]])[%[[ELEM_OC]]]
|
||||
// CHECK-NEXT: %[[OFFSET_OW:.+]] = affine.apply #[[X2_MAP]](%[[IV2]])
|
||||
// CHECK-NEXT: %[[SIZE_INPUT_W:.+]] = affine.min #[[INPUT_BOUND]](%[[SIZE_ELEM_OW]], %[[IV2]])[%[[FILTER_W]], %[[INPUT_W]]]
|
||||
// CHECK-NEXT: %[[ST_INPUT:.+]] = subtensor %[[INPUT]][%[[IV0]], %[[OFFSET_OH]], %[[OFFSET_OW]], 0]
|
||||
// CHECK-NEXT: %[[ST_INPUT:.+]] = tensor.extract_slice %[[INPUT]][%[[IV0]], %[[OFFSET_OH]], %[[OFFSET_OW]], 0]
|
||||
// CHECK-SAME: [%[[SIZE_INPUT_N]], %[[SIZE_INPUT_H]], %[[SIZE_INPUT_W]], %[[INPUT_C]]]
|
||||
// CHECK-NEXT: %[[SIZE_ELEM_OW_2:.+]] = affine.min #[[BOUND4_MAP_2]](%[[IV2]])[%[[FILL_W]], %[[ELEM_OW]]]
|
||||
// CHECK-NEXT: scf.for %[[IV3:.+]] = %{{.+}} to %[[ELEM_OC]] step %{{.+}} iter_args(%[[ARG:[a-z0-9]+]]
|
||||
// CHECK-NEXT: %[[ST_ELEM:.+]] = subtensor %[[ELEM]][%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
|
||||
// CHECK-NEXT: %[[ST_ELEM:.+]] = tensor.extract_slice %[[ELEM]][%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
|
||||
// CHECK-SAME: [%[[SIZE_ELEM_N]], %[[SIZE_ELEM_OH]], %[[SIZE_ELEM_OW]], %[[SIZE_ELEM_OC]]]
|
||||
// CHECK-NEXT: %[[ST_ARG:.+]] = subtensor %[[ARG]][%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
|
||||
// CHECK-NEXT: %[[ST_ARG:.+]] = tensor.extract_slice %[[ARG]][%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
|
||||
// CHECK-SAME: [%[[SIZE_ELEM_N]], %[[SIZE_ELEM_OH]], %[[SIZE_ELEM_OW]], %[[SIZE_ELEM_OC]]]
|
||||
// CHECK-NEXT: %[[SIZE_ELEM_OC_2:.+]] = affine.min #[[BOUND2_MAP_2]](%[[IV3]], %[[IV2]])[%[[FILTER_OC]], %[[ELEM_OC]]]
|
||||
// CHECK-NEXT: %[[ST_FILTER:.+]] = subtensor %[[FILTER]][0, 0, 0, %[[IV3]]]
|
||||
// CHECK-NEXT: %[[ST_FILTER:.+]] = tensor.extract_slice %[[FILTER]][0, 0, 0, %[[IV3]]]
|
||||
// CHECK-SAME: [%[[FILTER_H]], %[[FILTER_W]], %[[FILTER_IC]], %[[SIZE_ELEM_OC_2]]]
|
||||
// CHECK-NEXT: %[[SIZE_ELEM_OC_3:.+]] = affine.min #[[BOUND2_MAP_2]](%[[IV3]], %[[IV2]])[%[[FILL_C]], %[[ELEM_OC]]]
|
||||
// CHECK-NEXT: %[[ST_FILL:.+]] = subtensor %[[FILL]][%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
|
||||
// CHECK-NEXT: %[[ST_FILL:.+]] = tensor.extract_slice %[[FILL]][%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
|
||||
// CHECK-SAME: [%[[SIZE_ELEM_N_2]], %[[SIZE_ELEM_OH_2]], %[[SIZE_ELEM_OW_2]], %[[SIZE_ELEM_OC_3]]]
|
||||
// CHECK-NEXT: %[[ST_CONV:.+]] = linalg.conv_2d_input_nhwc_filter_hwcf
|
||||
// CHECK-SAME: ins(%[[ST_INPUT]], %[[ST_FILTER]] : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
|
||||
|
@ -277,5 +277,5 @@ func @conv_tensors_dynamic(%input: tensor<?x?x?x?xf32>, %filter: tensor<?x?x?x?x
|
|||
// CHECK-NEXT: %[[ST_ADD:.+]] = linalg.generic
|
||||
// CHECK-SAME: ins(%[[ST_CONV]], %[[ST_ELEM]] : tensor<?x?x?x?xf32>, tensor<?x?x?x?xf32>)
|
||||
// CHECK-SAME: outs(%[[ST_ARG]] : tensor<?x?x?x?xf32>)
|
||||
// CHECK: subtensor_insert %[[ST_ADD]] into %[[ARG]][%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
|
||||
// CHECK: tensor.insert_slice %[[ST_ADD]] into %[[ARG]][%[[IV0]], %[[IV1]], %[[IV2]], %[[IV3]]]
|
||||
// CHECK-SAME: [%[[SIZE_ELEM_N]], %[[SIZE_ELEM_OH]], %[[SIZE_ELEM_OW]], %[[SIZE_ELEM_OC]]]
|
||||
|
|
|
@ -12,9 +12,9 @@ func @matmul_tensors(
|
|||
// CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[TC]]) -> (tensor<?x?xi32>) {
|
||||
// CHECK: %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -> (tensor<?x?xi32>) {
|
||||
// CHECK: %[[TD2:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC2:.*]] = %[[TC1]]) -> (tensor<?x?xi32>) {
|
||||
// CHECK: %[[sTA:.*]] = subtensor %[[TA]][{{.*}}] : tensor<?x?xi8> to tensor<?x?xi8>
|
||||
// CHECK: %[[sTB:.*]] = subtensor %[[TB]][{{.*}}] : tensor<?x?xi8> to tensor<?x?xi8>
|
||||
// CHECK: %[[sTC:.*]] = subtensor %[[TC2]][{{.*}}] : tensor<?x?xi32> to tensor<?x?xi32>
|
||||
// CHECK: %[[sTA:.*]] = tensor.extract_slice %[[TA]][{{.*}}] : tensor<?x?xi8> to tensor<?x?xi8>
|
||||
// CHECK: %[[sTB:.*]] = tensor.extract_slice %[[TB]][{{.*}}] : tensor<?x?xi8> to tensor<?x?xi8>
|
||||
// CHECK: %[[sTC:.*]] = tensor.extract_slice %[[TC2]][{{.*}}] : tensor<?x?xi32> to tensor<?x?xi32>
|
||||
|
||||
// Dynamic op has been canonicalized away.
|
||||
// CHECK-NOT: linalg.matmul {{.*}} tensor<?x?xi8>
|
||||
|
@ -28,8 +28,8 @@ func @matmul_tensors(
|
|||
// CHECK: : tensor<?x?xi32> to tensor<2x3xi32>
|
||||
// CHECK: %[[pD:.*]] = linalg.matmul_i8_i8_i32 ins(%[[pA]], %[[pB]] : tensor<2x4xi8>, tensor<4x3xi8>)
|
||||
// CHECK-SAME: outs(%[[pC]] : tensor<2x3xi32>) -> tensor<2x3xi32>
|
||||
// CHECK: %[[sTD:.*]] = subtensor %[[pD]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : tensor<2x3xi32> to tensor<?x?xi32>
|
||||
// CHECK: %[[TD:.*]] = subtensor_insert %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?xi32> into tensor<?x?xi32>
|
||||
// CHECK: %[[sTD:.*]] = tensor.extract_slice %[[pD]][0, 0] [%{{.*}}, %{{.*}}] [1, 1] : tensor<2x3xi32> to tensor<?x?xi32>
|
||||
// CHECK: %[[TD:.*]] = tensor.insert_slice %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?xi32> into tensor<?x?xi32>
|
||||
// CHECK: scf.yield %[[TD]] : tensor<?x?xi32>
|
||||
// CHECK: scf.yield %[[TD2]] : tensor<?x?xi32>
|
||||
// CHECK: scf.yield %[[TD1]] : tensor<?x?xi32>
|
||||
|
@ -52,15 +52,15 @@ func @generic_scalar_and_tensor(
|
|||
// CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[TC]]) -> (tensor<?x?x?xf32>) {
|
||||
// CHECK: %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -> (tensor<?x?x?xf32>) {
|
||||
// CHECK: %[[TD2:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC2:.*]] = %[[TC1]]) -> (tensor<?x?x?xf32>) {
|
||||
// CHECK: %[[sTC:.*]] = subtensor %[[TC2]][{{.*}}] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
// CHECK: %[[sTC:.*]] = tensor.extract_slice %[[TC2]][{{.*}}] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
|
||||
// Padding injects static information.
|
||||
// CHECK: %[[pC:.*]] = linalg.pad_tensor %[[sTC]] low[%[[C0]], %[[C0]], %[[C0]]] high[%{{.*}}, %{{.*}}, %{{.*}}]
|
||||
// CHECK: : tensor<?x?x?xf32> to tensor<2x3x4xf32>
|
||||
// CHECK: %[[pD:.*]] = linalg.generic
|
||||
// CHECK-SAME: ins(%[[VAL]] : f32) outs(%[[pC]] : tensor<2x3x4xf32>)
|
||||
// CHECK: %[[sTD:.*]] = subtensor %[[pD]][0, 0, 0] [%{{.*}}, %{{.*}}, %{{.*}}] [1, 1, 1] : tensor<2x3x4xf32> to tensor<?x?x?xf32>
|
||||
// CHECK: %[[TD:.*]] = subtensor_insert %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?x?xf32> into tensor<?x?x?xf32>
|
||||
// CHECK: %[[sTD:.*]] = tensor.extract_slice %[[pD]][0, 0, 0] [%{{.*}}, %{{.*}}, %{{.*}}] [1, 1, 1] : tensor<2x3x4xf32> to tensor<?x?x?xf32>
|
||||
// CHECK: %[[TD:.*]] = tensor.insert_slice %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?x?xf32> into tensor<?x?x?xf32>
|
||||
// CHECK: scf.yield %[[TD]] : tensor<?x?x?xf32>
|
||||
// CHECK: scf.yield %[[TD2]] : tensor<?x?x?xf32>
|
||||
// CHECK: scf.yield %[[TD1]] : tensor<?x?x?xf32>
|
||||
|
@ -104,11 +104,11 @@ func @matmul_partially_padded_tensors(
|
|||
// CHECK-1DIM-TILE: %[[C0:.*]] = constant 0 : index
|
||||
// CHECK-1DIM-TILE: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[TC]]) -> (tensor<?x?xi32>) {
|
||||
// CHECK-1DIM-TILE: %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -> (tensor<?x?xi32>) {
|
||||
// CHECK-1DIM-TILE: %[[sTA:.*]] = subtensor %[[TA]][{{.*}}] : tensor<?x8xi8> to tensor<?x8xi8>
|
||||
// CHECK-1DIM-TILE: %[[sTA:.*]] = tensor.extract_slice %[[TA]][{{.*}}] : tensor<?x8xi8> to tensor<?x8xi8>
|
||||
// CHECK-1DIM-TILE: %[[sTAc:.*]] = tensor.cast %[[sTA]] : tensor<?x8xi8> to tensor<?x?xi8>
|
||||
// CHECK-1DIM-TILE: %[[sTB:.*]] = subtensor %[[TB]][{{.*}}] : tensor<8x?xi8> to tensor<8x?xi8>
|
||||
// CHECK-1DIM-TILE: %[[sTB:.*]] = tensor.extract_slice %[[TB]][{{.*}}] : tensor<8x?xi8> to tensor<8x?xi8>
|
||||
// CHECK-1DIM-TILE: %[[sTBc:.*]] = tensor.cast %[[sTB]] : tensor<8x?xi8> to tensor<?x?xi8>
|
||||
// CHECK-1DIM-TILE: %[[sTC:.*]] = subtensor %[[TC1]][{{.*}}] : tensor<?x?xi32> to tensor<?x?xi32>
|
||||
// CHECK-1DIM-TILE: %[[sTC:.*]] = tensor.extract_slice %[[TC1]][{{.*}}] : tensor<?x?xi32> to tensor<?x?xi32>
|
||||
// CHECK-1DIM-TILE: %[[pA:.*]] = linalg.pad_tensor %[[sTAc]] low[%[[C0]], %[[C0]]] high[%{{.*}}, %{{.*}}]
|
||||
// CHECK-1DIM-TILE: : tensor<?x?xi8> to tensor<2x8xi8>
|
||||
// CHECK-1DIM-TILE: %[[pB:.*]] = linalg.pad_tensor %[[sTBc]] low[%[[C0]], %[[C0]]] high[%{{.*}}, %{{.*}}]
|
||||
|
|
|
@ -11,12 +11,12 @@ func @matmul_tensors(
|
|||
// CHECK: %[[TD0:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[TC]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK: %[[TD1:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC1:.*]] = %[[TC0]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK: %[[TD2:.*]] = scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC2:.*]] = %[[TC1]]) -> (tensor<?x?xf32>) {
|
||||
// CHECK: %[[sTA:.*]] = subtensor %[[TA]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTB:.*]] = subtensor %[[TB]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTC:.*]] = subtensor %[[TC2]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTA:.*]] = tensor.extract_slice %[[TA]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTB:.*]] = tensor.extract_slice %[[TB]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTC:.*]] = tensor.extract_slice %[[TC2]][{{.*}}] : tensor<?x?xf32> to tensor<?x?xf32>
|
||||
// CHECK: %[[sTD:.*]] = linalg.matmul ins(%[[sTA]], %[[sTB]] : tensor<?x?xf32>, tensor<?x?xf32>)
|
||||
// CHECK-SAME: outs(%[[sTC]] : tensor<?x?xf32>) -> tensor<?x?xf32>
|
||||
// CHECK: %[[TD:.*]] = subtensor_insert %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
// CHECK: %[[TD:.*]] = tensor.insert_slice %[[sTD]] into %[[TC2]][{{.*}}] : tensor<?x?xf32> into tensor<?x?xf32>
|
||||
// CHECK: scf.yield %[[TD]] : tensor<?x?xf32>
|
||||
// CHECK: scf.yield %[[TD2]] : tensor<?x?xf32>
|
||||
// CHECK: scf.yield %[[TD1]] : tensor<?x?xf32>
|
||||
|
@ -51,14 +51,14 @@ func @matmul_tensors(
|
|||
// TLOOP-SAME: iterators["parallel", "parallel", "reduction"]
|
||||
// TLOOP-SAME: distribution["block_x", "block_y", "none"] {
|
||||
|
||||
// TLOOP: %[[SUB_ARG_0:.*]] = subtensor %[[A0]][%[[I]], %[[K]]]
|
||||
// TLOOP: %[[SUB_ARG_1:.*]] = subtensor %[[A1]][%[[K]], %[[J]]]
|
||||
// TLOOP: %[[SUB_ARG_2:.*]] = subtensor %[[A2]][%[[I]], %[[J]]]
|
||||
// TLOOP: %[[SUB_ARG_0:.*]] = tensor.extract_slice %[[A0]][%[[I]], %[[K]]]
|
||||
// TLOOP: %[[SUB_ARG_1:.*]] = tensor.extract_slice %[[A1]][%[[K]], %[[J]]]
|
||||
// TLOOP: %[[SUB_ARG_2:.*]] = tensor.extract_slice %[[A2]][%[[I]], %[[J]]]
|
||||
|
||||
// TLOOP: %[[PROD:.*]] = linalg.matmul ins(%[[SUB_ARG_0]], %[[SUB_ARG_1]]
|
||||
// TLOOP-SAME: outs(%[[SUB_ARG_2]] : [[TY]]) -> [[TY]]
|
||||
|
||||
// TLOOP: %[[O:.*]] = subtensor_insert %[[PROD]] into %[[A2]][%[[I]], %[[J]]]
|
||||
// TLOOP: %[[O:.*]] = tensor.insert_slice %[[PROD]] into %[[A2]][%[[I]], %[[J]]]
|
||||
// TLOOP: linalg.yield %[[O]] : [[TY]]
|
||||
|
||||
// -----
|
||||
|
@ -93,13 +93,13 @@ func @generic_op_tensors(
|
|||
// CHECK: %[[TD0:.+]] = scf.for %{{.+}} to %{{.+}} step %{{.+}} iter_args(%[[TC0:.+]] = %[[INIT]]) -> (tensor<?x?x?xf32>) {
|
||||
// CHECK: %[[TD1:.+]] = scf.for %{{.+}} to %{{.+}} step %{{.+}} iter_args(%[[TC1:.+]] = %[[TC0]]) -> (tensor<?x?x?xf32>) {
|
||||
// CHECK: %[[TD2:.+]] = scf.for %{{.+}} to %{{.+}} step %{{.+}} iter_args(%[[TC2:.+]] = %[[TC1]]) -> (tensor<?x?x?xf32>) {
|
||||
// CHECK: %[[STARG0:.+]] = subtensor %[[ARG0]][{{.+}}] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
// CHECK: %[[STARG1:.+]] = subtensor %[[ARG1]][{{.+}}] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
// CHECK: %[[STARG2:.+]] = subtensor %[[TC2]][{{.+}}] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
// CHECK: %[[STARG0:.+]] = tensor.extract_slice %[[ARG0]][{{.+}}] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
// CHECK: %[[STARG1:.+]] = tensor.extract_slice %[[ARG1]][{{.+}}] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
// CHECK: %[[STARG2:.+]] = tensor.extract_slice %[[TC2]][{{.+}}] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
// CHECK: %[[STRETURN:.+]] = linalg.generic
|
||||
// CHECK-SAME: ins(%[[STARG0]], %[[STARG1]] : tensor<?x?x?xf32>, tensor<?x?x?xf32>)
|
||||
// CHECK-SAME: outs(%[[STARG2]] : tensor<?x?x?xf32>)
|
||||
// CHECK: %[[TD:.+]] = subtensor_insert %[[STRETURN]] into %[[TC2]]
|
||||
// CHECK: %[[TD:.+]] = tensor.insert_slice %[[STRETURN]] into %[[TC2]]
|
||||
// CHECK: scf.yield %[[TD]]
|
||||
// CHECK: }
|
||||
// CHECK: scf.yield %[[TD2]]
|
||||
|
|
|
@ -586,7 +586,7 @@ func @pad_static_source(%arg0: tensor<2x5x2xf32>, %pad_value: f32) -> tensor<2x6
|
|||
// CHECK: %[[INIT:.*]] = linalg.init_tensor [6, %[[V1]], %[[V2]], %[[V5]]] : tensor<6x?x?x?xf32>
|
||||
// CHECK: %[[FILL:.*]] = linalg.fill(%[[INIT]], %{{.*}}) : tensor<6x?x?x?xf32>, f32 -> tensor<6x?x?x?xf32>
|
||||
// CHECK: %[[SRCDIM:.*]] = memref.dim %[[SRC]], %[[C3]] : tensor<1x2x2x?xf32>
|
||||
// CHECK: %[[RESULT:.*]] = subtensor_insert %[[SRC]] into %[[FILL]][2, %[[LOW]], 3, 3] [1, 2, 2, %[[SRCDIM]]] [1, 1, 1, 1] : tensor<1x2x2x?xf32> into tensor<6x?x?x?xf32>
|
||||
// CHECK: %[[RESULT:.*]] = tensor.insert_slice %[[SRC]] into %[[FILL]][2, %[[LOW]], 3, 3] [1, 2, 2, %[[SRCDIM]]] [1, 1, 1, 1] : tensor<1x2x2x?xf32> into tensor<6x?x?x?xf32>
|
||||
// CHECK: return %[[RESULT]]
|
||||
func @pad_static_dynamic(%arg0: tensor<1x2x2x?xf32>, %low: index, %high: index,
|
||||
%pad_value: f32) -> tensor<6x?x?x?xf32> {
|
||||
|
@ -638,7 +638,7 @@ func @pad_and_transfer_write_static(
|
|||
} : tensor<5x6xf32> to tensor<10x13xf32>
|
||||
%1 = vector.transfer_write %arg1, %0[%c0, %c0]
|
||||
: vector<7x9xf32>, tensor<10x13xf32>
|
||||
%2 = subtensor %1[0, 0] [5, 6] [1, 1] : tensor<10x13xf32> to tensor<5x6xf32>
|
||||
%2 = tensor.extract_slice %1[0, 0] [5, 6] [1, 1] : tensor<10x13xf32> to tensor<5x6xf32>
|
||||
return %2 : tensor<5x6xf32>
|
||||
}
|
||||
|
||||
|
@ -648,14 +648,14 @@ func @pad_and_transfer_write_static(
|
|||
// CHECK-SAME: %[[ARG0:.*]]: tensor<?x?xf32>, %[[ARG1:.*]]: vector<7x9xf32>, %[[SIZE:.*]]: index, %[[PADDING:.*]]: index
|
||||
// CHECK-NOT: linalg.pad_tensor
|
||||
// CHECK: %[[C0:.*]] = constant 0 : index
|
||||
// CHECK: %[[SUB:.*]] = subtensor %[[ARG0]][0, 0] [%[[SIZE]], 6] [1, 1] : tensor<?x?xf32> to tensor<?x6xf32>
|
||||
// CHECK: %[[SUB:.*]] = tensor.extract_slice %[[ARG0]][0, 0] [%[[SIZE]], 6] [1, 1] : tensor<?x?xf32> to tensor<?x6xf32>
|
||||
// CHECK: %[[RESULT:.*]] = vector.transfer_write %[[ARG1]], %[[SUB]][%[[C0]], %[[C0]]] : vector<7x9xf32>, tensor<?x6xf32>
|
||||
// CHECK: return %[[RESULT]]
|
||||
func @pad_and_transfer_write_dynamic_static(
|
||||
%arg0: tensor<?x?xf32>, %arg1: vector<7x9xf32>, %size: index, %padding: index) -> tensor<?x6xf32> {
|
||||
%c0 = constant 0 : index
|
||||
%c5 = constant 5.0 : f32
|
||||
%s = subtensor %arg0[0, 0] [%size, 6] [1, 1]
|
||||
%s = tensor.extract_slice %arg0[0, 0] [%size, 6] [1, 1]
|
||||
: tensor<?x?xf32> to tensor<?x6xf32>
|
||||
%0 = linalg.pad_tensor %s low[0, 0] high[%padding, 7] {
|
||||
^bb0(%arg2: index, %arg3: index):
|
||||
|
@ -663,13 +663,13 @@ func @pad_and_transfer_write_dynamic_static(
|
|||
} : tensor<?x6xf32> to tensor<?x13xf32>
|
||||
%1 = vector.transfer_write %arg1, %0[%c0, %c0]
|
||||
: vector<7x9xf32>, tensor<?x13xf32>
|
||||
%2 = subtensor %1[0, 0] [%size, 6] [1, 1] : tensor<?x13xf32> to tensor<?x6xf32>
|
||||
%2 = tensor.extract_slice %1[0, 0] [%size, 6] [1, 1] : tensor<?x13xf32> to tensor<?x6xf32>
|
||||
return %2 : tensor<?x6xf32>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @pad_and_subtensor_insert
|
||||
// CHECK-LABEL: func @pad_and_insert_slice
|
||||
// CHECK-SAME: %[[ARG0:.*]]: tensor<5x6xf32>, %[[ARG1:.*]]: tensor<12x13xf32>
|
||||
// CHECK-NOT: linalg.pad_tensor
|
||||
// CHECK-DAG: %[[C0:.*]] = constant 0 : index
|
||||
|
@ -677,7 +677,7 @@ func @pad_and_transfer_write_dynamic_static(
|
|||
// CHECK: %[[READ:.*]] = vector.transfer_read %[[ARG0]][%[[C0]], %[[C0]]], %[[C5]] : tensor<5x6xf32>, vector<7x9xf32>
|
||||
// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[READ]], %[[ARG1]][%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<7x9xf32>, tensor<12x13xf32>
|
||||
// CHECK: return %[[WRITE]]
|
||||
func @pad_and_subtensor_insert(
|
||||
func @pad_and_insert_slice(
|
||||
%arg0: tensor<5x6xf32>, %arg1: tensor<12x13xf32>) -> tensor<12x13xf32> {
|
||||
%c0 = constant 0 : index
|
||||
%c5 = constant 5.0 : f32
|
||||
|
@ -685,7 +685,7 @@ func @pad_and_subtensor_insert(
|
|||
^bb0(%arg2: index, %arg3: index):
|
||||
linalg.yield %c5 : f32
|
||||
} : tensor<5x6xf32> to tensor<7x9xf32>
|
||||
%r = subtensor_insert %0 into %arg1[0, 0][7, 9][1, 1] : tensor<7x9xf32> into tensor<12x13xf32>
|
||||
%r = tensor.insert_slice %0 into %arg1[0, 0][7, 9][1, 1] : tensor<7x9xf32> into tensor<12x13xf32>
|
||||
return %r : tensor<12x13xf32>
|
||||
}
|
||||
|
||||
|
|
|
@ -367,27 +367,3 @@ func @tensor_cast_to_memref(%arg0 : tensor<4x6x16x32xi8>) ->
|
|||
%1 = memref.buffer_cast %0 : memref<?x?x16x32xi8>
|
||||
return %1 : memref<?x?x16x32xi8>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// TODO: Move this test to Tensor/canonicalize.mlir.
|
||||
func @subtensor_insert_propagate_dest_cast(%arg0 : tensor<2x?xi32>, %arg1 : tensor<i32>,
|
||||
%arg2 : index, %arg3 : index) -> tensor<?x?xi32> {
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c2 = constant 2 : index
|
||||
%c8 = constant 8 : index
|
||||
%0 = memref.dim %arg0, %c1 : tensor<2x?xi32>
|
||||
%1 = tensor.extract %arg1[] : tensor<i32>
|
||||
%2 = tensor.generate %arg2, %c8 {
|
||||
^bb0(%arg4: index, %arg5: index):
|
||||
tensor.yield %1 : i32
|
||||
} : tensor<?x?xi32>
|
||||
%3 = subtensor_insert %arg0 into %2[%c0, %arg3] [%c2, %0] [%c1, %c1] : tensor<2x?xi32> into tensor<?x?xi32>
|
||||
return %3 : tensor<?x?xi32>
|
||||
}
|
||||
// CHECK-LABEL: func @subtensor_insert_propagate_dest_cast
|
||||
// CHECK: %[[UPDATED:.+]] = subtensor_insert %{{.+}} into %{{.+}}[0, %{{.+}}] [2, %{{.+}}] [1, 1]
|
||||
// CHECK-SAME: tensor<2x?xi32> into tensor<?x8xi32>
|
||||
// CHECK: %[[CAST:.+]] = tensor.cast %[[UPDATED]]
|
||||
// CHECK: return %[[CAST]]
|
||||
|
|
|
@ -659,10 +659,10 @@ func @matmul_on_tensors(%t0: tensor<32x1024xf32>, %t1: tensor<1024x1024xf32>) ->
|
|||
scf.yield %2 : tensor<?x?xf32>
|
||||
}
|
||||
// CHECK-NOT: tensor.cast
|
||||
// CHECK: %[[RES:.*]] = subtensor_insert %[[FOR_RES]] into %[[T1]][0, 0] [32, 1024] [1, 1] : tensor<32x1024xf32> into tensor<1024x1024xf32>
|
||||
// CHECK: %[[RES:.*]] = tensor.insert_slice %[[FOR_RES]] into %[[T1]][0, 0] [32, 1024] [1, 1] : tensor<32x1024xf32> into tensor<1024x1024xf32>
|
||||
// CHECK: return %[[RES]] : tensor<1024x1024xf32>
|
||||
%2 = tensor.cast %1 : tensor<?x?xf32> to tensor<32x1024xf32>
|
||||
%res = subtensor_insert %2 into %t1[0, 0] [32, 1024] [1, 1] : tensor<32x1024xf32> into tensor<1024x1024xf32>
|
||||
%res = tensor.insert_slice %2 into %t1[0, 0] [32, 1024] [1, 1] : tensor<32x1024xf32> into tensor<1024x1024xf32>
|
||||
return %res : tensor<1024x1024xf32>
|
||||
}
|
||||
|
||||
|
|
|
@ -24,202 +24,6 @@ func @cmpi_equal_operands(%arg0: i64)
|
|||
|
||||
// -----
|
||||
|
||||
func @subtensor_canonicalize(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
|
||||
%arg2 : index) -> tensor<?x?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c4 = constant 4 : index
|
||||
%0 = subtensor %arg0[%c0, %arg1, %c1] [%c4, %c1, %arg2] [%c1, %c1, %c1] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
return %0 : tensor<?x?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @subtensor_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?x?xf32>
|
||||
// CHECK: %[[SUBTENSOR:.+]] = subtensor %[[ARG0]][0, %{{[a-zA-Z0-9_]+}}, 1]
|
||||
// CHECK-SAME: [4, 1, %{{[a-zA-Z0-9_]+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<?x?x?xf32> to tensor<4x1x?xf32>
|
||||
// CHECK: %[[RESULT:.+]] = tensor.cast %[[SUBTENSOR]]
|
||||
// CHEKC: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
func @rank_reducing_subtensor_canonicalize(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
|
||||
%arg2 : index) -> tensor<?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c4 = constant 4 : index
|
||||
%0 = subtensor %arg0[%c0, %arg1, %c1] [%c4, 1, %arg2] [%c1, %c1, %c1] : tensor<?x?x?xf32> to tensor<?x?xf32>
|
||||
return %0 : tensor<?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @rank_reducing_subtensor_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?x?xf32>
|
||||
// CHECK: %[[SUBTENSOR:.+]] = subtensor %[[ARG0]][0, %{{[a-zA-Z0-9_]+}}, 1]
|
||||
// CHECK-SAME: [4, 1, %{{[a-zA-Z0-9_]+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<?x?x?xf32> to tensor<4x?xf32>
|
||||
// CHECK: %[[RESULT:.+]] = tensor.cast %[[SUBTENSOR]]
|
||||
// CHEKC: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @trivial_subtensor
|
||||
// CHECK-SAME: %[[ARG0:.[a-z0-9A-Z_]+]]: tensor<4x6x16x32xi8>
|
||||
// CHECK-NOT: subtensor
|
||||
// CHECK: return %[[ARG0]] : tensor<4x6x16x32xi8>
|
||||
func @trivial_subtensor(%arg0 : tensor<4x6x16x32xi8>) -> tensor<4x6x16x32xi8> {
|
||||
%0 = subtensor %arg0[0, 0, 0, 0] [4, 6, 16, 32] [1, 1, 1, 1] : tensor<4x6x16x32xi8> to tensor<4x6x16x32xi8>
|
||||
return %0 : tensor<4x6x16x32xi8>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @trivial_subtensor_insert
|
||||
// CHECK-SAME: %[[ARG0:.[a-z0-9A-Z_]+]]: tensor<4x6x16x32xi8>
|
||||
// CHECK-NOT: subtensor
|
||||
// CHECK: return %[[ARG0]] : tensor<4x6x16x32xi8>
|
||||
func @trivial_subtensor_insert(%arg0 : tensor<4x6x16x32xi8>, %arg1 : tensor<4x6x16x32xi8>) -> tensor<4x6x16x32xi8> {
|
||||
%0 = subtensor_insert %arg0 into %arg1[0, 0, 0, 0] [4, 6, 16, 32] [1, 1, 1, 1] : tensor<4x6x16x32xi8> into tensor<4x6x16x32xi8>
|
||||
return %0 : tensor<4x6x16x32xi8>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @rank_reducing_tensor_of_cast
|
||||
// CHECK-SAME: %[[ARG0:.[a-z0-9A-Z_]+]]: tensor<4x6x16x32xi8>
|
||||
// CHECK: %[[S:.+]] = subtensor %arg0[0, 1, 0] [1, 1, 16] [1, 1, 1] : tensor<4x6x16x32xi8> to tensor<16x32xi8>
|
||||
// Tensor cast is moved after subtensor and then gets canonicalized away.
|
||||
// CHECK-NOT: tensor.cast
|
||||
// CHECK: return %[[S]] : tensor<16x32xi8>
|
||||
func @rank_reducing_tensor_of_cast(%arg : tensor<4x6x16x32xi8>) -> tensor<16x32xi8> {
|
||||
%0 = tensor.cast %arg : tensor<4x6x16x32xi8> to tensor<?x?x16x32xi8>
|
||||
%1 = subtensor %0[0, 1, 0] [1, 1, 16] [1, 1, 1] : tensor<?x?x16x32xi8> to tensor<16x32xi8>
|
||||
return %1 : tensor<16x32xi8>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @rank_reducing_subtensor_insert_of_cast
|
||||
// CHECK-SAME: %[[A:.[a-z0-9A-Z_]+]]: tensor<16x32xi8>
|
||||
// CHECK-SAME: %[[B:.[a-z0-9A-Z_]+]]: tensor<4x6x16x32xi8>
|
||||
// CHECK: %[[S:.+]] = subtensor_insert %[[A]] into %[[B]][0, 1, 0] [1, 1, 16] [1, 1, 1] : tensor<16x32xi8> into tensor<4x6x16x32xi8>
|
||||
// Tensor cast is folded away.
|
||||
// CHECK-NOT: tensor.cast
|
||||
// CHECK: return %[[S]] : tensor<4x6x16x32xi8>
|
||||
func @rank_reducing_subtensor_insert_of_cast(%a : tensor<16x32xi8>, %b : tensor<4x6x16x32xi8>) -> tensor<4x6x16x32xi8> {
|
||||
%cast = tensor.cast %a : tensor<16x32xi8> to tensor<?x32xi8>
|
||||
%res = subtensor_insert %cast into %b[0, 1, 0] [1, 1, 16] [1, 1, 1] : tensor<?x32xi8> into tensor<4x6x16x32xi8>
|
||||
return %res : tensor<4x6x16x32xi8>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func @subtensor_insert_canonicalize(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
|
||||
%arg2 : index, %arg3 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c4 = constant 4 : index
|
||||
%0 = subtensor_insert %arg0 into %arg3[%c0, %arg1, %c1] [%c4, %c1, %arg2] [%c1, %c1, %c1] : tensor<?x?x?xf32> into tensor<?x?x?xf32>
|
||||
return %0 : tensor<?x?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @subtensor_insert_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
|
||||
// CHECK: %[[RESULT:.+]] = subtensor_insert %[[ARG0]]
|
||||
// CHECK-SAME: [0, %{{.+}}, 1] [4, 1, %{{.+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<?x?x?xf32> into tensor<?x?x?xf32>
|
||||
// CHEKC: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
func @subtensor_to_subtensor_insert_canonicalize(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
|
||||
%arg2 : index, %arg3 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c4 = constant 4 : index
|
||||
%0 = subtensor %arg0[%c0, %arg1, %c1] [%c4, %c1, %arg2] [%c1, %c1, %c1] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
%1 = subtensor_insert %0 into %arg3[%c0, %arg1, %c1] [%c4, %c1, %arg2] [%c1, %c1, %c1] : tensor<?x?x?xf32> into tensor<?x?x?xf32>
|
||||
return %1 : tensor<?x?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @subtensor_to_subtensor_insert_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
|
||||
// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
|
||||
// CHECK: %[[SUBTENSOR:.+]] = subtensor %[[ARG0]]
|
||||
// CHECK-SAME: [0, %{{.+}}, 1] [4, 1, %{{.+}} [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<?x?x?xf32> to tensor<4x1x?xf32>
|
||||
// CHECK: %[[RESULT:.+]] = subtensor_insert %[[SUBTENSOR]]
|
||||
// CHECK-SAME: [0, %{{.+}}, 1] [4, 1, %{{.+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<4x1x?xf32> into tensor<?x?x?xf32>
|
||||
// CHEKC: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
func @rank_reducing_subtensor_insert_canonicalize(%arg0 : tensor<?x?xf32>, %arg1 : index,
|
||||
%arg2 : index, %arg3 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c4 = constant 4 : index
|
||||
%0 = subtensor_insert %arg0 into %arg3[%c0, %arg1, %c1] [%c4, 1, %arg2] [%c1, %c1, %c1] : tensor<?x?xf32> into tensor<?x?x?xf32>
|
||||
return %0 : tensor<?x?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @rank_reducing_subtensor_insert_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?xf32>
|
||||
// CHECK: %[[RESULT:.+]] = subtensor_insert %[[ARG0]]
|
||||
// CHECK-SAME: [0, %{{.+}}, 1] [4, 1, %{{.+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<?x?xf32> into tensor<?x?x?xf32>
|
||||
// CHEKC: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
func @rank_reducing_subtensor_to_subtensor_insert_canonicalize(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
|
||||
%arg2 : index, %arg3 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c4 = constant 4 : index
|
||||
%0 = subtensor %arg0[%c0, %arg1, %c1] [%c4, 1, %arg2] [%c1, %c1, %c1] : tensor<?x?x?xf32> to tensor<?x?xf32>
|
||||
%1 = subtensor_insert %0 into %arg3[%c0, %arg1, %c1] [%c4, 1, %arg2] [%c1, %c1, %c1] : tensor<?x?xf32> into tensor<?x?x?xf32>
|
||||
return %1 : tensor<?x?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @rank_reducing_subtensor_to_subtensor_insert_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
|
||||
// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
|
||||
// CHECK: %[[SUBTENSOR:.+]] = subtensor %[[ARG0]]
|
||||
// CHECK-SAME: [0, %{{.+}}, 1] [4, 1, %{{.+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<?x?x?xf32> to tensor<4x?xf32>
|
||||
// CHECK: %[[RESULT:.+]] = subtensor_insert %[[SUBTENSOR]] into %[[ARG3]]
|
||||
// CHECK-SAME: [0, %{{.+}}, 1] [4, 1, %{{.+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<4x?xf32> into tensor<?x?x?xf32>
|
||||
// CHEKC: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
func @subtensor_insert_output_dest_canonicalize(%arg0 : tensor<2x3xi32>, %arg1 : tensor<i32>) -> tensor<3x9xi32> {
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c2 = constant 2 : index
|
||||
%c9 = constant 9 : index
|
||||
%c3 = constant 3 : index
|
||||
%2 = tensor.extract %arg1[] : tensor<i32>
|
||||
%4 = tensor.generate %c3, %c9 {
|
||||
^bb0(%arg2: index, %arg3: index):
|
||||
tensor.yield %2 : i32
|
||||
} : tensor<?x?xi32>
|
||||
%5 = subtensor_insert %arg0 into %4[%c0, %c1] [%c2, %c3] [1, 1] : tensor<2x3xi32> into tensor<?x?xi32>
|
||||
%6 = tensor.cast %5 : tensor<?x?xi32> to tensor<3x9xi32>
|
||||
return %6 : tensor<3x9xi32>
|
||||
}
|
||||
// CHECK-LABEL: func @subtensor_insert_output_dest_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:[a-zA-z0-9_]+]]: tensor<2x3xi32>
|
||||
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<i32>
|
||||
// CHECK: %[[PAD:.+]] = tensor.extract %[[ARG1]]
|
||||
// CHECK: %[[GENERATE:.+]] = tensor.generate
|
||||
// CHECK: %[[RESULT:.+]] = subtensor_insert %[[ARG0]] into %[[GENERATE]]
|
||||
// CHECK: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: @select_same_val
|
||||
// CHECK: return %arg1
|
||||
func @select_same_val(%arg0: i1, %arg1: i64) -> i64 {
|
||||
|
|
|
@ -263,3 +263,222 @@ func @from_elements.constant() -> tensor<3xindex> {
|
|||
%tensor = tensor.from_elements %c1, %c2, %c1 : tensor<3xindex>
|
||||
return %tensor : tensor<3xindex>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func @slice_canonicalize(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
|
||||
%arg2 : index) -> tensor<?x?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c4 = constant 4 : index
|
||||
%0 = tensor.extract_slice %arg0[%c0, %arg1, %c1] [%c4, %c1, %arg2] [%c1, %c1, %c1] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
return %0 : tensor<?x?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @slice_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?x?xf32>
|
||||
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]][0, %{{[a-zA-Z0-9_]+}}, 1]
|
||||
// CHECK-SAME: [4, 1, %{{[a-zA-Z0-9_]+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<?x?x?xf32> to tensor<4x1x?xf32>
|
||||
// CHECK: %[[RESULT:.+]] = tensor.cast %[[SLICE]]
|
||||
// CHECK: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
func @rank_reducing_slice_canonicalize(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
|
||||
%arg2 : index) -> tensor<?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c4 = constant 4 : index
|
||||
%0 = tensor.extract_slice %arg0[%c0, %arg1, %c1] [%c4, 1, %arg2] [%c1, %c1, %c1] : tensor<?x?x?xf32> to tensor<?x?xf32>
|
||||
return %0 : tensor<?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @rank_reducing_slice_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?x?xf32>
|
||||
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]][0, %{{[a-zA-Z0-9_]+}}, 1]
|
||||
// CHECK-SAME: [4, 1, %{{[a-zA-Z0-9_]+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<?x?x?xf32> to tensor<4x?xf32>
|
||||
// CHECK: %[[RESULT:.+]] = tensor.cast %[[SLICE]]
|
||||
// CHECK: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @trivial_slice
|
||||
// CHECK-SAME: %[[ARG0:.[a-z0-9A-Z_]+]]: tensor<4x6x16x32xi8>
|
||||
// CHECK-NOT: tensor.extract_slice
|
||||
// CHECK: return %[[ARG0]] : tensor<4x6x16x32xi8>
|
||||
func @trivial_slice(%arg0 : tensor<4x6x16x32xi8>) -> tensor<4x6x16x32xi8> {
|
||||
%0 = tensor.extract_slice %arg0[0, 0, 0, 0] [4, 6, 16, 32] [1, 1, 1, 1] : tensor<4x6x16x32xi8> to tensor<4x6x16x32xi8>
|
||||
return %0 : tensor<4x6x16x32xi8>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @trivial_insert_slice
|
||||
// CHECK-SAME: %[[ARG0:.[a-z0-9A-Z_]+]]: tensor<4x6x16x32xi8>
|
||||
// CHECK-NOT: tensor.insert_slice
|
||||
// CHECK: return %[[ARG0]] : tensor<4x6x16x32xi8>
|
||||
func @trivial_insert_slice(%arg0 : tensor<4x6x16x32xi8>, %arg1 : tensor<4x6x16x32xi8>) -> tensor<4x6x16x32xi8> {
|
||||
%0 = tensor.insert_slice %arg0 into %arg1[0, 0, 0, 0] [4, 6, 16, 32] [1, 1, 1, 1] : tensor<4x6x16x32xi8> into tensor<4x6x16x32xi8>
|
||||
return %0 : tensor<4x6x16x32xi8>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @rank_reducing_tensor_of_cast
|
||||
// CHECK-SAME: %[[ARG0:.[a-z0-9A-Z_]+]]: tensor<4x6x16x32xi8>
|
||||
// CHECK: %[[S:.+]] = tensor.extract_slice %arg0[0, 1, 0] [1, 1, 16] [1, 1, 1] : tensor<4x6x16x32xi8> to tensor<16x32xi8>
|
||||
// Tensor cast is moved after slice and then gets canonicalized away.
|
||||
// CHECK-NOT: tensor.cast
|
||||
// CHECK: return %[[S]] : tensor<16x32xi8>
|
||||
func @rank_reducing_tensor_of_cast(%arg : tensor<4x6x16x32xi8>) -> tensor<16x32xi8> {
|
||||
%0 = tensor.cast %arg : tensor<4x6x16x32xi8> to tensor<?x?x16x32xi8>
|
||||
%1 = tensor.extract_slice %0[0, 1, 0] [1, 1, 16] [1, 1, 1] : tensor<?x?x16x32xi8> to tensor<16x32xi8>
|
||||
return %1 : tensor<16x32xi8>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @rank_reducing_insert_slice_of_cast
|
||||
// CHECK-SAME: %[[A:.[a-z0-9A-Z_]+]]: tensor<16x32xi8>
|
||||
// CHECK-SAME: %[[B:.[a-z0-9A-Z_]+]]: tensor<4x6x16x32xi8>
|
||||
// CHECK: %[[S:.+]] = tensor.insert_slice %[[A]] into %[[B]][0, 1, 0] [1, 1, 16] [1, 1, 1] : tensor<16x32xi8> into tensor<4x6x16x32xi8>
|
||||
// Tensor cast is folded away.
|
||||
// CHECK-NOT: tensor.cast
|
||||
// CHECK: return %[[S]] : tensor<4x6x16x32xi8>
|
||||
func @rank_reducing_insert_slice_of_cast(%a : tensor<16x32xi8>, %b : tensor<4x6x16x32xi8>) -> tensor<4x6x16x32xi8> {
|
||||
%cast = tensor.cast %a : tensor<16x32xi8> to tensor<?x32xi8>
|
||||
%res = tensor.insert_slice %cast into %b[0, 1, 0] [1, 1, 16] [1, 1, 1] : tensor<?x32xi8> into tensor<4x6x16x32xi8>
|
||||
return %res : tensor<4x6x16x32xi8>
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
func @insert_slice_canonicalize(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
|
||||
%arg2 : index, %arg3 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c4 = constant 4 : index
|
||||
%0 = tensor.insert_slice %arg0 into %arg3[%c0, %arg1, %c1] [%c4, %c1, %arg2] [%c1, %c1, %c1] : tensor<?x?x?xf32> into tensor<?x?x?xf32>
|
||||
return %0 : tensor<?x?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @insert_slice_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
|
||||
// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[ARG0]]
|
||||
// CHECK-SAME: [0, %{{.+}}, 1] [4, 1, %{{.+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<?x?x?xf32> into tensor<?x?x?xf32>
|
||||
// CHECK: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
func @slice_to_insert_slice_canonicalize(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
|
||||
%arg2 : index, %arg3 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c4 = constant 4 : index
|
||||
%0 = tensor.extract_slice %arg0[%c0, %arg1, %c1] [%c4, %c1, %arg2] [%c1, %c1, %c1] : tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
%1 = tensor.insert_slice %0 into %arg3[%c0, %arg1, %c1] [%c4, %c1, %arg2] [%c1, %c1, %c1] : tensor<?x?x?xf32> into tensor<?x?x?xf32>
|
||||
return %1 : tensor<?x?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @slice_to_insert_slice_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
|
||||
// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
|
||||
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
|
||||
// CHECK-SAME: [0, %{{.+}}, 1] [4, 1, %{{.+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<?x?x?xf32> to tensor<4x1x?xf32>
|
||||
// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[SLICE]]
|
||||
// CHECK-SAME: [0, %{{.+}}, 1] [4, 1, %{{.+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<4x1x?xf32> into tensor<?x?x?xf32>
|
||||
// CHECK: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
func @rank_reducing_insert_slice_canonicalize(%arg0 : tensor<?x?xf32>, %arg1 : index,
|
||||
%arg2 : index, %arg3 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c4 = constant 4 : index
|
||||
%0 = tensor.insert_slice %arg0 into %arg3[%c0, %arg1, %c1] [%c4, 1, %arg2] [%c1, %c1, %c1] : tensor<?x?xf32> into tensor<?x?x?xf32>
|
||||
return %0 : tensor<?x?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @rank_reducing_insert_slice_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:.+]]: tensor<?x?xf32>
|
||||
// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[ARG0]]
|
||||
// CHECK-SAME: [0, %{{.+}}, 1] [4, 1, %{{.+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<?x?xf32> into tensor<?x?x?xf32>
|
||||
// CHECK: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
func @rank_reducing_slice_to_insert_slice_canonicalize(%arg0 : tensor<?x?x?xf32>, %arg1 : index,
|
||||
%arg2 : index, %arg3 : tensor<?x?x?xf32>) -> tensor<?x?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c4 = constant 4 : index
|
||||
%0 = tensor.extract_slice %arg0[%c0, %arg1, %c1] [%c4, 1, %arg2] [%c1, %c1, %c1] : tensor<?x?x?xf32> to tensor<?x?xf32>
|
||||
%1 = tensor.insert_slice %0 into %arg3[%c0, %arg1, %c1] [%c4, 1, %arg2] [%c1, %c1, %c1] : tensor<?x?xf32> into tensor<?x?x?xf32>
|
||||
return %1 : tensor<?x?x?xf32>
|
||||
}
|
||||
// CHECK-LABEL: func @rank_reducing_slice_to_insert_slice_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
|
||||
// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: tensor<?x?x?xf32>
|
||||
// CHECK: %[[SLICE:.+]] = tensor.extract_slice %[[ARG0]]
|
||||
// CHECK-SAME: [0, %{{.+}}, 1] [4, 1, %{{.+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<?x?x?xf32> to tensor<4x?xf32>
|
||||
// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[SLICE]] into %[[ARG3]]
|
||||
// CHECK-SAME: [0, %{{.+}}, 1] [4, 1, %{{.+}}] [1, 1, 1]
|
||||
// CHECK-SAME: : tensor<4x?xf32> into tensor<?x?x?xf32>
|
||||
// CHECK: return %[[RESULT]]
|
||||
|
||||
// -----
|
||||
|
||||
func @insert_slice_propagate_dest_cast(%arg0 : tensor<2x?xi32>, %arg1 : tensor<i32>,
|
||||
%arg2 : index, %arg3 : index) -> tensor<?x?xi32> {
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c2 = constant 2 : index
|
||||
%c8 = constant 8 : index
|
||||
%0 = memref.dim %arg0, %c1 : tensor<2x?xi32>
|
||||
%1 = tensor.extract %arg1[] : tensor<i32>
|
||||
%2 = tensor.generate %arg2, %c8 {
|
||||
^bb0(%arg4: index, %arg5: index):
|
||||
tensor.yield %1 : i32
|
||||
} : tensor<?x?xi32>
|
||||
%3 = tensor.insert_slice %arg0 into %2[%c0, %arg3] [%c2, %0] [%c1, %c1] : tensor<2x?xi32> into tensor<?x?xi32>
|
||||
return %3 : tensor<?x?xi32>
|
||||
}
|
||||
// CHECK-LABEL: func @insert_slice_propagate_dest_cast
|
||||
// CHECK: %[[UPDATED:.+]] = tensor.insert_slice %{{.+}} into %{{.+}}[0, %{{.+}}] [2, %{{.+}}] [1, 1]
|
||||
// CHECK-SAME: tensor<2x?xi32> into tensor<?x8xi32>
|
||||
// CHECK: %[[CAST:.+]] = tensor.cast %[[UPDATED]]
|
||||
// CHECK: return %[[CAST]]
|
||||
|
||||
// -----
|
||||
|
||||
func @insert_slice_output_dest_canonicalize(%arg0 : tensor<2x3xi32>, %arg1 : tensor<i32>) -> tensor<3x9xi32> {
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
%c2 = constant 2 : index
|
||||
%c9 = constant 9 : index
|
||||
%c3 = constant 3 : index
|
||||
%2 = tensor.extract %arg1[] : tensor<i32>
|
||||
%4 = tensor.generate %c3, %c9 {
|
||||
^bb0(%arg2: index, %arg3: index):
|
||||
tensor.yield %2 : i32
|
||||
} : tensor<?x?xi32>
|
||||
%5 = tensor.insert_slice %arg0 into %4[%c0, %c1] [%c2, %c3] [1, 1] : tensor<2x3xi32> into tensor<?x?xi32>
|
||||
%6 = tensor.cast %5 : tensor<?x?xi32> to tensor<3x9xi32>
|
||||
return %6 : tensor<3x9xi32>
|
||||
}
|
||||
// CHECK-LABEL: func @insert_slice_output_dest_canonicalize
|
||||
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<2x3xi32>
|
||||
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: tensor<i32>
|
||||
// CHECK: %[[PAD:.+]] = tensor.extract %[[ARG1]]
|
||||
// CHECK: %[[GENERATE:.+]] = tensor.generate
|
||||
// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[ARG0]] into %[[GENERATE]]
|
||||
// CHECK: return %[[RESULT]]
|
||||
|
|
|
@ -825,31 +825,31 @@ func @assume_alignment(%0: memref<4x4xf16>) {
|
|||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func @subtensor({{.*}}) {
|
||||
func @subtensor(%t: tensor<8x16x4xf32>, %idx : index) {
|
||||
// CHECK-LABEL: func @slice({{.*}}) {
|
||||
func @slice(%t: tensor<8x16x4xf32>, %idx : index) {
|
||||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
|
||||
// CHECK: subtensor
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: tensor<8x16x4xf32> to tensor<?x?x?xf32>
|
||||
%1 = subtensor %t[%c0, %c0, %c0][%idx, %idx, %idx][%c1, %c1, %c1]
|
||||
%1 = tensor.extract_slice %t[%c0, %c0, %c0][%idx, %idx, %idx][%c1, %c1, %c1]
|
||||
: tensor<8x16x4xf32> to tensor<?x?x?xf32>
|
||||
|
||||
// CHECK: subtensor
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: tensor<8x16x4xf32> to tensor<4x4x4xf32>
|
||||
%2 = subtensor %t[0, 2, 0][4, 4, 4][1, 1, 1]
|
||||
%2 = tensor.extract_slice %t[0, 2, 0][4, 4, 4][1, 1, 1]
|
||||
: tensor<8x16x4xf32> to tensor<4x4x4xf32>
|
||||
|
||||
// CHECK: subtensor
|
||||
// CHECK: tensor.extract_slice
|
||||
// CHECK-SAME: tensor<8x16x4xf32> to tensor<4x4xf32>
|
||||
%3 = subtensor %t[0, 2, 0][4, 1, 4][1, 1, 1]
|
||||
%3 = tensor.extract_slice %t[0, 2, 0][4, 1, 4][1, 1, 1]
|
||||
: tensor<8x16x4xf32> to tensor<4x4xf32>
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
// CHECK-LABEL: func @subtensor_insert({{.*}}) {
|
||||
func @subtensor_insert(
|
||||
// CHECK-LABEL: func @insert_slice({{.*}}) {
|
||||
func @insert_slice(
|
||||
%t: tensor<8x16x4xf32>,
|
||||
%t2: tensor<16x32x8xf32>,
|
||||
%t3: tensor<4x4xf32>,
|
||||
|
@ -857,19 +857,19 @@ func @subtensor_insert(
|
|||
%c0 = constant 0 : index
|
||||
%c1 = constant 1 : index
|
||||
|
||||
// CHECK: subtensor_insert
|
||||
// CHECK: tensor.insert_slice
|
||||
// CHECK-SAME: tensor<8x16x4xf32> into tensor<16x32x8xf32>
|
||||
%1 = subtensor_insert %t into %t2[%c0, %c0, %c0][%idx, %idx, %idx][%c1, %c1, %c1]
|
||||
%1 = tensor.insert_slice %t into %t2[%c0, %c0, %c0][%idx, %idx, %idx][%c1, %c1, %c1]
|
||||
: tensor<8x16x4xf32> into tensor<16x32x8xf32>
|
||||
|
||||
// CHECK: subtensor_insert
|
||||
// CHECK: tensor.insert_slice
|
||||
// CHECK-SAME: tensor<8x16x4xf32> into tensor<16x32x8xf32>
|
||||
%2 = subtensor_insert %t into %t2[%c0, %idx, %c0][%idx, 4, %idx][%c1, 1, %c1]
|
||||
%2 = tensor.insert_slice %t into %t2[%c0, %idx, %c0][%idx, 4, %idx][%c1, 1, %c1]
|
||||
: tensor<8x16x4xf32> into tensor<16x32x8xf32>
|
||||
|
||||
// CHECK: subtensor_insert
|
||||
// CHECK: tensor.insert_slice
|
||||
// CHECK-SAME: tensor<4x4xf32> into tensor<8x16x4xf32>
|
||||
%3 = subtensor_insert %t3 into %t[0, 2, 0][4, 1, 4][1, 1, 1]
|
||||
%3 = tensor.insert_slice %t3 into %t[0, 2, 0][4, 1, 4][1, 1, 1]
|
||||
: tensor<4x4xf32> into tensor<8x16x4xf32>
|
||||
|
||||
return
|
||||
|
|
|
@ -1214,9 +1214,9 @@ func @assume_alignment(%0: memref<4x4xf16>) {
|
|||
|
||||
// -----
|
||||
|
||||
func @subtensor_wrong_dynamic_type(%t: tensor<8x16x4xf32>, %idx : index) {
|
||||
func @slice_wrong_dynamic_type(%t: tensor<8x16x4xf32>, %idx : index) {
|
||||
// expected-error @+1 {{expected result type to be 'tensor<4x4x4xf32>' or a rank-reduced version. (mismatch of result sizes)}}
|
||||
%0 = subtensor %t[0, 2, 0][4, 4, 4][1, 1, 1]
|
||||
%0 = tensor.extract_slice %t[0, 2, 0][4, 4, 4][1, 1, 1]
|
||||
: tensor<8x16x4xf32> to tensor<?x4x4xf32>
|
||||
|
||||
return
|
||||
|
@ -1224,9 +1224,9 @@ func @subtensor_wrong_dynamic_type(%t: tensor<8x16x4xf32>, %idx : index) {
|
|||
|
||||
// -----
|
||||
|
||||
func @subtensor_wrong_static_type(%t: tensor<8x16x4xf32>, %idx : index) {
|
||||
func @slice_wrong_static_type(%t: tensor<8x16x4xf32>, %idx : index) {
|
||||
// expected-error @+1 {{expected result type to be 'tensor<?x3x?xf32>' or a rank-reduced version. (mismatch of result sizes)}}
|
||||
%0 = subtensor %t[0, 0, 0][%idx, 3, %idx][1, 1, 1]
|
||||
%0 = tensor.extract_slice %t[0, 0, 0][%idx, 3, %idx][1, 1, 1]
|
||||
: tensor<8x16x4xf32> to tensor<4x4x4xf32>
|
||||
|
||||
return
|
||||
|
|
|
@ -10,12 +10,12 @@ func @main() {
|
|||
%const = constant dense<10.0> : tensor<2xf32>
|
||||
%insert_val = constant dense<20.0> : tensor<1xf32>
|
||||
|
||||
// Both of these subtensor_insert ops insert into the same original tensor
|
||||
// Both of these insert_slice ops insert into the same original tensor
|
||||
// value `%const`. This can easily cause bugs if at the memref level
|
||||
// we attempt to write in-place into the memref that %const has been
|
||||
// converted into.
|
||||
%inserted_at_position_0 = subtensor_insert %insert_val into %const[0][1][1] : tensor<1xf32> into tensor<2xf32>
|
||||
%inserted_at_position_1 = subtensor_insert %insert_val into %const[1][1][1] : tensor<1xf32> into tensor<2xf32>
|
||||
%inserted_at_position_0 = tensor.insert_slice %insert_val into %const[0][1][1] : tensor<1xf32> into tensor<2xf32>
|
||||
%inserted_at_position_1 = tensor.insert_slice %insert_val into %const[1][1][1] : tensor<1xf32> into tensor<2xf32>
|
||||
|
||||
%unranked_at_position_0 = tensor.cast %inserted_at_position_0 : tensor<2xf32> to tensor<*xf32>
|
||||
call @print_memref_f32(%unranked_at_position_0) : (tensor<*xf32>) -> ()
|
||||
|
|
|
@ -9,7 +9,7 @@
|
|||
func @main() {
|
||||
%const = constant dense<10.0> : tensor<2xf32>
|
||||
%insert_val = constant dense<20.0> : tensor<1xf32>
|
||||
%inserted = subtensor_insert %insert_val into %const[0][1][1] : tensor<1xf32> into tensor<2xf32>
|
||||
%inserted = tensor.insert_slice %insert_val into %const[0][1][1] : tensor<1xf32> into tensor<2xf32>
|
||||
|
||||
%unranked = tensor.cast %inserted : tensor<2xf32> to tensor<*xf32>
|
||||
call @print_memref_f32(%unranked) : (tensor<*xf32>) -> ()
|
||||
|
|
|
@ -1065,9 +1065,9 @@ func @memref_cast_folding_subview_static(%V: memref<16x16xf32>, %a: index, %b: i
|
|||
|
||||
// -----
|
||||
|
||||
// CHECK-LABEL: func @subtensor
|
||||
// CHECK-LABEL: func @slice
|
||||
// CHECK-SAME: %[[ARG0:[0-9a-z]*]]: index, %[[ARG1:[0-9a-z]*]]: index
|
||||
func @subtensor(%t: tensor<8x16x4xf32>, %arg0 : index, %arg1 : index)
|
||||
func @slice(%t: tensor<8x16x4xf32>, %arg0 : index, %arg1 : index)
|
||||
-> tensor<?x?x?xf32>
|
||||
{
|
||||
%c0 = constant 0 : index
|
||||
|
@ -1076,18 +1076,18 @@ func @subtensor(%t: tensor<8x16x4xf32>, %arg0 : index, %arg1 : index)
|
|||
%c7 = constant 7 : index
|
||||
%c11 = constant 11 : index
|
||||
|
||||
// CHECK: subtensor %{{.*}}[0, 0, 0] [7, 11, 2] [1, 1, 1] :
|
||||
// CHECK: tensor.extract_slice %{{.*}}[0, 0, 0] [7, 11, 2] [1, 1, 1] :
|
||||
// CHECK-SAME: tensor<8x16x4xf32> to tensor<7x11x2xf32>
|
||||
// tensor.cast gets folded away in consumer.
|
||||
// CHECK-NOT: tensor.cast
|
||||
%1 = subtensor %t[%c0, %c0, %c0] [%c7, %c11, %c2] [%c1, %c1, %c1]
|
||||
%1 = tensor.extract_slice %t[%c0, %c0, %c0] [%c7, %c11, %c2] [%c1, %c1, %c1]
|
||||
: tensor<8x16x4xf32> to tensor<?x?x?xf32>
|
||||
|
||||
// Test: subtensor with one dynamic operand can also be folded.
|
||||
// CHECK: subtensor %{{.*}}[0, 0, 0] [2, %[[ARG0]], 2] [1, 1, 1] :
|
||||
// Test: slice with one dynamic operand can also be folded.
|
||||
// CHECK: tensor.extract_slice %{{.*}}[0, 0, 0] [2, %[[ARG0]], 2] [1, 1, 1] :
|
||||
// CHECK-SAME: tensor<7x11x2xf32> to tensor<2x?x2xf32>
|
||||
// CHECK: tensor.cast %{{.*}} : tensor<2x?x2xf32> to tensor<?x?x?xf32>
|
||||
%2 = subtensor %1[%c0, %c0, %c0] [%c2, %arg0, %c2] [%c1, %c1, %c1]
|
||||
%2 = tensor.extract_slice %1[%c0, %c0, %c0] [%c2, %arg0, %c2] [%c1, %c1, %c1]
|
||||
: tensor<?x?x?xf32> to tensor<?x?x?xf32>
|
||||
|
||||
return %2 : tensor<?x?x?xf32>
|
||||
|
|
|
@ -529,9 +529,9 @@ static void applyPadTensorToGenericPatterns(FuncOp funcOp) {
|
|||
(void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
|
||||
}
|
||||
|
||||
static void applySubTensorOfPadTensorSwapPattern(FuncOp funcOp) {
|
||||
static void applyExtractSliceOfPadTensorSwapPattern(FuncOp funcOp) {
|
||||
RewritePatternSet patterns(funcOp.getContext());
|
||||
patterns.add<SubTensorOfPadTensorSwapPattern>(funcOp.getContext());
|
||||
patterns.add<ExtractSliceOfPadTensorSwapPattern>(funcOp.getContext());
|
||||
(void)applyPatternsAndFoldGreedily(funcOp, std::move(patterns));
|
||||
}
|
||||
|
||||
|
@ -614,7 +614,7 @@ void TestLinalgTransforms::runOnFunction() {
|
|||
if (testTransformPadTensor)
|
||||
return applyPadTensorToGenericPatterns(getFunction());
|
||||
if (testSwapSubTensorPadTensor)
|
||||
return applySubTensorOfPadTensorSwapPattern(getFunction());
|
||||
return applyExtractSliceOfPadTensorSwapPattern(getFunction());
|
||||
if (testAffineMinSCFCanonicalizationPatterns)
|
||||
return applyAffineMinSCFCanonicalizationPatterns(getFunction());
|
||||
if (testTileAndPadPattern)
|
||||
|
|
Loading…
Reference in New Issue