forked from OSchip/llvm-project
[mlir][linalg] Add makeComposedPadHighOp.
Add the makeComposedPadHighOp method, which creates a new PadTensorOp only if necessary. If the source to pad is the result of a sequence of padded LinalgOps, the method checks whether padding is needed or whether the padded result of the LinalgOp sequence can be used directly.

Example:
```
%0 = tensor.extract_slice %arg0 [%iv0, %iv1] [%sz0, %sz1]
%1 = linalg.pad_tensor %0 low[0, 0] high[...] { linalg.yield %cst }
%2 = linalg.matmul ins(...) outs(%1)
%3 = tensor.extract_slice %2 [0, 0] [%sz0, %sz1]
```
When padding %3, return %2 instead of introducing
```
%4 = linalg.pad_tensor %3 low[0, 0] high[...] { linalg.yield %cst }
```

Depends On D114161

Reviewed By: nicolasvasilache, pifon2a

Differential Revision: https://reviews.llvm.org/D114175
parent 9300b133c8
commit 86f186efea
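Before the diff, a minimal caller-side sketch of the new entry point. This is only an illustration: `b`, `loc`, `source`, and `paddingValue` are hypothetical names assumed to exist in the surrounding pass; only the `makeComposedPadHighOp` call itself mirrors the signature this commit adds.

```
// Pad `source` up to a static 64x64 f32 tensor. If `source` is a slice of a
// matching padded LinalgOp sequence, the helper returns that padded result
// instead of creating a new linalg.pad_tensor.
auto staticType = RankedTensorType::get({64, 64}, b.getF32Type());
Value padded = makeComposedPadHighOp(b, loc, staticType, source,
                                     paddingValue, /*nofold=*/false);
```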
@@ -93,20 +93,42 @@ FailureOr<int64_t> getConstantUpperBoundForIndex(Value value);
 ///
 /// Example:
 /// ```
 /// %0 = tensor.extract_slice %arg0[3, 4][3, 32][1, 1] : tensor<64x64xf32> to
 /// tensor<3x32xf32>
 /// %1 = tensor.extract_slice %0[0, 5][3, 4][1, 1] : tensor<3x32xf32> to
 /// tensor<3x4xf32>
 /// ```
 /// folds into:
 /// ```
 /// %1 = tensor.extract_slice %arg0[3, 9][3, 4][1, 1] : tensor<64x64xf32> to
 /// tensor<3x4xf32>
 /// ```
 tensor::ExtractSliceOp makeComposedExtractSliceOp(
     OpBuilder &b, Location loc, Value source, ArrayRef<OpFoldResult> offsets,
     ArrayRef<OpFoldResult> sizes, ArrayRef<OpFoldResult> strides);
+
+/// Create a PadTensorOp that pads `source` to the size of the statically sized
+/// `type` whose static sizes are assumed to be greater than the dynamic
+/// `source` size. The padding introduces trailing `pad` values until the target
+/// size is met. If `source` is defined by one or more LinalgOps that have been
+/// padded with the same value and sizes, return their padded result instead of
+/// creating a PadTensorOp.
+///
+/// Example:
+/// ```
+/// %0 = tensor.extract_slice %arg0 [%iv0, %iv1] [%sz0, %sz1]
+/// %1 = linalg.pad_tensor %0 low[0, 0] high[...] { linalg.yield %cst }
+/// %2 = linalg.matmul ins(...) outs(%1)
+/// %3 = tensor.extract_slice %2 [0, 0] [%sz0, %sz1]
+/// ```
+/// makeComposedPadHighOp(source=%3, pad=%cst) returns %2
+/// makeComposedPadHighOp(source=%3, pad=%other_cst) returns %4
+/// ```
+/// %4 = linalg.pad_tensor %3 low[0, 0] high[...] { linalg.yield %other_cst }
+/// ```
+Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
+                            Value source, Value pad, bool nofold);
 
 //===----------------------------------------------------------------------===//
 // Fusion / Tiling utilities
 //===----------------------------------------------------------------------===//
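For orientation, a hedged sketch of how `makeComposedExtractSliceOp` might be called to realize the fold described in its comment above. `b`, `loc`, and `slice0` (standing for the `%0` of the example, itself produced by tensor.extract_slice) are assumptions, not part of this diff.

```
// Request a slice of `slice0`, which is itself a slice of a larger tensor.
// The helper folds the offsets of the two slices and emits a single
// tensor.extract_slice on the original source.
SmallVector<OpFoldResult> offsets = {b.getIndexAttr(0), b.getIndexAttr(5)};
SmallVector<OpFoldResult> sizes = {b.getIndexAttr(3), b.getIndexAttr(4)};
SmallVector<OpFoldResult> strides = {b.getIndexAttr(1), b.getIndexAttr(1)};
tensor::ExtractSliceOp folded =
    makeComposedExtractSliceOp(b, loc, slice0, offsets, sizes, strides);
```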
@@ -211,9 +211,9 @@ static LogicalResult padOperandToSmallestStaticBoundingBox(
   auto staticTensorType = RankedTensorType::get(
       staticSizes, getElementTypeOrSelf(opOperand->get()));
   bool nofold = nofoldFunc ? nofoldFunc(*opOperand) : false;
-  result = linalg::PadTensorOp::createPadHighOp(
-      staticTensorType, opOperand->get(), paddingValue.getValue(),
-      /*nofold=*/nofold, opToPad->getLoc(), b);
+  result =
+      makeComposedPadHighOp(b, opToPad->getLoc(), staticTensorType,
+                            opOperand->get(), paddingValue.getValue(), nofold);
   return success();
 }
 
@@ -322,6 +322,66 @@ tensor::ExtractSliceOp makeComposedExtractSliceOp(
                                           foldedOffsets, sizes, strides);
 }
 
+Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
+                            Value source, Value pad, bool nofold) {
+  assert(type.hasStaticShape() && "expect tensor type to have static shape");
+
+  // Exit if `source` is not defined by an ExtractSliceOp.
+  auto sliceOp = source.getDefiningOp<tensor::ExtractSliceOp>();
+  if (!sliceOp)
+    return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b);
+
+  // Search the `source` use-def chain for padded LinalgOps.
+  Value current = sliceOp.source();
+  while (current) {
+    auto linalgOp = current.getDefiningOp<LinalgOp>();
+    if (!linalgOp)
+      break;
+    OpResult opResult = current.cast<OpResult>();
+    current = linalgOp.getOutputOperand(opResult.getResultNumber())->get();
+  }
+  auto padTensorOp = current ? current.getDefiningOp<PadTensorOp>() : nullptr;
+
+  // Exit if the search fails to match a PadTensorOp at the end of the matched
+  // LinalgOp sequence.
+  if (!padTensorOp)
+    return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b);
+
+  // Exit if the padded result type does not match.
+  if (sliceOp.source().getType() != type)
+    return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b);
+
+  // Exit if the LinalgOps are not high padded.
+  if (llvm::any_of(padTensorOp.getMixedLowPad(), [](OpFoldResult ofr) {
+        return getConstantIntValue(ofr) != static_cast<int64_t>(0);
+      }))
+    return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b);
+
+  // Exit if the dynamic sizes of `sliceOp` do not match the sizes of the
+  // slice padded by `padTensorOp`.
+  auto padTensorOpSliceOp =
+      padTensorOp.source().getDefiningOp<tensor::ExtractSliceOp>();
+  if (!padTensorOpSliceOp ||
+      llvm::any_of(llvm::zip(sliceOp.getMixedSizes(),
+                             padTensorOpSliceOp.getMixedSizes()),
+                   [](std::tuple<OpFoldResult, OpFoldResult> it) {
+                     return !isEqualConstantIntOrValue(std::get<0>(it),
+                                                       std::get<1>(it));
+                   }))
+    return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b);
+
+  // Exit if the padding values do not match.
+  Attribute padTensorOpPadAttr, padAttr;
+  Value padTensorOpPad = padTensorOp.getConstantPaddingValue();
+  if (!padTensorOpPad ||
+      !matchPattern(padTensorOpPad, m_Constant(&padTensorOpPadAttr)) ||
+      !matchPattern(pad, m_Constant(&padAttr)) || padTensorOpPadAttr != padAttr)
+    return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b);
+
+  // Return the padded result if the padding values and sizes match.
+  return sliceOp.source();
+}
+
 /// Specialization to build an scf "for" nest.
 template <>
 void GenerateLoopNest<scf::ForOp>::doit(
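Putting the two helpers together, a hypothetical tiling-style sequence that extracts a dynamic tile and pads it back to a static tile shape, which is exactly where the composition above pays off. All names here (`tensorValue`, `iv0`, `iv1`, `size0`, `size1`, `paddingValue`, `b`, `loc`) are assumptions for illustration, not part of this commit.

```
// Extract a dynamic tile at the loop ivs, then pad it to the static tile
// shape. If the tile is a slice of an already padded LinalgOp sequence with
// matching padding value and sizes, makeComposedPadHighOp reuses that padded
// result instead of emitting a new linalg.pad_tensor.
SmallVector<OpFoldResult> offsets = {iv0, iv1};
SmallVector<OpFoldResult> sizes = {size0, size1};
SmallVector<OpFoldResult> strides = {b.getIndexAttr(1), b.getIndexAttr(1)};
Value tile =
    makeComposedExtractSliceOp(b, loc, tensorValue, offsets, sizes, strides);
auto tileType = RankedTensorType::get({64, 64}, b.getF32Type());
Value padded = makeComposedPadHighOp(b, loc, tileType, tile, paddingValue,
                                     /*nofold=*/false);
```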
@@ -214,6 +214,123 @@ func @dynamic_sizes(%arg0: tensor<?x?xf32>,
 
 // -----
 
+#map0 = affine_map<(d0) -> (64, d0)>
+
+// CHECK: compose_padding
+// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
+func @compose_padding(%arg0: tensor<64x64xf32>,
+                      %iv0 : index) -> tensor<?x?xf32> {
+  %cst = arith.constant 0.0 : f32
+
+  // CHECK: %[[SIZE:.*]] = affine.min
+  %size = affine.min #map0(%iv0)
+
+  // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
+  // CHECK-SAME: [0, 0]
+  // CHECK-SAME: [%[[SIZE]], %[[SIZE]]]
+  // CHECK: %[[T1:.*]] = linalg.pad_tensor %[[T0]]
+  // CHECK: %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]]
+  // CHECK: %[[T3:.*]] = linalg.fill(%{{.*}}, %[[T2]]
+  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
+  %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
+  ^bb0(%arg3: index, %arg4: index):  // no predecessors
+    linalg.yield %cst : f32
+  } : tensor<?x?xf32> to tensor<64x64xf32>
+  %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
+  %3 = linalg.fill(%cst, %2) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
+  %4 = tensor.extract_slice %3[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
+
+  // Check there are no additional pad tensor operations.
+  // CHECK-NOT: linalg.pad_tensor
+
+  // Check the matmul directly uses the result of the fill operation.
+  // CHECK: %[[T4:.*]] = linalg.matmul ins(%[[T3]]
+  // CHECK: %[[T5:.*]] = tensor.extract_slice %[[T4]]
+  // CHECK-SAME: [0, 0]
+  // CHECK-SAME: [%[[SIZE]], %[[SIZE]]]
+  %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
+
+  // CHECK: return %[[T5]]
+  return %5 : tensor<?x?xf32>
+}
+
+// -----
+
+#map0 = affine_map<(d0) -> (64, d0)>
+
+// CHECK: different_padding_values
+func @different_padding_values(%arg0: tensor<64x64xf32>,
+                               %iv0 : index) -> tensor<?x?xf32> {
+  %cst = arith.constant 42.0 : f32
+  %size = affine.min #map0(%iv0)
+  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
+  %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
+  ^bb0(%arg3: index, %arg4: index):  // no predecessors
+    linalg.yield %cst : f32
+  } : tensor<?x?xf32> to tensor<64x64xf32>
+  %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
+  %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
+
+  // Different padding values prevent composing the paddings (42.0 vs. 0.0).
+  // CHECK: = linalg.fill
+  // CHECK: = linalg.pad_tensor
+  // CHECK: = linalg.matmul
+  %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %5 : tensor<?x?xf32>
+}
+
+// -----
+
+#map0 = affine_map<(d0) -> (64, d0)>
+
+// CHECK: different_padding_dynamic_sizes
+func @different_padding_dynamic_sizes(%arg0: tensor<64x64xf32>,
+                                      %iv0 : index) -> tensor<?x?xf32> {
+  %cst = arith.constant 0.0 : f32
+  %size = affine.min #map0(%iv0)
+  %0 = tensor.extract_slice %arg0[0, 0] [%iv0, %iv0] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
+  %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
+  ^bb0(%arg3: index, %arg4: index):  // no predecessors
+    linalg.yield %cst : f32
+  } : tensor<?x?xf32> to tensor<64x64xf32>
+  %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
+  %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
+
+  // Different dynamic sizes prevent composing the paddings (%iv0 vs. %size).
+  // CHECK: = linalg.fill
+  // CHECK: = linalg.pad_tensor
+  // CHECK: = linalg.matmul
+  %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %5 : tensor<?x?xf32>
+}
+
+// -----
+
+#map0 = affine_map<(d0) -> (64, d0)>
+
+// CHECK: different_padding_static_sizes
+func @different_padding_static_sizes(%arg0: tensor<62x62xf32>,
+                                     %iv0 : index) -> tensor<?x?xf32> {
+  %cst = arith.constant 0.0 : f32
+  %size = affine.min #map0(%iv0)
+  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>
+  %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
+  ^bb0(%arg3: index, %arg4: index):  // no predecessors
+    linalg.yield %cst : f32
+  } : tensor<?x?xf32> to tensor<62x62xf32>
+  %2 = linalg.fill(%cst, %1) : f32, tensor<62x62xf32> -> tensor<62x62xf32>
+  %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>
+
+  // Different static sizes prevent composing the paddings (62 vs. 64 derived from #map0).
+  // CHECK: = linalg.fill
+  // CHECK: = linalg.pad_tensor
+  // CHECK: = linalg.matmul
+  %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  return %5 : tensor<?x?xf32>
+}
+
+// -----
+
 #map = affine_map<(d0) -> (7, -d0 + 12)>
 
 // CHECK-FILL: scalar_operand