[mlir][linalg] Add makeComposedPadHighOp.

Add the makeComposedPadHighOp method, which creates a new PadTensorOp only if necessary: if the source to pad is itself the result of a sequence of padded LinalgOps, the method checks whether padding is needed or whether the padded result of that LinalgOp sequence can be used directly.

Example:
```
%0 = tensor.extract_slice %arg0 [%iv0, %iv1] [%sz0, %sz1]
%1 = linalg.pad_tensor %0 low[0, 0] high[...] { linalg.yield %cst }
%2 = linalg.matmul ins(...) outs(%1)
%3 = tensor.extract_slice %2 [0, 0] [%sz0, %sz1]
```
When padding %3, the method returns %2 directly instead of introducing
```
%4 = linalg.pad_tensor %3 low[0, 0] high[...] { linalg.yield %cst }
```
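A minimal caller sketch (the wrapper function and its name are hypothetical; only the makeComposedPadHighOp signature below is part of this change):
```
// Hypothetical helper: pad `source` up to a static 64x64 tensor. The call
// either reuses the padded result of a matching producer chain or emits a
// fresh linalg.pad_tensor.
static Value padToStatic64x64(OpBuilder &b, Location loc, Value source,
                              Value paddingValue) {
  auto type =
      RankedTensorType::get({64, 64}, getElementTypeOrSelf(source.getType()));
  return makeComposedPadHighOp(b, loc, type, source, paddingValue,
                               /*nofold=*/false);
}
```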

Depends On D114161

Reviewed By: nicolasvasilache, pifon2a

Differential Revision: https://reviews.llvm.org/D114175
Commit 86f186efea by Tobias Gysi, 2021-11-24 19:12:39 +00:00
Parent: 9300b133c8
4 changed files with 206 additions and 7 deletions


@@ -93,20 +93,42 @@ FailureOr<int64_t> getConstantUpperBoundForIndex(Value value);
///
/// Example:
/// ```
/// %0 = tensor.extract_slice %arg0[3, 4][3, 32][1, 1] : tensor<64x64xf32> to
///                                                      tensor<3x32xf32>
/// %1 = tensor.extract_slice %0[0, 5][3, 4][1, 1] : tensor<3x32xf32> to
///                                                  tensor<3x4xf32>
/// ```
/// folds into:
/// ```
/// %1 = tensor.extract_slice %arg0[3, 9][3, 4][1, 1] : tensor<64x64xf32> to
///                                                     tensor<3x4xf32>
/// ```
tensor::ExtractSliceOp makeComposedExtractSliceOp(
    OpBuilder &b, Location loc, Value source, ArrayRef<OpFoldResult> offsets,
    ArrayRef<OpFoldResult> sizes, ArrayRef<OpFoldResult> strides);

/// Create a PadTensorOp that pads `source` to the size of the statically sized
/// `type` whose static sizes are assumed to be greater than the dynamic
/// `source` size. The padding introduces trailing `pad` values until the target
/// size is met. If `source` is defined by one or more LinalgOps that have been
/// padded with the same value and sizes, return their padded result instead of
/// creating a PadTensorOp.
///
/// Example:
/// ```
/// %0 = tensor.extract_slice %arg0 [%iv0, %iv1] [%sz0, %sz1]
/// %1 = linalg.pad_tensor %0 low[0, 0] high[...] { linalg.yield %cst }
/// %2 = linalg.matmul ins(...) outs(%1)
/// %3 = tensor.extract_slice %2 [0, 0] [%sz0, %sz1]
/// ```
/// makeComposedPadHighOp(source=%3, pad=%cst) returns %2
/// makeComposedPadHighOp(source=%3, pad=%other_cst) returns %4
/// ```
/// %4 = linalg.pad_tensor %3 low[0, 0] high[...] { linalg.yield %other_cst }
/// ```
Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
                            Value source, Value pad, bool nofold);

//===----------------------------------------------------------------------===//
// Fusion / Tiling utilities
//===----------------------------------------------------------------------===//
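For comparison, a hedged usage sketch of the existing makeComposedExtractSliceOp, assuming a builder `b`, a location `loc`, and the outer slice `%0` from the documentation example bound to a value named `outerSlice`:
```
// Slice a slice: the helper folds the chained offsets [3, 4] and [0, 5] into
// a single tensor.extract_slice at the composed offset [3, 9].
SmallVector<OpFoldResult> offsets = {b.getIndexAttr(0), b.getIndexAttr(5)};
SmallVector<OpFoldResult> sizes = {b.getIndexAttr(3), b.getIndexAttr(4)};
SmallVector<OpFoldResult> strides = {b.getIndexAttr(1), b.getIndexAttr(1)};
tensor::ExtractSliceOp folded =
    makeComposedExtractSliceOp(b, loc, outerSlice, offsets, sizes, strides);
```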


@@ -211,9 +211,9 @@ static LogicalResult padOperandToSmallestStaticBoundingBox(
   auto staticTensorType = RankedTensorType::get(
       staticSizes, getElementTypeOrSelf(opOperand->get()));
   bool nofold = nofoldFunc ? nofoldFunc(*opOperand) : false;
-  result = linalg::PadTensorOp::createPadHighOp(
-      staticTensorType, opOperand->get(), paddingValue.getValue(),
-      /*nofold=*/nofold, opToPad->getLoc(), b);
+  result =
+      makeComposedPadHighOp(b, opToPad->getLoc(), staticTensorType,
+                            opOperand->get(), paddingValue.getValue(), nofold);
   return success();
 }


@@ -322,6 +322,66 @@ tensor::ExtractSliceOp makeComposedExtractSliceOp(
      foldedOffsets, sizes, strides);
}

Value makeComposedPadHighOp(OpBuilder &b, Location loc, RankedTensorType type,
                            Value source, Value pad, bool nofold) {
  assert(type.hasStaticShape() && "expect tensor type to have static shape");

  // Exit if `source` is not defined by an ExtractSliceOp.
  auto sliceOp = source.getDefiningOp<tensor::ExtractSliceOp>();
  if (!sliceOp)
    return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b);

  // Search the `source` use-def chain for padded LinalgOps.
  Value current = sliceOp.source();
  while (current) {
    auto linalgOp = current.getDefiningOp<LinalgOp>();
    if (!linalgOp)
      break;
    OpResult opResult = current.cast<OpResult>();
    current = linalgOp.getOutputOperand(opResult.getResultNumber())->get();
  }
  auto padTensorOp = current ? current.getDefiningOp<PadTensorOp>() : nullptr;

  // Exit if the search fails to match a PadTensorOp at the end of the matched
  // LinalgOp sequence.
  if (!padTensorOp)
    return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b);

  // Exit if the padded result type does not match.
  if (sliceOp.source().getType() != type)
    return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b);

  // Exit if the LinalgOps are not high padded.
  if (llvm::any_of(padTensorOp.getMixedLowPad(), [](OpFoldResult ofr) {
        return getConstantIntValue(ofr) != static_cast<int64_t>(0);
      }))
    return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b);

  // Exit if the sizes of `sliceOp` do not match the sizes of the slice padded
  // by `padTensorOp`.
  auto padTensorOpSliceOp =
      padTensorOp.source().getDefiningOp<tensor::ExtractSliceOp>();
  if (!padTensorOpSliceOp ||
      llvm::any_of(llvm::zip(sliceOp.getMixedSizes(),
                             padTensorOpSliceOp.getMixedSizes()),
                   [](std::tuple<OpFoldResult, OpFoldResult> it) {
                     return !isEqualConstantIntOrValue(std::get<0>(it),
                                                       std::get<1>(it));
                   }))
    return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b);

  // Exit if the padding values do not match.
  Attribute padTensorOpPadAttr, padAttr;
  Value padTensorOpPad = padTensorOp.getConstantPaddingValue();
  if (!padTensorOpPad ||
      !matchPattern(padTensorOpPad, m_Constant(&padTensorOpPadAttr)) ||
      !matchPattern(pad, m_Constant(&padAttr)) || padTensorOpPadAttr != padAttr)
    return PadTensorOp::createPadHighOp(type, source, pad, nofold, loc, b);

  // Return the padded result if the padding values and sizes match.
  return sliceOp.source();
}

/// Specialization to build an scf "for" nest.
template <>
void GenerateLoopNest<scf::ForOp>::doit(
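To illustrate the use-def walk in makeComposedPadHighOp above, here is a minimal matched chain (value names are illustrative; the shape mirrors the tests below):
```
%0 = tensor.extract_slice %arg0 [%iv0, %iv1] [%sz0, %sz1]
%1 = linalg.pad_tensor %0 low[0, 0] high[...] { linalg.yield %cst }  // matched PadTensorOp
%2 = linalg.fill(%cst, %1)            // padded LinalgOp, output operand %1
%3 = linalg.matmul ins(...) outs(%2)  // padded LinalgOp, output operand %2
%4 = tensor.extract_slice %3 [0, 0] [%sz0, %sz1]                     // `source`
```
Starting from `source` = %4, the walk follows the output operands of %3 and %2 to reach %1. If the result type, the low pads, the slice sizes, and the padding value all match, the helper returns the already padded value %3 instead of creating a new PadTensorOp.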


@@ -214,6 +214,123 @@ func @dynamic_sizes(%arg0: tensor<?x?xf32>,
// -----

#map0 = affine_map<(d0) -> (64, d0)>

// CHECK: compose_padding
// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
func @compose_padding(%arg0: tensor<64x64xf32>,
                      %iv0 : index) -> tensor<?x?xf32> {
  %cst = arith.constant 0.0 : f32

  // CHECK: %[[SIZE:.*]] = affine.min
  %size = affine.min #map0(%iv0)

  // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
  // CHECK-SAME: [0, 0]
  // CHECK-SAME: [%[[SIZE]], %[[SIZE]]]
  // CHECK: %[[T1:.*]] = linalg.pad_tensor %[[T0]]
  // CHECK: %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]]
  // CHECK: %[[T3:.*]] = linalg.fill(%{{.*}}, %[[T2]]
  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
  %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
  ^bb0(%arg3: index, %arg4: index):  // no predecessors
    linalg.yield %cst : f32
  } : tensor<?x?xf32> to tensor<64x64xf32>
  %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
  %3 = linalg.fill(%cst, %2) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
  %4 = tensor.extract_slice %3[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

  // Check there are no additional pad tensor operations.
  // CHECK-NOT: linalg.pad_tensor

  // Check the matmul directly uses the result of the fill operation.
  // CHECK: %[[T4:.*]] = linalg.matmul ins(%[[T3]]
  // CHECK: %[[T5:.*]] = tensor.extract_slice %[[T4]]
  // CHECK-SAME: [0, 0]
  // CHECK-SAME: [%[[SIZE]], %[[SIZE]]]
  %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>

  // CHECK: return %[[T5]]
  return %5 : tensor<?x?xf32>
}
// -----

#map0 = affine_map<(d0) -> (64, d0)>

// CHECK: different_padding_values
func @different_padding_values(%arg0: tensor<64x64xf32>,
                               %iv0 : index) -> tensor<?x?xf32> {
  %cst = arith.constant 42.0 : f32
  %size = affine.min #map0(%iv0)
  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
  %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
  ^bb0(%arg3: index, %arg4: index):  // no predecessors
    linalg.yield %cst : f32
  } : tensor<?x?xf32> to tensor<64x64xf32>
  %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
  %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

  // Different padding values prevent composing the paddings (42.0 vs. 0.0).
  // CHECK: = linalg.fill
  // CHECK: = linalg.pad_tensor
  // CHECK: = linalg.matmul
  %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
  return %5 : tensor<?x?xf32>
}
// -----

#map0 = affine_map<(d0) -> (64, d0)>

// CHECK: different_padding_dynamic_sizes
func @different_padding_dynamic_sizes(%arg0: tensor<64x64xf32>,
                                      %iv0 : index) -> tensor<?x?xf32> {
  %cst = arith.constant 0.0 : f32
  %size = affine.min #map0(%iv0)
  %0 = tensor.extract_slice %arg0[0, 0] [%iv0, %iv0] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
  %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
  ^bb0(%arg3: index, %arg4: index):  // no predecessors
    linalg.yield %cst : f32
  } : tensor<?x?xf32> to tensor<64x64xf32>
  %2 = linalg.fill(%cst, %1) : f32, tensor<64x64xf32> -> tensor<64x64xf32>
  %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>

  // Different dynamic sizes prevent composing the paddings (%iv0 vs %size).
  // CHECK: = linalg.fill
  // CHECK: = linalg.pad_tensor
  // CHECK: = linalg.matmul
  %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
  return %5 : tensor<?x?xf32>
}
// -----

#map0 = affine_map<(d0) -> (64, d0)>

// CHECK: different_padding_static_sizes
func @different_padding_static_sizes(%arg0: tensor<62x62xf32>,
                                     %iv0 : index) -> tensor<?x?xf32> {
  %cst = arith.constant 0.0 : f32
  %size = affine.min #map0(%iv0)
  %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>
  %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
  ^bb0(%arg3: index, %arg4: index):  // no predecessors
    linalg.yield %cst : f32
  } : tensor<?x?xf32> to tensor<62x62xf32>
  %2 = linalg.fill(%cst, %1) : f32, tensor<62x62xf32> -> tensor<62x62xf32>
  %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>

  // Different static sizes prevent composing the paddings (62 vs 64 derived from #map0).
  // CHECK: = linalg.fill
  // CHECK: = linalg.pad_tensor
  // CHECK: = linalg.matmul
  %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
  return %5 : tensor<?x?xf32>
}
// -----

#map = affine_map<(d0) -> (7, -d0 + 12)>

// CHECK-FILL: scalar_operand