[mlir][linalg] Perform checks early in hoist padding.
Instead of checking for unexpected operations (any operation with a region, except for scf::For and `padTensorOp`, or any operation with a memory effect) while cloning the packing loop nest, perform the checks early. Update `dropNonIndexDependencies` to check for unexpected operations. Additionally, check that all of these operations have index-type operands only.

Depends On D114428

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D114438
parent fd723eaa92
commit 4b03906346
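The patch replaces an after-the-fact check during cloning with an up-front analysis failure. For readers less familiar with MLIR's LogicalResult idiom, the sketch below illustrates the pattern the patch adopts; it is illustrative only, the helper names are made up, and the real code is in the hunks that follow.

// Illustrative only: the early-exit idiom adopted by this patch, written with
// MLIR's real LogicalResult helpers. `checkSlice` and `runAnalysis` are
// hypothetical stand-ins for dropNonIndexDependencies and the
// HoistingAnalysis constructor shown in the diff below.
#include "mlir/Support/LogicalResult.h"

using namespace mlir;

static LogicalResult checkSlice(bool sliceIsClean) {
  // Report the problem to the caller instead of remembering it for later.
  return sliceIsClean ? success() : failure();
}

static bool runAnalysis(bool sliceIsClean) {
  // Perform the checks early: bail out before any packing loops are built.
  if (failed(checkSlice(sliceIsClean)))
    return false;
  // ... proceed to collect the packing loops ...
  return true;
}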
@@ -43,7 +43,8 @@ using namespace mlir::linalg;
 /// 2. Pad op does not have a constant padding value.
 /// 3. There is no immediately enclosing scf::ForOp.
 /// 4. The backward slice from the pad op to the scf::ForOp to hoist above
-///    contains an unknown op with a region.
+///    contains an unknown op with non index type operands, a region, or a
+///    memory effect.
 /// 5. The backward slice from the pad op to the scf::ForOp to hoist above is
 ///    empty.
 /// 6. The source tensor of pad op is not defined by an extract slice op.
@@ -80,7 +81,8 @@ private:
   /// operands consumed by `padTensorOp` and `sliceOp` and drops the operations
   /// not part of this index computation. Afterwards, the filtered
   /// `backwardSlice` contains only the loops whose induction variable is used,
-  /// directly or indirectly, to index the padded tensor.
+  /// directly or indirectly, to index the padded tensor. The method returns
+  /// failure if the filtered backward slice contains an unexpected operation.
   ///
   /// Example:
   /// ```
@@ -96,8 +98,8 @@ private:
   /// ```
   /// dropNonIndexDependencies(%padded_slice, %slice)
   /// removes [scf.for %k, linalg.fill(%cst, %arg1)] from backwardSlice.
-  void dropNonIndexDependencies(PadTensorOp padTensorOp,
-                                tensor::ExtractSliceOp sliceOp);
+  LogicalResult dropNonIndexDependencies(PadTensorOp padTensorOp,
+                                         tensor::ExtractSliceOp sliceOp);

   /// Encodes whether the analysis is valid and hoisting can proceed.
   bool valid;
@@ -209,18 +211,8 @@ HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int numLoops) {
   // Remove all ops in the backward slice that are not used to index the padded
   // tensor. In particular, keep `padTensorOp`, `sliceOp`, and the loop and
   // affine operations used for the index computation.
-  dropNonIndexDependencies(padTensorOp, sliceOp);
-
-  // Check if an op has a region it is either `padTensorOp`, a scf::ForOp, or a
-  // LinalgOp.
-  for (Operation *op : backwardSlice) {
-    if (op != padTensorOp && op->getNumRegions() > 0 &&
-        !isa<scf::ForOp, LinalgOp>(op)) {
-      LLVM_DEBUG(DBGS() << "Unsupported op with region: " << *op
-                        << " -> skip\n");
-      return;
-    }
-  }
+  if (failed(dropNonIndexDependencies(padTensorOp, sliceOp)))
+    return;

   // Add only the loops part of the filtered `backwardSlice` to the packing
   // loops. All other loops are not used to index the padded data and
@@ -239,8 +231,9 @@ HoistingAnalysis::HoistingAnalysis(PadTensorOp padTensorOp, int numLoops) {
   valid = true;
 }

-void HoistingAnalysis::dropNonIndexDependencies(
-    PadTensorOp padTensorOp, tensor::ExtractSliceOp sliceOp) {
+LogicalResult
+HoistingAnalysis::dropNonIndexDependencies(PadTensorOp padTensorOp,
+                                           tensor::ExtractSliceOp sliceOp) {
   // Set of all values used for index computation.
   SetVector<Value> indexEdges;

@@ -289,7 +282,7 @@ void HoistingAnalysis::dropNonIndexDependencies(
     // Add the index operands of the loop if its induction variable is
     // used for index computation.
     if (auto forOp = dyn_cast<scf::ForOp>(op)) {
-      if (indexEdges.contains(forOp.getInductionVar())) {
+      if (!hasIndexResult(op) && indexEdges.contains(forOp.getInductionVar())) {
         addIndexOperandsToIndexEdges(op);
         continue;
       }
@@ -298,6 +291,21 @@ void HoistingAnalysis::dropNonIndexDependencies(
     // used for index computation.
     if (hasIndexResult(op)) {
       addIndexOperandsToIndexEdges(op);
+      // Check the operands of the remaining operations all have index type.
+      if (llvm::any_of(op->getOperandTypes(),
+                       [](Type type) { return !type.isIndex(); })) {
+        LLVM_DEBUG(DBGS() << "Unsupported op with non index type operands: "
+                          << op << " -> skip\n");
+        return failure();
+      }
+      // Check the remaining operations do not have regions or memory effects.
+      auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
+      bool hasMemoryEffect = effectInterface && !effectInterface.hasNoEffect();
+      if (hasMemoryEffect || op->getNumRegions() != 0) {
+        LLVM_DEBUG(DBGS() << "Unsupported op with region or memory effect: "
+                          << op << " -> skip\n");
+        return failure();
+      }
       continue;
     }
     // Remove all other operation not used by the index computation except for
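The fifteen added lines above are the core of the patch: an op that feeds the index computation is now rejected unless all of its operands have index type and it has neither regions nor memory effects. Read as a standalone predicate, they amount to the reading aid below; the helper name `isAdmissibleIndexOp` is hypothetical and not part of the patch.

#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"
#include "llvm/ADT/STLExtras.h"

using namespace mlir;

// Reading aid: the same conditions the new code tests inline, bundled into a
// hypothetical helper. An op passes only if it is a pure, region-free op that
// computes on index values.
static bool isAdmissibleIndexOp(Operation *op) {
  // Every operand must have index type.
  if (llvm::any_of(op->getOperandTypes(),
                   [](Type type) { return !type.isIndex(); }))
    return false;
  // The op must not allocate, read, or write memory ...
  auto effectInterface = dyn_cast<MemoryEffectOpInterface>(op);
  if (effectInterface && !effectInterface.hasNoEffect())
    return false;
  // ... and must not contain a region (e.g. a nested loop).
  return op->getNumRegions() == 0;
}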
@@ -305,6 +313,7 @@ void HoistingAnalysis::dropNonIndexDependencies(
     if (!isa<arith::ConstantOp>(op))
       backwardSlice.remove(op);
   }
+  return success();
 }

 SmallVector<Value>
@@ -416,18 +425,13 @@ FailureOr<Value> mlir::linalg::hoistPaddingOnTensors(PadTensorOp opToHoist,
     if (auto sliceOp = dyn_cast<tensor::ExtractSliceOp>(op))
       if (bvm.lookupOrDefault(sliceOp.source()) == packedTensor)
         continue;
-    auto effects = dyn_cast<MemoryEffectOpInterface>(op);
-    bool hasNoEffects = !effects || effects.hasNoEffect();
-    if (hasNoEffects &&
-        (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op))) {
+    // Clone all operations except it is a loop.
+    auto forOp = dyn_cast<scf::ForOp>(op);
+    if (!forOp) {
       b.clone(*op, bvm);
       continue;
     }
-    // TODO: support more cases as they appear.
-    auto forOp = dyn_cast<scf::ForOp>(op);
-    assert(forOp && llvm::is_contained(analysis.packingLoops, forOp) &&
-           "expect an scf::ForOp that is a packing loop");
     // Create a packing loop that takes `packedTensor` as iteration argument.
     auto clonedForOp =
         b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
                              bvm.lookupOrDefault(forOp.upperBound()),
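With the checks moved into the analysis, the cloning loop above can assume every non-loop op in the backward slice is safe to clone and every loop it meets is a packing loop. Callers keep interacting with the entry point through its FailureOr<Value> result. A hypothetical usage sketch follows; only the entry point's name and FailureOr<Value> result are taken from the hunk header above, while the include paths, the numLoops parameter, and the use-replacement step are assumptions.

// Hypothetical caller sketch; include paths and the replacement step are
// assumptions, not taken from this patch.
#include "mlir/Dialect/Linalg/IR/LinalgOps.h"
#include "mlir/Dialect/Linalg/Transforms/HoistPadding.h"
#include "mlir/Support/LogicalResult.h"

using namespace mlir;

static void tryHoistPadding(linalg::PadTensorOp padOp, int numLoops) {
  // Attempt to hoist the padding out of `numLoops` enclosing loops.
  FailureOr<Value> hoisted = linalg::hoistPaddingOnTensors(padOp, numLoops);
  if (failed(hoisted))
    return; // e.g. an unexpected op in the backward slice; leave the pad op.
  // On success the returned value computes the padded tensor outside the
  // loops and can stand in for the original pad op result.
  padOp.getResult().replaceAllUsesWith(*hoisted);
}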
@@ -436,3 +436,154 @@ func @non_constant_padding(%arg0: tensor<24x12xf32>,
   return %0 : tensor<24x25xf32>
 }

+// -----
+
+#map0 = affine_map<(d0) -> (5, -d0 + 24)>
+#map1 = affine_map<(d0) -> (7, -d0 + 25)>
+#map2 = affine_map<(d0) -> (-d0 + 5)>
+#map3 = affine_map<(d0) -> (-d0 + 7)>
+
+// CHECK: unexpected_operation
+// CHECK-DOUBLE: unexpected_operation
+// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]*]]: memref<?xindex>
+// CHECK-SAME: %[[ARG4:[0-9a-zA-Z]*]]: i32
+func @unexpected_operation(%arg0: tensor<24x12xf32>,
+                           %arg1: tensor<12x25xf32>,
+                           %arg2: tensor<24x25xf32>,
+                           %arg3: memref<?xindex>,
+                           %arg4: i32) -> tensor<24x25xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c5 = arith.constant 5 : index
+  %c7 = arith.constant 7 : index
+  %c6 = arith.constant 6 : index
+  %c24 = arith.constant 24 : index
+  %c25 = arith.constant 25 : index
+  %c12 = arith.constant 12 : index
+  %c0 = arith.constant 0 : index
+
+  // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
+  %0 = scf.for %arg5 = %c0 to %c24 step %c5 iter_args(%arg6 = %arg2) -> (tensor<24x25xf32>) {
+
+    // CHECK-NEXT: scf.for %[[IV1:[0-9a-zA-Z]*]] =
+    %1 = scf.for %arg7 = %c0 to %c25 step %c7 iter_args(%arg8 = %arg6) -> (tensor<24x25xf32>) {
+
+      // CHECK-NEXT: scf.for %[[IV2:[0-9a-zA-Z]*]] =
+      %2 = scf.for %arg9 = %c0 to %c12 step %c6 iter_args(%arg10 = %arg8) -> (tensor<24x25xf32>) {
+        %3 = affine.min #map0(%arg5)
+        %4 = tensor.extract_slice %arg0[%arg5, %arg9] [%3, 6] [1, 1] : tensor<24x12xf32> to tensor<?x6xf32>
+        %5 = affine.min #map1(%arg7)
+        %6 = tensor.extract_slice %arg1[%arg9, %arg7] [6, %5] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32>
+        %7 = tensor.extract_slice %arg10[%arg5, %arg7] [%3, %5] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32>
+        %8 = affine.apply #map2(%3)
+
+        // Check cannot hoist due to unexpected operation with memory effect.
+        // CHECK: %[[IDX0:.*]] = memref.load %[[ARG3]]
+        // CHECK: %[[T0:.*]] = linalg.pad_tensor {{.*}}, %[[IDX0]]
+        %9 = memref.load %arg3[%c0] : memref<?xindex>
+        %10 = linalg.pad_tensor %4 nofold low[%c0, %c0] high[%8, %9] {
+        ^bb0(%arg11: index, %arg12: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?x6xf32> to tensor<5x6xf32>
+        %11 = affine.apply #map3(%5)
+
+        // Check cannot hoist due to unexpected operation with non index operand.
+        // CHECK: %[[IDX1:.*]] = arith.index_cast %[[ARG4]]
+        // CHECK: %[[T1:.*]] = linalg.pad_tensor {{.*}}[%[[IDX1]]
+        %12 = arith.index_cast %arg4 : i32 to index
+        %13 = linalg.pad_tensor %6 nofold low[%c0, %c0] high[%12, %11] {
+        ^bb0(%arg11: index, %arg12: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<6x?xf32> to tensor<6x7xf32>
+        %14 = linalg.pad_tensor %7 low[%c0, %c0] high[%8, %11] {
+        ^bb0(%arg11: index, %arg12: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?x?xf32> to tensor<5x7xf32>
+
+        // CHECK: = linalg.matmul ins(%[[T0]], %[[T1]]
+        %15 = linalg.matmul ins(%10, %13 : tensor<5x6xf32>, tensor<6x7xf32>) outs(%14 : tensor<5x7xf32>) -> tensor<5x7xf32>
+        %16 = tensor.extract_slice %15[0, 0] [%3, %5] [1, 1] : tensor<5x7xf32> to tensor<?x?xf32>
+        %17 = tensor.insert_slice %16 into %arg10[%arg5, %arg7] [%3, %5] [1, 1] : tensor<?x?xf32> into tensor<24x25xf32>
+        scf.yield %17 : tensor<24x25xf32>
+      }
+      scf.yield %2 : tensor<24x25xf32>
+    }
+    scf.yield %1 : tensor<24x25xf32>
+  }
+  return %0 : tensor<24x25xf32>
+}
+
+// -----
+
+#map0 = affine_map<(d0) -> (5, -d0 + 24)>
+#map1 = affine_map<(d0) -> (7, -d0 + 25)>
+#map2 = affine_map<(d0) -> (-d0 + 5)>
+#map3 = affine_map<(d0) -> (-d0 + 7)>
+
+// CHECK: unexpected_loop
+// CHECK-DOUBLE: unexpected_loop
+// CHECK-SAME: %[[ARG3:[0-9a-zA-Z]*]]: index
+func @unexpected_loop(%arg0: tensor<24x12xf32>,
+                      %arg1: tensor<12x25xf32>,
+                      %arg2: tensor<24x25xf32>,
+                      %arg3: index) -> tensor<24x25xf32> {
+  %c0 = arith.constant 0 : index
+  %c12 = arith.constant 12 : index
+  %c25 = arith.constant 25 : index
+  %c24 = arith.constant 24 : index
+  %c6 = arith.constant 6 : index
+  %c7 = arith.constant 7 : index
+  %c5 = arith.constant 5 : index
+  %cst = arith.constant 0.000000e+00 : f32
+
+  // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
+  %0 = scf.for %arg4 = %c0 to %c24 step %c5 iter_args(%arg5 = %arg2) -> (tensor<24x25xf32>) {
+
+    // CHECK-NEXT: scf.for %[[IV1:[0-9a-zA-Z]*]] =
+    %1 = scf.for %arg6 = %c0 to %c25 step %c7 iter_args(%arg7 = %arg5) -> (tensor<24x25xf32>) {
+
+      // Check the padding of the first input operand is hoisted.
+      // CHECK: = linalg.pad_tensor
+
+      // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] =
+      %2 = scf.for %arg8 = %c0 to %c12 step %c6 iter_args(%arg9 = %arg7) -> (tensor<24x25xf32>) {
+        %3 = affine.min #map0(%arg4)
+        %4 = tensor.extract_slice %arg0[%arg4, %arg8] [%3, 6] [1, 1] : tensor<24x12xf32> to tensor<?x6xf32>
+        %5 = affine.min #map1(%arg6)
+        %6 = tensor.extract_slice %arg1[%arg8, %arg6] [6, %5] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32>
+        %7 = tensor.extract_slice %arg9[%arg4, %arg6] [%3, %5] [1, 1] : tensor<24x25xf32> to tensor<?x?xf32>
+        %8 = affine.apply #map2(%3)
+
+        // Check cannot hoist due to unexpected operation that has a region.
+        // CHECK: %[[IDX0:.*]] = scf.for {{.*}} step %[[ARG3]]
+        // CHECK: %[[T0:.*]] = linalg.pad_tensor {{.*}}, %[[IDX0]]
+        %9 = scf.for %arg10 = %c0 to %c24 step %arg3 iter_args(%arg11 = %c0) -> (index) {
+          %17 = arith.addi %arg3, %arg11 : index
+          scf.yield %17 : index
+        }
+        %10 = linalg.pad_tensor %4 nofold low[%c0, %c0] high[%8, %9] {
+        ^bb0(%arg10: index, %arg11: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?x6xf32> to tensor<5x6xf32>
+        %11 = affine.apply #map3(%5)
+        %12 = linalg.pad_tensor %6 nofold low[%c0, %c0] high[%c0, %11] {
+        ^bb0(%arg10: index, %arg11: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<6x?xf32> to tensor<6x7xf32>
+        %13 = linalg.pad_tensor %7 low[%c0, %c0] high[%8, %11] {
+        ^bb0(%arg10: index, %arg11: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?x?xf32> to tensor<5x7xf32>
+
+        // CHECK: = linalg.matmul ins(%[[T0]]
+        %14 = linalg.matmul ins(%10, %12 : tensor<5x6xf32>, tensor<6x7xf32>) outs(%13 : tensor<5x7xf32>) -> tensor<5x7xf32>
+        %15 = tensor.extract_slice %14[0, 0] [%3, %5] [1, 1] : tensor<5x7xf32> to tensor<?x?xf32>
+        %16 = tensor.insert_slice %15 into %arg9[%arg4, %arg6] [%3, %5] [1, 1] : tensor<?x?xf32> into tensor<24x25xf32>
+        scf.yield %16 : tensor<24x25xf32>
+      }
+      scf.yield %2 : tensor<24x25xf32>
+    }
+    scf.yield %1 : tensor<24x25xf32>
+  }
+  return %0 : tensor<24x25xf32>
+}