[mlir][linalg] makeTiledShape: No affine.min if tile size == 1

This improves codegen (more static type information) with `scalarize-dynamic-dims`.

Differential Revision: https://reviews.llvm.org/D109415
Matthias Springer 2021-09-14 10:47:39 +09:00
parent fb1def9c66
commit 62883459cd
4 changed files with 100 additions and 20 deletions
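
For illustration, a minimal sketch of the intended effect (the function name and
shapes below are made up, not taken from this patch's tests): when a dynamic
dimension is tiled with tile size 1, the slice size is statically known to be 1,
so no bounding affine.min is emitted and the extract_slice result type carries a
static dimension.

  // Tile size 1 along the first (dynamic) dimension.
  func @tile_size_one_sketch(%t: tensor<?x2000xf32>, %iv: index) -> tensor<1x2000xf32> {
    // Previously the size was guarded by an affine.min such as
    //   %sz = affine.min affine_map<(d0)[s0] -> (1, -d0 + s0)>(%iv)[%dim]
    // which kept the slice type dynamic (tensor<?x2000xf32>).
    %s = tensor.extract_slice %t[%iv, 0] [1, 2000] [1, 1]
        : tensor<?x2000xf32> to tensor<1x2000xf32>
    return %s : tensor<1x2000xf32>
  }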


@@ -519,6 +519,15 @@ void GenerateLoopNest<scf::ParallelOp>::doit(
  assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops");
}

static Value fullyComposeAndAffineApply(OpBuilder &b, Location loc,
                                        AffineExpr expr, ValueRange operands) {
  AffineMap map = AffineMap::inferFromExprList({expr}).front();
  SmallVector<Value> normalizedOperands(operands.begin(), operands.end());
  mlir::fullyComposeAffineMapAndOperands(&map, &normalizedOperands);
  canonicalizeMapAndOperands(&map, &normalizedOperands);
  return b.createOrFold<AffineApplyOp>(loc, map, normalizedOperands);
}

Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
                     ValueRange tileSizes, AffineMap map, ValueRange lbs,
                     ValueRange ubs, ValueRange subShapeSizes) {
@@ -554,16 +563,21 @@ Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
        applyMapToValues(builder, loc, m, subShapeSizes).front();
    // Resulting size needs to be made half open interval again.
    AffineExpr s0 = getAffineSymbolExpr(0, builder.getContext());
    Value size = makeComposedAffineApply(builder, loc, s0 + 1, closedIntSize);
    Value size =
        fullyComposeAndAffineApply(builder, loc, s0 + 1, closedIntSize);
    LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: raw size: " << size << "\n");
    // The size of the subview / extract_slice should be trimmed to avoid
    // out-of-bounds accesses, unless we statically know the subshape size
    // divides the shape size evenly.
    // out-of-bounds accesses, unless:
    // a. We statically know the subshape size divides the shape size evenly.
    // b. The subshape size is 1. According to the way the loops are set up,
    //    tensors with "0" dimensions would never be constructed.
    int64_t shapeSize = shape[r];
    auto sizeCst = size.getDefiningOp<ConstantIndexOp>();
    if (ShapedType::isDynamic(shapeSize) || !sizeCst ||
        (shapeSize % sizeCst.getValue()) != 0) {
    auto hasTileSizeOne = sizeCst && sizeCst.getValue() == 1;
    auto dividesEvenly = sizeCst && !ShapedType::isDynamic(shapeSize) &&
                         ((shapeSize % sizeCst.getValue()) == 0);
    if (!hasTileSizeOne && !dividesEvenly) {
      LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: shapeSize=" << shapeSize
                              << ", size: " << size
                              << ": make sure in bound with affine.min\n");
@@ -577,6 +591,7 @@ Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
      Value d = applyMapToValues(builder, loc, m, ubs).front();
      SmallVector<Value, 4> operands{size, d, offset};
      fullyComposeAffineMapAndOperands(&minMap, &operands);
      canonicalizeMapAndOperands(&minMap, &operands);
      size = builder.create<AffineMinOp>(loc, builder.getIndexType(), minMap,
                                         operands);
    }
@@ -623,7 +638,7 @@ SmallVector<Value> computeTileSizes(OpBuilder &b, Location loc, ValueRange ivs,
    // Before composing, we need to make range a closed interval.
    Value size = isTiled ? tileSizes[idx] : sizeBounds[idx];
    AffineExpr d0 = getAffineDimExpr(0, b.getContext());
    sizes.push_back(makeComposedAffineApply(b, loc, d0 - 1, size));
    sizes.push_back(fullyComposeAndAffineApply(b, loc, d0 - 1, size));
    LLVM_DEBUG(llvm::dbgs() << "computeTileSizes: " << sizes.back() << "\n");
  }
  return sizes;
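
A note on why the new helper fully composes and canonicalizes before
createOrFold (a hedged sketch under the assumption of a constant tile size of
1; the function name below is hypothetical): computeTileSizes produces the
closed-interval size tileSize - 1, and makeTiledShape re-opens it with s0 + 1.
Only after the affine.apply chain is composed and canonicalized does
createOrFold materialize a constant, which is what the sizeCst /
hasTileSizeOne check above can detect.

  // Running mlir-opt -canonicalize on this folds %size to "constant 1 : index",
  // mirroring what fullyComposeAndAffineApply achieves at tiling time.
  func @fold_size_to_one_sketch() -> index {
    %c1 = constant 1 : index
    // Closed interval: tileSize - 1.
    %closed = affine.apply affine_map<(d0) -> (d0 - 1)>(%c1)
    // Re-opened interval: s0 + 1; composed with the producer above this is
    // just the constant 1, so the affine.min for this dimension can be skipped.
    %size = affine.apply affine_map<()[s0] -> (s0 + 1)>()[%closed]
    return %size : index
  }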


@@ -212,7 +212,6 @@ module {
}
}
// CHECK: #[[MAP0:.+]] = affine_map<(d0, d1) -> (16, d0 - d1)>
// CHECK: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
// CHECK: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s0, 16, -d0 + s1)>


@@ -25,3 +25,50 @@ func @matmul_partly_dynamic_tensor(%arg0: tensor<?x?xf32>, %arg1: tensor<?x2000x
outs(%out: tensor<?x2000xf32>) -> tensor<?x2000xf32>
return %r : tensor<?x2000xf32>
}
// -----
// The input IR of this test case is a tiled and peeled linalg.matmul op.
// CHECK-LABEL: func @tiled_and_peeled_matmul(
// CHECK: linalg.matmul ins({{.*}} : tensor<32x259xf32>, tensor<259x258xf32>) outs({{.*}} : tensor<32x258xf32>) -> tensor<32x258xf32>
// CHECK: linalg.matmul ins({{.*}} : tensor<1x259xf32>, tensor<259x258xf32>) outs({{.*}} : tensor<1x258xf32>) -> tensor<1x258xf32>
#map0 = affine_map<(d0) -> (64, -d0 + 257)>
#map1 = affine_map<()[s0] -> ((s0 floordiv 32) * 32)>
#map2 = affine_map<(d0)[s0] -> (d0 - (s0 floordiv 32) * 32)>
func @tiled_and_peeled_matmul(%arg0: tensor<257x259xf32>, %arg1: tensor<259x258xf32>, %arg2: tensor<257x258xf32>) -> tensor<257x258xf32> {
  %c257 = constant 257 : index
  %c64 = constant 64 : index
  %cst = constant 0.000000e+00 : f32
  %c0 = constant 0 : index
  %c32 = constant 32 : index
  %0 = linalg.fill(%cst, %arg2) : f32, tensor<257x258xf32> -> tensor<257x258xf32>
  %1 = scf.for %arg3 = %c0 to %c257 step %c64 iter_args(%arg4 = %0) -> (tensor<257x258xf32>) {
    %2 = affine.min #map0(%arg3)
    %3 = tensor.extract_slice %arg0[%arg3, 0] [%2, 259] [1, 1] : tensor<257x259xf32> to tensor<?x259xf32>
    %4 = tensor.extract_slice %arg4[%arg3, 0] [%2, 258] [1, 1] : tensor<257x258xf32> to tensor<?x258xf32>
    %5 = affine.apply #map1()[%2]
    %6 = scf.for %arg5 = %c0 to %5 step %c32 iter_args(%arg6 = %4) -> (tensor<?x258xf32>) {
      %10 = tensor.extract_slice %3[%arg5, 0] [32, 259] [1, 1] : tensor<?x259xf32> to tensor<32x259xf32>
      %11 = tensor.extract_slice %arg6[%arg5, 0] [32, 258] [1, 1] : tensor<?x258xf32> to tensor<32x258xf32>
      %12 = linalg.matmul {__internal_linalg_transform__ = "tile"} ins(%10, %arg1 : tensor<32x259xf32>, tensor<259x258xf32>) outs(%11 : tensor<32x258xf32>) -> tensor<32x258xf32>
      %13 = tensor.insert_slice %12 into %arg6[%arg5, 0] [32, 258] [1, 1] : tensor<32x258xf32> into tensor<?x258xf32>
      scf.yield %13 : tensor<?x258xf32>
    }
    %7 = cmpi slt, %5, %2 : index
    %8 = scf.if %7 -> (tensor<?x258xf32>) {
      %10 = affine.apply #map2(%2)[%2]
      %11 = tensor.extract_slice %3[%5, 0] [%10, 259] [1, 1] : tensor<?x259xf32> to tensor<?x259xf32>
      %12 = tensor.extract_slice %6[%5, 0] [%10, 258] [1, 1] : tensor<?x258xf32> to tensor<?x258xf32>
      %13 = linalg.matmul {__internal_linalg_transform__ = "tile"} ins(%11, %arg1 : tensor<?x259xf32>, tensor<259x258xf32>) outs(%12 : tensor<?x258xf32>) -> tensor<?x258xf32>
      %14 = tensor.insert_slice %13 into %6[%5, 0] [%10, 258] [1, 1] : tensor<?x258xf32> into tensor<?x258xf32>
      scf.yield %14 : tensor<?x258xf32>
    } else {
      scf.yield %6 : tensor<?x258xf32>
    }
    %9 = tensor.insert_slice %8 into %arg4[%arg3, 0] [%2, 258] [1, 1] : tensor<?x258xf32> into tensor<257x258xf32>
    scf.yield %9 : tensor<257x258xf32>
  }
  return %1 : tensor<257x258xf32>
}


@@ -1,5 +1,5 @@
// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-1d | FileCheck %s
// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-2d | FileCheck %s
// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-1d | FileCheck %s -check-prefix=CHECK-1D
// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-2d | FileCheck %s -check-prefix=CHECK-2D
func @matmul(%A: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
%B: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
@@ -11,17 +11,36 @@ func @matmul(%A: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
return
}
// CHECK-LABEL:func @matmul
// CHECK: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
// CHECK: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
// CHECK: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
// CHECK-1D-LABEL:func @matmul
// CHECK-1D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
// CHECK-1D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
//
// CHECK: linalg.copy
// CHECK: linalg.copy
// CHECK: linalg.copy
// CHECK-1D: vector.transfer_read {{.*}} : memref<8x16xf32, #{{.*}}>, vector<8x16xf32>
// CHECK-1D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32, #{{.*}}>
// CHECK-1D: vector.transfer_read {{.*}} : memref<16x12xf32, #{{.*}}>, vector<16x12xf32>
// CHECK-1D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32, #{{.*}}>
// CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32, #{{.*}}>, vector<8x12xf32>
// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32, #{{.*}}>
//
// CHECK: vector.contract
// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"]
// CHECK-SAME: : vector<8x16xf32>, vector<12x16xf32> into vector<8x12xf32>
// CHECK-1D: vector.contract
// CHECK-1D-SAME: iterator_types = ["parallel", "parallel", "reduction"]
// CHECK-1D-SAME: : vector<8x16xf32>, vector<12x16xf32> into vector<8x12xf32>
//
// CHECK: linalg.copy
// CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32, #{{.*}}>, vector<8x12xf32>
// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32, #{{.*}}>
// CHECK-2D-LABEL:func @matmul
// CHECK-2D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
// CHECK-2D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
// CHECK-2D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
//
// CHECK-2D: linalg.copy
// CHECK-2D: linalg.copy
// CHECK-2D: linalg.copy
//
// CHECK-2D: vector.contract
// CHECK-2D-SAME: iterator_types = ["parallel", "parallel", "reduction"]
// CHECK-2D-SAME: : vector<8x16xf32>, vector<12x16xf32> into vector<8x12xf32>
//
// CHECK-2D: linalg.copy