[mlir][linalg] makeTiledShape: No affine.min if tile size == 1
This improves codegen (more static type information) with `scalarize-dynamic-dims`.

Differential Revision: https://reviews.llvm.org/D109415
parent fb1def9c66
commit 62883459cd
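The effect is easiest to see on the IR. When the tile size in a dimension is a static 1, the slice can never run out of bounds, so `makeTiledShape` no longer wraps the size in an `affine.min`; fully composing and canonicalizing the size computation (the new `fullyComposeAndAffineApply` helper below) lets it fold to a bare constant, which the new `hasTileSizeOne` check can then detect. A rough before/after sketch (illustrative IR only; the SSA names and the exact affine map are invented, not taken from this commit):

Before, the size is clamped even though it can only ever be 1, and the slice type stays dynamic:

    %size = affine.min affine_map<(d0)[s0] -> (1, -d0 + s0)>(%iv)[%dim]
    %slice = tensor.extract_slice %t[%iv, 0] [%size, 259] [1, 1]
        : tensor<?x259xf32> to tensor<?x259xf32>

After, the constant 1 is used directly and the slice type becomes static:

    %slice = tensor.extract_slice %t[%iv, 0] [1, 259] [1, 1]
        : tensor<?x259xf32> to tensor<1x259xf32>

The peeled partial iteration in the new `tiled_and_peeled_matmul` test below shows the same effect: its matmul now operates on `tensor<1x259xf32>` rather than `tensor<?x259xf32>`.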
@@ -519,6 +519,15 @@ void GenerateLoopNest<scf::ParallelOp>::doit(
   assert(ivs.size() == iteratorTypes.size() && "did not generate enough loops");
 }
 
+static Value fullyComposeAndAffineApply(OpBuilder &b, Location loc,
+                                        AffineExpr expr, ValueRange operands) {
+  AffineMap map = AffineMap::inferFromExprList({expr}).front();
+  SmallVector<Value> normalizedOperands(operands.begin(), operands.end());
+  mlir::fullyComposeAffineMapAndOperands(&map, &normalizedOperands);
+  canonicalizeMapAndOperands(&map, &normalizedOperands);
+  return b.createOrFold<AffineApplyOp>(loc, map, normalizedOperands);
+}
+
 Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
                      ValueRange tileSizes, AffineMap map, ValueRange lbs,
                      ValueRange ubs, ValueRange subShapeSizes) {
@@ -554,16 +563,21 @@ Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
         applyMapToValues(builder, loc, m, subShapeSizes).front();
     // Resulting size needs to be made half open interval again.
     AffineExpr s0 = getAffineSymbolExpr(0, builder.getContext());
-    Value size = makeComposedAffineApply(builder, loc, s0 + 1, closedIntSize);
+    Value size =
+        fullyComposeAndAffineApply(builder, loc, s0 + 1, closedIntSize);
     LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: raw size: " << size << "\n");
 
     // The size of the subview / extract_slice should be trimmed to avoid
-    // out-of-bounds accesses, unless we statically know the subshape size
-    // divides the shape size evenly.
+    // out-of-bounds accesses, unless:
+    // a. We statically know the subshape size divides the shape size evenly.
+    // b. The subshape size is 1. According to the way the loops are set up,
+    //    tensors with "0" dimensions would never be constructed.
     int64_t shapeSize = shape[r];
     auto sizeCst = size.getDefiningOp<ConstantIndexOp>();
-    if (ShapedType::isDynamic(shapeSize) || !sizeCst ||
-        (shapeSize % sizeCst.getValue()) != 0) {
+    auto hasTileSizeOne = sizeCst && sizeCst.getValue() == 1;
+    auto dividesEvenly = sizeCst && !ShapedType::isDynamic(shapeSize) &&
+                         ((shapeSize % sizeCst.getValue()) == 0);
+    if (!hasTileSizeOne && !dividesEvenly) {
       LLVM_DEBUG(llvm::dbgs() << "makeTiledShape: shapeSize=" << shapeSize
                               << ", size: " << size
                               << ": make sure in bound with affine.min\n");
@@ -577,6 +591,7 @@ Value makeTiledShape(OpBuilder &builder, Location loc, Value valueToTile,
       Value d = applyMapToValues(builder, loc, m, ubs).front();
       SmallVector<Value, 4> operands{size, d, offset};
+      fullyComposeAffineMapAndOperands(&minMap, &operands);
       canonicalizeMapAndOperands(&minMap, &operands);
       size = builder.create<AffineMinOp>(loc, builder.getIndexType(), minMap,
                                          operands);
     }
@@ -623,7 +638,7 @@ SmallVector<Value> computeTileSizes(OpBuilder &b, Location loc, ValueRange ivs,
     // Before composing, we need to make range a closed interval.
     Value size = isTiled ? tileSizes[idx] : sizeBounds[idx];
     AffineExpr d0 = getAffineDimExpr(0, b.getContext());
-    sizes.push_back(makeComposedAffineApply(b, loc, d0 - 1, size));
+    sizes.push_back(fullyComposeAndAffineApply(b, loc, d0 - 1, size));
     LLVM_DEBUG(llvm::dbgs() << "computeTileSizes: " << sizes.back() << "\n");
   }
   return sizes;
@@ -212,7 +212,6 @@ module {
   }
 }
 
-// CHaECK: #[[MAP0:.+]] = affine_map<(d0, d1) -> (16, d0 - d1)>
 // CHECK: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)>
 // CHECK: #[[MAP1:.+]] = affine_map<(d0)[s0, s1] -> (-d0 + s0, 16, -d0 + s1)>
@@ -25,3 +25,50 @@ func @matmul_partly_dynamic_tensor(%arg0: tensor<?x?xf32>, %arg1: tensor<?x2000x
     outs(%out: tensor<?x2000xf32>) -> tensor<?x2000xf32>
   return %r : tensor<?x2000xf32>
 }
+
+// -----
+
+// The input IR of this test case is a tiled and peeled linalg.matmul op.
+
+// CHECK-LABEL: func @tiled_and_peeled_matmul(
+// CHECK: linalg.matmul ins({{.*}} : tensor<32x259xf32>, tensor<259x258xf32>) outs({{.*}} : tensor<32x258xf32>) -> tensor<32x258xf32>
+// CHECK: linalg.matmul ins({{.*}} : tensor<1x259xf32>, tensor<259x258xf32>) outs({{.*}} : tensor<1x258xf32>) -> tensor<1x258xf32>
+#map0 = affine_map<(d0) -> (64, -d0 + 257)>
+#map1 = affine_map<()[s0] -> ((s0 floordiv 32) * 32)>
+#map2 = affine_map<(d0)[s0] -> (d0 - (s0 floordiv 32) * 32)>
+
+func @tiled_and_peeled_matmul(%arg0: tensor<257x259xf32>, %arg1: tensor<259x258xf32>, %arg2: tensor<257x258xf32>) -> tensor<257x258xf32> {
+  %c257 = constant 257 : index
+  %c64 = constant 64 : index
+  %cst = constant 0.000000e+00 : f32
+  %c0 = constant 0 : index
+  %c32 = constant 32 : index
+  %0 = linalg.fill(%cst, %arg2) : f32, tensor<257x258xf32> -> tensor<257x258xf32>
+  %1 = scf.for %arg3 = %c0 to %c257 step %c64 iter_args(%arg4 = %0) -> (tensor<257x258xf32>) {
+    %2 = affine.min #map0(%arg3)
+    %3 = tensor.extract_slice %arg0[%arg3, 0] [%2, 259] [1, 1] : tensor<257x259xf32> to tensor<?x259xf32>
+    %4 = tensor.extract_slice %arg4[%arg3, 0] [%2, 258] [1, 1] : tensor<257x258xf32> to tensor<?x258xf32>
+    %5 = affine.apply #map1()[%2]
+    %6 = scf.for %arg5 = %c0 to %5 step %c32 iter_args(%arg6 = %4) -> (tensor<?x258xf32>) {
+      %10 = tensor.extract_slice %3[%arg5, 0] [32, 259] [1, 1] : tensor<?x259xf32> to tensor<32x259xf32>
+      %11 = tensor.extract_slice %arg6[%arg5, 0] [32, 258] [1, 1] : tensor<?x258xf32> to tensor<32x258xf32>
+      %12 = linalg.matmul {__internal_linalg_transform__ = "tile"} ins(%10, %arg1 : tensor<32x259xf32>, tensor<259x258xf32>) outs(%11 : tensor<32x258xf32>) -> tensor<32x258xf32>
+      %13 = tensor.insert_slice %12 into %arg6[%arg5, 0] [32, 258] [1, 1] : tensor<32x258xf32> into tensor<?x258xf32>
+      scf.yield %13 : tensor<?x258xf32>
+    }
+    %7 = cmpi slt, %5, %2 : index
+    %8 = scf.if %7 -> (tensor<?x258xf32>) {
+      %10 = affine.apply #map2(%2)[%2]
+      %11 = tensor.extract_slice %3[%5, 0] [%10, 259] [1, 1] : tensor<?x259xf32> to tensor<?x259xf32>
+      %12 = tensor.extract_slice %6[%5, 0] [%10, 258] [1, 1] : tensor<?x258xf32> to tensor<?x258xf32>
+      %13 = linalg.matmul {__internal_linalg_transform__ = "tile"} ins(%11, %arg1 : tensor<?x259xf32>, tensor<259x258xf32>) outs(%12 : tensor<?x258xf32>) -> tensor<?x258xf32>
+      %14 = tensor.insert_slice %13 into %6[%5, 0] [%10, 258] [1, 1] : tensor<?x258xf32> into tensor<?x258xf32>
+      scf.yield %14 : tensor<?x258xf32>
+    } else {
+      scf.yield %6 : tensor<?x258xf32>
+    }
+    %9 = tensor.insert_slice %8 into %arg4[%arg3, 0] [%2, 258] [1, 1] : tensor<?x258xf32> into tensor<257x258xf32>
+    scf.yield %9 : tensor<257x258xf32>
+  }
+  return %1 : tensor<257x258xf32>
+}
@@ -1,5 +1,5 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-1d | FileCheck %s
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-2d | FileCheck %s
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-1d | FileCheck %s -check-prefix=CHECK-1D
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-matmul-to-vector-patterns-tile-2d | FileCheck %s -check-prefix=CHECK-2D
 
 func @matmul(%A: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
              %B: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
@@ -11,17 +11,36 @@ func @matmul(%A: memref<1584x1584xf32, offset: 0, strides: [1584, 1]>,
   return
 }
 
-// CHECK-LABEL:func @matmul
-// CHECK: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
-// CHECK: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
-// CHECK: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
+// CHECK-1D-LABEL:func @matmul
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
 //
-// CHECK: linalg.copy
-// CHECK: linalg.copy
-// CHECK: linalg.copy
+// CHECK-1D: vector.transfer_read {{.*}} : memref<8x16xf32, #{{.*}}>, vector<8x16xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32, #{{.*}}>
+// CHECK-1D: vector.transfer_read {{.*}} : memref<16x12xf32, #{{.*}}>, vector<16x12xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32, #{{.*}}>
+// CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32, #{{.*}}>, vector<8x12xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32, #{{.*}}>
 //
-// CHECK: vector.contract
-// CHECK-SAME: iterator_types = ["parallel", "parallel", "reduction"]
-// CHECK-SAME: : vector<8x16xf32>, vector<12x16xf32> into vector<8x12xf32>
+// CHECK-1D: vector.contract
+// CHECK-1D-SAME: iterator_types = ["parallel", "parallel", "reduction"]
+// CHECK-1D-SAME: : vector<8x16xf32>, vector<12x16xf32> into vector<8x12xf32>
 //
-// CHECK: linalg.copy
+// CHECK-1D: vector.transfer_read {{.*}} : memref<8x12xf32, #{{.*}}>, vector<8x12xf32>
+// CHECK-1D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32, #{{.*}}>
+
+// CHECK-2D-LABEL:func @matmul
+// CHECK-2D: vector.transfer_write {{.*}} : vector<8x16xf32>, memref<8x16xf32>
+// CHECK-2D: vector.transfer_write {{.*}} : vector<16x12xf32>, memref<16x12xf32>
+// CHECK-2D: vector.transfer_write {{.*}} : vector<8x12xf32>, memref<8x12xf32>
+//
+// CHECK-2D: linalg.copy
+// CHECK-2D: linalg.copy
+// CHECK-2D: linalg.copy
+//
+// CHECK-2D: vector.contract
+// CHECK-2D-SAME: iterator_types = ["parallel", "parallel", "reduction"]
+// CHECK-2D-SAME: : vector<8x16xf32>, vector<12x16xf32> into vector<8x12xf32>
+//
+// CHECK-2D: linalg.copy