[mlir][sparse] mask reduction update
Reduction updates should be masked, just like the loads and stores. Note that, alternatively, we could use the fact that masked values are zero for += updates and mask invariants to get this working, but that would not work for *= updates. Masking the update itself is cleanest. This change also replaces the constant mask with a broadcast of "true", since this constant folds much better for various folding patterns.

Reviewed By: nicolasvasilache

Differential Revision: https://reviews.llvm.org/D98000
commit adc35b689f (parent 87d5b34c24)
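Before the diff itself, here is a minimal IR sketch of the pattern this change produces, written against the same vector<16xf32>/vector<16xi1> shapes as the new test below (the SSA names are illustrative, not taken from the generated code): the accumulator update is computed for all lanes, and a select with the loop mask keeps the previous accumulator value in the inactive lanes. Zero-filled inactive lanes would be harmless for a += chain, but would wipe out a *= chain, so masking the update itself stays correct for any reduction operator.

    // Update all lanes of the running reduction vector %red_in ...
    %upd = addf %red_in, %m : vector<16xf32>
    // ... then keep the old accumulator value in the masked-off lanes.
    %new = select %mask, %upd, %red_in : vector<16xi1>, vector<16xf32>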
@@ -635,8 +635,8 @@ static Value genVectorMask(CodeGen &codegen, PatternRewriter &rewriter,
       matchPattern(hi, m_Constant(&hiInt)) &&
       matchPattern(step, m_Constant(&stepInt))) {
     if (((hiInt.getInt() - loInt.getInt()) % stepInt.getInt()) == 0)
-      return rewriter.create<vector::ConstantMaskOp>(
-          loc, mtp, rewriter.getI64ArrayAttr(codegen.curVecLength));
+      return rewriter.create<vector::BroadcastOp>(
+          loc, mtp, rewriter.create<ConstantIntOp>(loc, 1, 1));
   }
   // Otherwise, generate a vector mask that avoids overrunning the upperbound
   // during vector execution. Here we rely on subsequent loop optimizations to
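In IR terms, the hunk above changes how an all-true mask is materialized when the trip count is known to be a multiple of the vector length. A rough before/after sketch (illustrative names, assuming a 16-lane mask as in the new test below):

    // Before: an explicit all-true constant mask.
    %mask_old = vector.constant_mask [16] : vector<16xi1>
    // After: broadcast of the scalar constant true, which existing
    // folding patterns handle much better.
    %true = constant true
    %mask_new = vector.broadcast %true : i1 to vector<16xi1>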
@@ -723,9 +723,13 @@ static Value genTensorLoad(Merger &merger, CodeGen &codegen,
 static void genTensorStore(Merger &merger, CodeGen &codegen,
                            PatternRewriter &rewriter, linalg::GenericOp op,
                            unsigned tensor, Value rhs) {
+  Location loc = op.getLoc();
   // Test if this is a scalarized reduction.
   unsigned lhs = op.getNumShapedOperands() - 1;
   if (lhs == tensor && codegen.redVal) {
+    if (codegen.curVecLength > 1)
+      rhs = rewriter.create<SelectOp>(loc, codegen.curVecMask, rhs,
+                                      codegen.redVal);
     codegen.redVal = rhs;
     return;
   }
@@ -736,7 +740,6 @@ static void genTensorStore(Merger &merger, CodeGen &codegen,
     unsigned idx = map.getDimPosition(i);
     args.push_back(codegen.loops[idx]); // universal dense index
   }
-  Location loc = op.getLoc();
   Value ptr = codegen.buffers[tensor];
   if (codegen.curVecLength > 1)
     genVectorStore(codegen, rewriter, rhs, ptr, args);
@@ -798,7 +801,7 @@ static void genReductionEnd(Merger &merger, CodeGen &codegen,
     return;
   codegen.redVal = merger.exp(codegen.redExp).val = Value(); // end chain
   unsigned lhs = op.getNumShapedOperands() - 1;
-  if (codegen.curVecLength > 1) {
+  if (red.getType().isa<VectorType>()) {
     // TODO: assumes + reductions for now
     codegen.curVecLength = 1;
     Value ld = genTensorLoad(merger, codegen, rewriter, op, codegen.redExp);
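Once the vectorized loop finishes, the scalarized reduction still lives in a vector accumulator, and genReductionEnd folds it into the scalar result with a horizontal reduction. A one-line sketch with illustrative names (the "add" kind matches the TODO above, which notes that only + reductions are assumed here for now):

    // Reduce the vector accumulator %red into a scalar, fused with the
    // previously loaded scalar value %x.
    %res = vector.reduction "add", %red, %x : vector<16xf32> into f32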
@@ -250,6 +250,37 @@ func @reduction_d(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>, %argx: tenso
   return %0 : tensor<f32>
 }
 
+//
+// CHECK-VEC1-LABEL: func @reduction_17
+// CHECK-VEC1-DAG: %[[c0:.*]] = constant 0 : index
+// CHECK-VEC1-DAG: %[[c16:.*]] = constant 16 : index
+// CHECK-VEC1-DAG: %[[c17:.*]] = constant 17 : index
+// CHECK-VEC1-DAG: %[[v0:.*]] = constant dense<0.000000e+00> : vector<16xf32>
+// CHECK-VEC1: %[[red:.*]] = scf.for %[[i:.*]] = %[[c0]] to %[[c17]] step %[[c16]] iter_args(%[[red_in:.*]] = %[[v0]]) -> (vector<16xf32>) {
+// CHECK-VEC1:   %[[sub:.*]] = subi %[[c17]], %[[i]] : index
+// CHECK-VEC1:   %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
+// CHECK-VEC1:   %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<17xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK-VEC1:   %[[lb:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<17xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK-VEC1:   %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
+// CHECK-VEC1:   %[[a:.*]] = addf %[[red_in]], %[[m]] : vector<16xf32>
+// CHECK-VEC1:   %[[s:.*]] = select %[[mask]], %[[a]], %[[red_in]] : vector<16xi1>, vector<16xf32>
+// CHECK-VEC1:   scf.yield %[[s]] : vector<16xf32>
+// CHECK-VEC1: }
+// CHECK-VEC1: %{{.*}} = vector.reduction "add", %[[red]], %{{.*}} : vector<16xf32> into f32
+// CHECK-VEC1: return
+//
+func @reduction_17(%arga: tensor<17xf32>, %argb: tensor<17xf32>, %argx: tensor<f32>) -> tensor<f32> {
+  %0 = linalg.generic #trait_reduction_d
+    ins(%arga, %argb: tensor<17xf32>, tensor<17xf32>)
+    outs(%argx: tensor<f32>) {
+      ^bb(%a: f32, %b: f32, %x: f32):
+        %0 = mulf %a, %b : f32
+        %1 = addf %x, %0 : f32
+        linalg.yield %1 : f32
+  } -> tensor<f32>
+  return %0 : tensor<f32>
+}
+
 #trait_mul_ds = {
   indexing_maps = [
     affine_map<(i,j) -> (i,j)>, // a