[mlir][sparse] incorporate vector index into address computation

When computing a dense address, a vectorized index must be accounted for
properly. This bug previously went undetected because in most cases the
computation is 0 * prev + i, which folds away the scalar part. Now it works
for all cases.

Reviewed By: bixia

Differential Revision: https://reviews.llvm.org/D97317
commit 17fa919847
parent 6740694742
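To illustrate the fix: the sketch below shows, roughly, the IR shape the new genAddress helper is meant to produce for a dense address sz * p + i inside a vectorized loop, where the scalar prefix product is cast to the element type of the vectorized index and broadcast before the add. This is a minimal standalone sketch; the function name, value names, and the vector<16xi32> type are illustrative assumptions, not taken from the patch or the test below.

// Hypothetical example (not part of the patch): dense address computation
// when the loop index %i has been vectorized.
func @dense_address(%size: index, %p: index, %i: vector<16xi32>) -> vector<16xi32> {
  // Scalar prefix part of the address: sz * p, still an index.
  %m = muli %size, %p : index
  // Cast the scalar part to the element type of the vectorized index ...
  %c = index_cast %m : index to i32
  // ... and broadcast it so it can be combined lane-wise with the vector index.
  %v = vector.broadcast %c : i32 to vector<16xi32>
  // Final dense address sz * p + i, computed per lane.
  %a = addi %v, %i : vector<16xi32>
  return %a : vector<16xi32>
}

For the outermost dense dimension the prefix p is the constant 0, so the scalar part folds away and the address reduces to just i; that is why the missing broadcast went unnoticed until now.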
@@ -182,6 +182,7 @@ public:
         continue;
       // Conjunction already covered?
       for (unsigned p2 : latSets[s]) {
+        assert(!latGT(p1, p2)); // Lj => Li would be bad
         if (onlyDenseDiff(p2, p1)) {
           add = false;
           break;

@@ -752,6 +753,17 @@ static Value genInvariantValue(Merger &merger, CodeGen &codegen,
   return val;
 }
 
+/// Generates an address computation "sz * p + i".
+static Value genAddress(CodeGen &codegen, PatternRewriter &rewriter,
+                        Location loc, Value size, Value p, Value i) {
+  Value mul = rewriter.create<MulIOp>(loc, size, p);
+  if (auto vtp = i.getType().dyn_cast<VectorType>()) {
+    Value inv = rewriter.create<IndexCastOp>(loc, mul, vtp.getElementType());
+    mul = genVectorInvariantValue(codegen, rewriter, inv);
+  }
+  return rewriter.create<AddIOp>(loc, mul, i);
+}
+
 /// Recursively generates tensor expression.
 static Value genExp(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,
                     linalg::GenericOp op, unsigned exp) {

@@ -1073,9 +1085,8 @@ static void genLocals(Merger &merger, CodeGen &codegen,
         break;
       Value p = (pat == 0) ? rewriter.create<ConstantIndexOp>(loc, 0)
                            : codegen.pidxs[tensor][topSort[pat - 1]];
-      Value m = rewriter.create<MulIOp>(loc, codegen.sizes[idx], p);
-      codegen.pidxs[tensor][idx] =
-          rewriter.create<AddIOp>(loc, m, codegen.loops[idx]);
+      codegen.pidxs[tensor][idx] = genAddress(
+          codegen, rewriter, loc, codegen.sizes[idx], p, codegen.loops[idx]);
     }
   }
 }

@@ -145,6 +145,40 @@ func @mul_s(%arga: tensor<1024xf32>, %argb: tensor<1024xf32>, %argx: tensor<1024
   return %0 : tensor<1024xf32>
 }
 
+//
+// CHECK-VEC2-LABEL: func @mul_s_alt
+// CHECK-VEC2-DAG: %[[c0:.*]] = constant 0 : index
+// CHECK-VEC2-DAG: %[[c1:.*]] = constant 1 : index
+// CHECK-VEC2-DAG: %[[c16:.*]] = constant 16 : index
+// CHECK-VEC2: %[[p:.*]] = load %{{.*}}[%[[c0]]] : memref<?xi32>
+// CHECK-VEC2: %[[q:.*]] = index_cast %[[p]] : i32 to index
+// CHECK-VEC2: %[[r:.*]] = load %{{.*}}[%[[c1]]] : memref<?xi32>
+// CHECK-VEC2: %[[s:.*]] = index_cast %[[r]] : i32 to index
+// CHECK-VEC2: scf.for %[[i:.*]] = %[[q]] to %[[s]] step %[[c16]] {
+// CHECK-VEC2:   %[[sub:.*]] = subi %[[s]], %[[i]] : index
+// CHECK-VEC2:   %[[mask:.*]] = vector.create_mask %[[sub]] : vector<16xi1>
+// CHECK-VEC2:   %[[li:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xi32>, vector<16xi1>, vector<16xi32> into vector<16xi32>
+// CHECK-VEC2:   %[[la:.*]] = vector.maskedload %{{.*}}[%[[i]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK-VEC2:   %[[lb:.*]] = vector.gather %{{.*}}[%[[li]]], %[[mask]], %{{.*}} : memref<?xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
+// CHECK-VEC2:   %[[m:.*]] = mulf %[[la]], %[[lb]] : vector<16xf32>
+// CHECK-VEC2:   vector.scatter %{{.*}}[%[[li]]], %[[mask]], %[[m]] : memref<1024xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32>
+// CHECK-VEC2: }
+// CHECK-VEC2: return
+//
+!SparseTensor = type !llvm.ptr<i8>
+func @mul_s_alt(%argA: !SparseTensor, %argB: !SparseTensor, %argx: tensor<1024xf32>) -> tensor<1024xf32> {
+  %arga = linalg.sparse_tensor %argA : !SparseTensor to tensor<1024xf32>
+  %argb = linalg.sparse_tensor %argB : !SparseTensor to tensor<1024xf32>
+  %0 = linalg.generic #trait_mul_s
+    ins(%arga, %argb: tensor<1024xf32>, tensor<1024xf32>)
+    outs(%argx: tensor<1024xf32>) {
+      ^bb(%a: f32, %b: f32, %x: f32):
+        %0 = mulf %a, %b : f32
+        linalg.yield %0 : f32
+  } -> tensor<1024xf32>
+  return %0 : tensor<1024xf32>
+}
+
 #trait_reduction_d = {
   indexing_maps = [
     affine_map<(i) -> (i)>, // a