// llvm-project/mlir/test/Dialect/Linalg/llvm.mlir

// RUN: mlir-opt %s -convert-linalg-to-llvm | FileCheck %s
// RUN: mlir-opt %s -convert-linalg-to-loops -convert-linalg-to-llvm | FileCheck %s --check-prefix=LLVM-LOOPS
func @range(%arg0: index) {
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %R = linalg.range %c0:%arg0:%c1 : !linalg.range
  return
}
// CHECK-LABEL: func @range(%{{.*}}: !llvm.i64) {
// CHECK: llvm.mlir.constant(0 : index) : !llvm.i64
// CHECK-NEXT: llvm.mlir.constant(1 : index) : !llvm.i64
// CHECK-NEXT: llvm.mlir.undef : !llvm<"{ i64, i64, i64 }">
// CHECK-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm<"{ i64, i64, i64 }">
// CHECK-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm<"{ i64, i64, i64 }">
// CHECK-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm<"{ i64, i64, i64 }">
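//
// For reference, a `linalg.range` lowers to a plain LLVM struct of three i64
// fields (min, max, step). An illustrative sketch of the lowered IR, with
// hypothetical value names, not checked by this test:
//   %0 = llvm.mlir.undef : !llvm<"{ i64, i64, i64 }">
//   %1 = llvm.insertvalue %min, %0[0] : !llvm<"{ i64, i64, i64 }">
//   %2 = llvm.insertvalue %max, %1[1] : !llvm<"{ i64, i64, i64 }">
//   %3 = llvm.insertvalue %step, %2[2] : !llvm<"{ i64, i64, i64 }">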
func @slice(%arg0: memref<?xf32, offset: ?, strides: [1]>, %arg1: !linalg.range) {
  %1 = linalg.slice %arg0[%arg1] : memref<?xf32, offset: ?, strides: [1]>, !linalg.range, memref<?xf32, offset: ?, strides: [1]>
  return
}
// CHECK-LABEL: func @slice
// insert data ptr for slice op
// CHECK: llvm.extractvalue %{{.*}}[4, 0] : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
// CHECK-NEXT: llvm.extractvalue %{{.*}}[2] : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
// CHECK-NEXT: llvm.extractvalue %{{.*}}[0] : !llvm<"{ i64, i64, i64 }">
// CHECK-NEXT: llvm.mul %{{.*}}, %{{.*}} : !llvm.i64
// CHECK-NEXT: llvm.add %{{.*}}, %{{.*}} : !llvm.i64
// insert offset
// CHECK: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
// CHECK-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[2] : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
// CHECK-NEXT: llvm.mlir.constant(0 : index)
// CHECK-NEXT: llvm.extractvalue %{{.*}}[0] : !llvm<"{ i64, i64, i64 }">
// CHECK-NEXT: llvm.extractvalue %{{.*}}[1] : !llvm<"{ i64, i64, i64 }">
// CHECK-NEXT: llvm.extractvalue %{{.*}}[2] : !llvm<"{ i64, i64, i64 }">
// get size[0] from parent view
// CHECK-NEXT: llvm.extractvalue %{{.*}}[3, 0] : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
// CHECK-NEXT: llvm.icmp "slt" %{{.*}}, %{{.*}} : !llvm.i64
// CHECK-NEXT: llvm.select %{{.*}}, %{{.*}}, %{{.*}} : !llvm.i1, !llvm.i64
// compute size[0] bounded by parent view's size[0]
// CHECK-NEXT: llvm.sub %{{.*}}, %{{.*}} : !llvm.i64
// bound below by 0
// CHECK-NEXT: llvm.icmp "slt" %{{.*}}, %{{.*}} : !llvm.i64
// CHECK-NEXT: llvm.select %{{.*}}, %{{.*}}, %{{.*}} : !llvm.i1, !llvm.i64
// compute stride[0] using bounded size
// CHECK-NEXT: llvm.mul %{{.*}}, %{{.*}} : !llvm.i64
// insert size and stride
// CHECK-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[3, 0] : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
// CHECK-NEXT: llvm.insertvalue %{{.*}}, %{{.*}}[4, 0] : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
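//
// For reference, the descriptor type { float*, float*, i64, [1 x i64],
// [1 x i64] } is the standard memref lowering: allocated pointer, aligned
// pointer, offset, then the sizes and strides arrays. As the checks above
// show, the slice lowering reads the parent descriptor and computes, roughly:
//   offset' = offset + min * stride
//   size'   = max(min(max, parent_size) - min, 0)
// before rebuilding a fresh descriptor from these values.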
func @dot(%arg0: memref<?xf32, offset: ?, strides: [1]>, %arg1: memref<?xf32, offset: ?, strides: [1]>, %arg2: memref<f32>) {
  linalg.dot(%arg0, %arg1, %arg2) : memref<?xf32, offset: ?, strides: [1]>, memref<?xf32, offset: ?, strides: [1]>, memref<f32>
  return
}
// CHECK-LABEL: func @dot
// CHECK: llvm.call @linalg_dot_viewsxf32_viewsxf32_viewf32(%{{.*}}) :
// CHECK-SAME: !llvm<"float*">, !llvm<"float*">, !llvm.i64, !llvm.i64, !llvm.i64
// CHECK-SAME: !llvm<"float*">, !llvm<"float*">, !llvm.i64, !llvm.i64, !llvm.i64
// CHECK-SAME: !llvm<"float*">, !llvm<"float*">, !llvm.i64
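//
// With the unpacked calling convention, each ranked memref argument expands
// into the individual fields of its descriptor: the two 1-D inputs contribute
// five values each (two pointers, offset, one size, one stride) and the 0-D
// output contributes three (two pointers and the offset), giving the
// 5 + 5 + 3 operand types checked above.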
func @slice_with_range_and_index(%arg0: memref<?x?xf64, offset: ?, strides: [?, 1]>) {
  %c0 = constant 0 : index
  %c1 = constant 1 : index
  %R = linalg.range %c0:%c1:%c1 : !linalg.range
  loop.for %i0 = %c0 to %c1 step %c1 {
    %1 = linalg.slice %arg0[%i0, %R] : memref<?x?xf64, offset: ?, strides: [?, 1]>, index, !linalg.range, memref<?xf64, offset: ?, strides: [1]>
  }
  return
}
// CHECK-LABEL: func @slice_with_range_and_index
// loop-body.
// CHECK: llvm.mlir.undef : !llvm<"{ double*, double*, i64, [1 x i64], [1 x i64] }">
// CHECK: llvm.extractvalue %{{.*}}[4, 0] : !llvm<"{ double*, double*, i64, [2 x i64], [2 x i64] }">
// CHECK: llvm.extractvalue %{{.*}}[4, 1] : !llvm<"{ double*, double*, i64, [2 x i64], [2 x i64] }">
// CHECK: llvm.extractvalue %{{.*}}[2] : !llvm<"{ double*, double*, i64, [2 x i64], [2 x i64] }">
// CHECK: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm<"{ double*, double*, i64, [1 x i64], [1 x i64] }">
// CHECK: llvm.insertvalue %{{.*}}[2] : !llvm<"{ double*, double*, i64, [1 x i64], [1 x i64] }">
// CHECK: llvm.extractvalue %{{.*}}[0] : !llvm<"{ i64, i64, i64 }">
// CHECK: llvm.extractvalue %{{.*}}[1] : !llvm<"{ i64, i64, i64 }">
// CHECK: llvm.insertvalue %{{.*}}[3, 0] : !llvm<"{ double*, double*, i64, [1 x i64], [1 x i64] }">
// CHECK: llvm.insertvalue %{{.*}}[4, 0] : !llvm<"{ double*, double*, i64, [1 x i64], [1 x i64] }">
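//
// Slicing with an index in dimension 0 and a range in dimension 1 is
// rank-reducing: the indexed dimension folds into the offset, and only the
// ranged dimension keeps a size/stride pair. Hence the 2-D parent descriptor
// above feeds a fresh 1-D result descriptor.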
func @copy(%arg0: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, %arg1: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>) {
  linalg.copy(%arg0, %arg1) : memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>
  return
}
// CHECK-LABEL: func @copy
// CHECK: llvm.call @linalg_copy_viewsxsxsxf32_viewsxsxsxf32({{.*}}) :
// CHECK-SAME: !llvm<"float*">, !llvm<"float*">, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64
// CHECK-SAME: !llvm<"float*">, !llvm<"float*">, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64
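//
// Each 3-D memref unpacks into nine values at the call boundary: two
// pointers, the offset, three sizes and three strides, which accounts for
// the two operand groups checked above.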
func @transpose(%arg0: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>) {
  %0 = linalg.transpose %arg0 (i, j, k) -> (k, i, j) : memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>
  return
}
// CHECK-LABEL: func @transpose
// CHECK: llvm.mlir.undef : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[3, 0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[3, 2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[3, 1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[3, 0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[3, 2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[3, 1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
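//
// linalg.transpose moves no data; it only permutes the size/stride entries of
// the descriptor. As the extract/insert pairs above show, for
// (i, j, k) -> (k, i, j) the source entry 0 lands in result slot 2, entry 1
// in slot 0, and entry 2 in slot 1.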
func @copy_transpose(%arg0: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, %arg1: memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>) {
  linalg.copy(%arg0, %arg1) {inputPermutation = affine_map<(i, j, k) -> (i, k, j)>,
                             outputPermutation = affine_map<(i, j, k) -> (k, j, i)>}
    : memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>, memref<?x?x?xf32, offset: ?, strides: [?, ?, 1]>
  return
}
// CHECK-LABEL: func @copy_transpose
// Transpose input
// CHECK: llvm.insertvalue {{.*}}[0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[3, 0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[3, 0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[3, 1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[3, 2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[3, 2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[3, 1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// Transpose output
// CHECK: llvm.insertvalue {{.*}}[0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[3, 0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[3, 2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[3, 1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[3, 1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[3, 2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[3, 0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// Call external copy.
// CHECK: llvm.call @linalg_copy_viewsxsxsxf32_viewsxsxsxf32
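//
// Both permutations are materialized as pure descriptor rewrites before the
// call, so the external copy operates directly on the transposed views.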
#matmul_accesses = [
  affine_map<(m, n, k) -> (m, k)>,
  affine_map<(m, n, k) -> (k, n)>,
  affine_map<(m, n, k) -> (m, n)>
]
#matmul_trait = {
  args_in = 2,
  args_out = 1,
  iterator_types = ["parallel", "parallel", "reduction"],
  indexing_maps = #matmul_accesses,
  library_call = "external_outerproduct_matmul"
}
!vector_type_A = type vector<4xf32>
!vector_type_B = type vector<4xf32>
!vector_type_C = type vector<4x4xf32>
!matrix_type_A = type memref<?x?x!vector_type_A>
!matrix_type_B = type memref<?x?x!vector_type_B>
!matrix_type_C = type memref<?x?x!vector_type_C>
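// The library_call attribute names the external function that the generic op
// is lowered to when converting linalg directly to LLVM; the loop-based path
// (LLVM-LOOPS below) inlines the region body instead of emitting a call.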
func @matmul_vec_impl(%A: !matrix_type_A, %B: !matrix_type_B, %C: !matrix_type_C) {
  linalg.generic #matmul_trait %A, %B, %C {
    ^bb0(%a: !vector_type_A, %b: !vector_type_B, %c: !vector_type_C):
      %d = vector.outerproduct %a, %b, %c: !vector_type_A, !vector_type_B
      linalg.yield %d: !vector_type_C
  } : !matrix_type_A, !matrix_type_B, !matrix_type_C
  return
}
// CHECK-LABEL: func @matmul_vec_impl(
// CHECK: llvm.call @external_outerproduct_matmul(%{{.*}}) :
// CHECK-SAME: !llvm<"<4 x float>*">, !llvm<"<4 x float>*">, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64
// CHECK-SAME: !llvm<"<4 x float>*">, !llvm<"<4 x float>*">, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64
// CHECK-SAME: !llvm<"[4 x <4 x float>]*">, !llvm<"[4 x <4 x float>]*">, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64
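//
// Note the element types: a memref of vector<4xf32> lowers to <4 x float>*
// and a memref of vector<4x4xf32> to [4 x <4 x float>]*, each with the usual
// 2-D descriptor fields (two pointers, offset, two sizes, two strides)
// unpacked around them.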
// LLVM-LOOPS-LABEL: func @matmul_vec_impl(
// LLVM-LOOPS: llvm.shufflevector {{.*}} [0 : i32, 0 : i32, 0 : i32, 0 : i32] : !llvm<"<4 x float>">, !llvm<"<4 x float>">
// LLVM-LOOPS: llvm.shufflevector {{.*}} [1 : i32, 1 : i32, 1 : i32, 1 : i32] : !llvm<"<4 x float>">, !llvm<"<4 x float>">
// LLVM-LOOPS: llvm.shufflevector {{.*}} [2 : i32, 2 : i32, 2 : i32, 2 : i32] : !llvm<"<4 x float>">, !llvm<"<4 x float>">
// LLVM-LOOPS: llvm.shufflevector {{.*}} [3 : i32, 3 : i32, 3 : i32, 3 : i32] : !llvm<"<4 x float>">, !llvm<"<4 x float>">
// LLVM-LOOPS-NEXT: llvm.extractvalue {{.*}}[3] : !llvm<"[4 x <4 x float>]">
// LLVM-LOOPS-NEXT: "llvm.intr.fma"({{.*}}) : (!llvm<"<4 x float>">, !llvm<"<4 x float>">, !llvm<"<4 x float>">) -> !llvm<"<4 x float>">
// LLVM-LOOPS-NEXT: llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm<"[4 x <4 x float>]">
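//
// In the loop lowering, the 4x4 outer product becomes four lane broadcasts of
// %a, each feeding a fused multiply-add against %b and the matching row of
// the accumulator; in scalar pseudocode, roughly:
//   for (i = 0; i < 4; ++i)
//     c[i] = fma(broadcast(a[i]), b, c[i]);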
#indexed_matmul_trait = {
  args_in = 2,
  args_out = 1,
  iterator_types = ["parallel", "parallel", "reduction"],
  indexing_maps = #matmul_accesses,
  library_call = "external_indexed_outerproduct_matmul"
}
func @matmul_vec_indexed(%A: !matrix_type_A,
                         %B: !matrix_type_B,
                         %C: !matrix_type_C) {
  linalg.indexed_generic #indexed_matmul_trait %A, %B, %C {
    ^bb0(%i: index, %j: index, %k: index,
         %a: !vector_type_A, %b: !vector_type_B, %c: !vector_type_C):
      %d = vector.outerproduct %a, %b, %c: !vector_type_A, !vector_type_B
      linalg.yield %d: !vector_type_C
  } : !matrix_type_A, !matrix_type_B, !matrix_type_C
  return
}
// CHECK-LABEL: func @matmul_vec_indexed(
// CHECK: %[[ZERO:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
// CHECK: llvm.call @external_indexed_outerproduct_matmul(%[[ZERO]], %[[ZERO]], %[[ZERO]], %{{.*}}) :
// CHECK-SAME: !llvm<"<4 x float>*">, !llvm<"<4 x float>*">, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64
// CHECK-SAME: !llvm<"<4 x float>*">, !llvm<"<4 x float>*">, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64
// CHECK-SAME: !llvm<"[4 x <4 x float>]*">, !llvm<"[4 x <4 x float>]*">, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64, !llvm.i64
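//
// linalg.indexed_generic additionally passes the iteration indices as leading
// arguments; at this call boundary they lower to the i64 zero constant bound
// to %[[ZERO]] above, ahead of the unpacked memref fields.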
func @reshape_static(%arg0: memref<3x4x5xf32>) {
  // Reshapes that expand and collapse back a contiguous tensor with some 1's.
  %0 = linalg.reshape %arg0 [affine_map<(i, j, k, l, m) -> (i, j)>,
                             affine_map<(i, j, k, l, m) -> (k)>,
                             affine_map<(i, j, k, l, m) -> (l, m)>] :
    memref<3x4x5xf32> into memref<1x3x4x1x5xf32>
  %r0 = linalg.reshape %0 [affine_map<(i, j, k, l, m) -> (i, j)>,
                           affine_map<(i, j, k, l, m) -> (k)>,
                           affine_map<(i, j, k, l, m) -> (l, m)>] :
    memref<1x3x4x1x5xf32> into memref<3x4x5xf32>
  return
}
// CHECK-LABEL: func @reshape_static(
// CHECK: llvm.mlir.undef : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[0] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[1] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[2] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.mlir.constant(1 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[3, 0] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.mlir.constant(3 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[3, 1] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.mlir.constant(4 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[3, 2] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.mlir.constant(1 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[3, 3] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.mlir.constant(5 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[3, 4] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.mlir.constant(60 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[4, 0] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.mlir.constant(20 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[4, 1] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.mlir.constant(5 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[4, 2] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.mlir.constant(5 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[4, 3] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.mlir.constant(1 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[4, 4] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.mlir.undef : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[0] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[1] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.extractvalue {{.*}}[2] : !llvm<"{ float*, float*, i64, [5 x i64], [5 x i64] }">
// CHECK: llvm.insertvalue {{.*}}[2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.mlir.constant(3 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[3, 0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.mlir.constant(4 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[3, 1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.mlir.constant(5 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[3, 2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.mlir.constant(20 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[4, 0] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.mlir.constant(5 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[4, 1] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
// CHECK: llvm.mlir.constant(1 : index) : !llvm.i64
// CHECK: llvm.insertvalue {{.*}}[4, 2] : !llvm<"{ float*, float*, i64, [3 x i64], [3 x i64] }">
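//
// Both reshapes are metadata-only: pointers and offset are copied through and
// only constant sizes/strides are re-materialized. The strides are the
// row-major suffix products of the shape, e.g. for 1x3x4x1x5:
//   stride[4] = 1, stride[3] = 5, stride[2] = 1*5 = 5,
//   stride[1] = 4*5 = 20, stride[0] = 3*20 = 60
// matching the constants checked above.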