[mlir][nvvm][rocdl] refactor NVVM and ROCDL dialect. NFC.

- Extract common logic between -convert-gpu-to-nvvm and -convert-gpu-to-rocdl.
- Cope with the fact that alloca operates on different addrspaces between NVVM
  and ROCDL.
- Modernize unit tests for ROCDL dialect.

Differential Revision: https://reviews.llvm.org/D79021
This commit is contained in:
Wen-Heng (Jack) Chung 2020-05-01 00:05:28 +02:00 committed by Alex Zinenko
parent 291d24838f
commit 9ad5e57316
7 changed files with 503 additions and 360 deletions

View File

@ -11,11 +11,19 @@
#include <memory>
namespace mlir {
class LLVMTypeConverter;
class OwningRewritePatternList;
template <typename OpT>
class OperationPass;
namespace gpu {
class GPUModuleOp;
} // namespace gpu
template <typename OpT> class OperationPass;
/// Collect a set of patterns to convert from the GPU dialect to ROCDL.
/// The patterns are constructed with `converter` and appended to `patterns`.
void populateGpuToROCDLConversionPatterns(LLVMTypeConverter &converter,
OwningRewritePatternList &patterns);
/// Creates a pass that lowers GPU dialect operations to ROCDL counterparts.
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>

View File

@ -0,0 +1,171 @@
//===- GPUOpsLowering.h - GPU FuncOp / ReturnOp lowering -------*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
#define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"
namespace mlir {
/// Lowers `gpu.func` to `llvm.func`.
///
/// Workgroup memory attributions become internal `llvm.mlir.global` arrays in
/// the GPU dialect's workgroup address space; private memory attributions
/// become `llvm.alloca`s. Both are wrapped in static-shape memref descriptors
/// that are remapped onto the corresponding entry block arguments.
///
/// `AllocaAddrSpace` is the address space of the pointer type produced by the
/// `alloca`s created for private attributions, so each instantiation can pick
/// the value its target needs.
template <unsigned AllocaAddrSpace>
struct GPUFuncOpLowering : ConvertToLLVMPattern {
explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter)
: ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(),
typeConverter.getDialect()->getContext(),
typeConverter) {}
LogicalResult
matchAndRewrite(Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
assert(operands.empty() && "func op is not expected to have operands");
auto gpuFuncOp = cast<gpu::GPUFuncOp>(op);
Location loc = gpuFuncOp.getLoc();
// Create one global array per workgroup attribution. Only statically shaped
// memrefs are expected here; the array holds all elements of the memref.
SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
Value attribution = en.value();
auto type = attribution.getType().dyn_cast<MemRefType>();
assert(type && type.hasStaticShape() && "unexpected type in attribution");
uint64_t numElements = type.getNumElements();
auto elementType = typeConverter.convertType(type.getElementType())
.template cast<LLVM::LLVMType>();
auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
// The global's name is built from the function name and attribution index.
std::string name = std::string(
llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
auto globalOp = rewriter.create<LLVM::GlobalOp>(
gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
LLVM::Linkage::Internal, name, /*value=*/Attribute(),
gpu::GPUDialect::getWorkgroupAddressSpace());
workgroupBuffers.push_back(globalOp);
}
// Rewrite the original GPU function to an LLVM function.
auto funcType = typeConverter.convertType(gpuFuncOp.getType())
.template cast<LLVM::LLVMType>()
.getPointerElementTy();
// Remap proper input types.
TypeConverter::SignatureConversion signatureConversion(
gpuFuncOp.front().getNumArguments());
typeConverter.convertFunctionSignature(
gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);
// Create the new function operation. Only copy those attributes that are
// not specific to function modeling.
SmallVector<NamedAttribute, 4> attributes;
for (const auto &attr : gpuFuncOp.getAttrs()) {
if (attr.first == SymbolTable::getSymbolAttrName() ||
attr.first == impl::getTypeAttrName() ||
attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
continue;
attributes.push_back(attr);
}
auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
LLVM::Linkage::External, attributes);
{
// Insert operations that correspond to converted workgroup and private
// memory attributions to the body of the function. This must operate on
// the original function, before the body region is inlined in the new
// function to maintain the relation between block arguments and the
// parent operation that assigns their semantics.
OpBuilder::InsertionGuard guard(rewriter);
// Rewrite workgroup memory attributions to addresses of global buffers.
rewriter.setInsertionPointToStart(&gpuFuncOp.front());
unsigned numProperArguments = gpuFuncOp.getNumArguments();
auto i32Type = LLVM::LLVMType::getInt32Ty(typeConverter.getDialect());
// The zero constant is only used as the GEP index below, so it is created
// only when there is at least one workgroup buffer.
Value zero = nullptr;
if (!workgroupBuffers.empty())
zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
rewriter.getI32IntegerAttr(0));
for (auto en : llvm::enumerate(workgroupBuffers)) {
LLVM::GlobalOp global = en.value();
Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
auto elementType = global.getType().getArrayElementType();
Value memory = rewriter.create<LLVM::GEPOp>(
loc, elementType.getPointerTo(global.addr_space().getZExtValue()),
address, ArrayRef<Value>{zero, zero});
// Build a memref descriptor pointing to the buffer to plug with the
// existing memref infrastructure. This may use more registers than
// otherwise necessary given that memref sizes are fixed, but we can try
// and canonicalize that away later.
Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
auto type = attribution.getType().cast<MemRefType>();
auto descr = MemRefDescriptor::fromStaticShape(
rewriter, loc, typeConverter, type, memory);
// Workgroup attributions follow the proper function arguments in the
// entry block argument list.
signatureConversion.remapInput(numProperArguments + en.index(), descr);
}
// Rewrite private memory attributions to alloca'ed buffers.
unsigned numWorkgroupAttributions =
gpuFuncOp.getNumWorkgroupAttributions();
auto int64Ty = LLVM::LLVMType::getInt64Ty(typeConverter.getDialect());
for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
Value attribution = en.value();
auto type = attribution.getType().cast<MemRefType>();
assert(type && type.hasStaticShape() &&
"unexpected type in attribution");
// The element pointer type of the allocation uses `AllocaAddrSpace`,
// chosen by whoever instantiates this pattern, rather than the memory
// space carried by the attribution's memref type.
auto ptrType = typeConverter.convertType(type.getElementType())
.template cast<LLVM::LLVMType>()
.getPointerTo(AllocaAddrSpace);
Value numElements = rewriter.create<LLVM::ConstantOp>(
gpuFuncOp.getLoc(), int64Ty,
rewriter.getI64IntegerAttr(type.getNumElements()));
Value allocated = rewriter.create<LLVM::AllocaOp>(
gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
auto descr = MemRefDescriptor::fromStaticShape(
rewriter, loc, typeConverter, type, allocated);
// Private attributions come after both the proper arguments and the
// workgroup attributions.
signatureConversion.remapInput(
numProperArguments + numWorkgroupAttributions + en.index(), descr);
}
}
// Move the region to the new function, update the entry block signature.
rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
llvmFuncOp.end());
rewriter.applySignatureConversion(&llvmFuncOp.getBody(),
signatureConversion);
rewriter.eraseOp(gpuFuncOp);
return success();
}
};
/// Lowers `gpu.return` to `llvm.return`, forwarding the already-converted
/// operands unchanged.
struct GPUReturnOpLowering : public ConvertToLLVMPattern {
  /// `explicit` so that a type converter can never be implicitly converted
  /// into a pattern object.
  explicit GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
      : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(),
                             typeConverter.getDialect()->getContext(),
                             typeConverter) {}

  /// Replaces `op` with an `llvm.return` carrying `operands`; always succeeds.
  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands);
    return success();
  }
};
} // namespace mlir
#endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_

View File

@ -21,6 +21,7 @@
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/FormatVariadic.h"
#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include "../PassDetail.h"
@ -88,155 +89,6 @@ struct GPUShuffleOpLowering : public ConvertToLLVMPattern {
}
};
/// Lowers `gpu.func` to `llvm.func`.
///
/// Workgroup memory attributions become internal `llvm.mlir.global` arrays in
/// the GPU dialect's workgroup address space; private memory attributions
/// become `llvm.alloca`s with a default-address-space pointer type. Both are
/// wrapped in static-shape memref descriptors that are remapped onto the
/// corresponding entry block arguments.
struct GPUFuncOpLowering : ConvertToLLVMPattern {
explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter)
: ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(),
typeConverter.getDialect()->getContext(),
typeConverter) {}
LogicalResult
matchAndRewrite(Operation *op, ArrayRef<Value> operands,
ConversionPatternRewriter &rewriter) const override {
assert(operands.empty() && "func op is not expected to have operands");
auto gpuFuncOp = cast<gpu::GPUFuncOp>(op);
Location loc = gpuFuncOp.getLoc();
// Create one global array per workgroup attribution. Only statically shaped
// memrefs are expected here; the array holds all elements of the memref.
SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
Value attribution = en.value();
auto type = attribution.getType().dyn_cast<MemRefType>();
assert(type && type.hasStaticShape() && "unexpected type in attribution");
uint64_t numElements = type.getNumElements();
auto elementType = typeConverter.convertType(type.getElementType())
.cast<LLVM::LLVMType>();
auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
// The global's name is built from the function name and attribution index.
std::string name = std::string(
llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
auto globalOp = rewriter.create<LLVM::GlobalOp>(
gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
LLVM::Linkage::Internal, name, /*value=*/Attribute(),
gpu::GPUDialect::getWorkgroupAddressSpace());
workgroupBuffers.push_back(globalOp);
}
// Rewrite the original GPU function to an LLVM function.
auto funcType = typeConverter.convertType(gpuFuncOp.getType())
.cast<LLVM::LLVMType>()
.getPointerElementTy();
// Remap proper input types.
TypeConverter::SignatureConversion signatureConversion(
gpuFuncOp.front().getNumArguments());
typeConverter.convertFunctionSignature(
gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);
// Create the new function operation. Only copy those attributes that are
// not specific to function modeling.
SmallVector<NamedAttribute, 4> attributes;
for (const auto &attr : gpuFuncOp.getAttrs()) {
if (attr.first == SymbolTable::getSymbolAttrName() ||
attr.first == impl::getTypeAttrName() ||
attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
continue;
attributes.push_back(attr);
}
auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
LLVM::Linkage::External, attributes);
{
// Insert operations that correspond to converted workgroup and private
// memory attributions to the body of the function. This must operate on
// the original function, before the body region is inlined in the new
// function to maintain the relation between block arguments and the
// parent operation that assigns their semantics.
OpBuilder::InsertionGuard guard(rewriter);
// Rewrite workgroup memory attributions to addresses of global buffers.
rewriter.setInsertionPointToStart(&gpuFuncOp.front());
unsigned numProperArguments = gpuFuncOp.getNumArguments();
auto i32Type = LLVM::LLVMType::getInt32Ty(typeConverter.getDialect());
// The zero constant is only used as the GEP index below, so it is created
// only when there is at least one workgroup buffer.
Value zero = nullptr;
if (!workgroupBuffers.empty())
zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
rewriter.getI32IntegerAttr(0));
for (auto en : llvm::enumerate(workgroupBuffers)) {
LLVM::GlobalOp global = en.value();
Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
auto elementType = global.getType().getArrayElementType();
Value memory = rewriter.create<LLVM::GEPOp>(
loc, elementType.getPointerTo(global.addr_space().getZExtValue()),
address, ArrayRef<Value>{zero, zero});
// Build a memref descriptor pointing to the buffer to plug with the
// existing memref infrastructure. This may use more registers than
// otherwise necessary given that memref sizes are fixed, but we can try
// and canonicalize that away later.
Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
auto type = attribution.getType().cast<MemRefType>();
auto descr = MemRefDescriptor::fromStaticShape(
rewriter, loc, typeConverter, type, memory);
// Workgroup attributions follow the proper function arguments in the
// entry block argument list.
signatureConversion.remapInput(numProperArguments + en.index(), descr);
}
// Rewrite private memory attributions to alloca'ed buffers.
unsigned numWorkgroupAttributions =
gpuFuncOp.getNumWorkgroupAttributions();
auto int64Ty = LLVM::LLVMType::getInt64Ty(typeConverter.getDialect());
for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
Value attribution = en.value();
auto type = attribution.getType().cast<MemRefType>();
assert(type && type.hasStaticShape() &&
"unexpected type in attribution");
// Explicitly drop memory space when lowering private memory
// attributions since NVVM models it as `alloca`s in the default
// memory space and does not support `alloca`s with addrspace(5).
auto ptrType = typeConverter.convertType(type.getElementType())
.cast<LLVM::LLVMType>()
.getPointerTo();
Value numElements = rewriter.create<LLVM::ConstantOp>(
gpuFuncOp.getLoc(), int64Ty,
rewriter.getI64IntegerAttr(type.getNumElements()));
Value allocated = rewriter.create<LLVM::AllocaOp>(
gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
auto descr = MemRefDescriptor::fromStaticShape(
rewriter, loc, typeConverter, type, allocated);
// Private attributions come after both the proper arguments and the
// workgroup attributions.
signatureConversion.remapInput(
numProperArguments + numWorkgroupAttributions + en.index(), descr);
}
}
// Move the region to the new function, update the entry block signature.
rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
llvmFuncOp.end());
rewriter.applySignatureConversion(&llvmFuncOp.getBody(),
signatureConversion);
rewriter.eraseOp(gpuFuncOp);
return success();
}
};
/// Lowers `gpu.return` to `llvm.return`, forwarding the already-converted
/// operands unchanged.
struct GPUReturnOpLowering : public ConvertToLLVMPattern {
  /// `explicit` so that a type converter can never be implicitly converted
  /// into a pattern object.
  explicit GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
      : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(),
                             typeConverter.getDialect()->getContext(),
                             typeConverter) {}

  /// Replaces `op` with an `llvm.return` carrying `operands`; always succeeds.
  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands);
    return success();
  }
};
/// Import the GPU Ops to NVVM Patterns.
#include "GPUToNVVM.cpp.inc"
@ -300,8 +152,11 @@ void mlir::populateGpuToNVVMConversionPatterns(
NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
NVVM::GridDimYOp, NVVM::GridDimZOp>,
GPUShuffleOpLowering, GPUFuncOpLowering, GPUReturnOpLowering>(
converter);
GPUShuffleOpLowering, GPUReturnOpLowering,
// Explicitly drop memory space when lowering private memory
// attributions since NVVM models it as `alloca`s in the default
// memory space and does not support `alloca`s with addrspace(5).
GPUFuncOpLowering<0>>(converter);
patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
"__nv_fabs");
patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__nv_ceilf",

View File

@ -14,11 +14,16 @@
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"
#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/FormatVariadic.h"
#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include "../PassDetail.h"
@ -38,41 +43,25 @@ public:
// Pass entry point; the operation being processed is a gpu.module.
// NOTE(review): `patterns` and `target` are each declared twice in this span,
// and the ROCDL patterns are registered both inline and through
// populateGpuToROCDLConversionPatterns. This looks like removed and added diff
// lines shown together -- confirm which statements are live.
void runOnOperation() override {
gpu::GPUModuleOp m = getOperation();
OwningRewritePatternList patterns;
LLVMTypeConverter converter(m.getContext());
populateStdToLLVMConversionPatterns(converter, patterns);
// Index and dimension queries, each given one ROCDL op per x/y/z component.
patterns.insert<
GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
converter);
// Math ops go through OpToFuncCallLowering; judging by the suffixes, the two
// names are presumably the f32 and f64 functions -- verify against that class.
patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__ocml_fabs_f32",
"__ocml_fabs_f64");
patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__ocml_ceil_f32",
"__ocml_ceil_f64");
patterns.insert<OpToFuncCallLowering<CosOp>>(converter, "__ocml_cos_f32",
"__ocml_cos_f64");
patterns.insert<OpToFuncCallLowering<ExpOp>>(converter, "__ocml_exp_f32",
"__ocml_exp_f64");
patterns.insert<OpToFuncCallLowering<LogOp>>(converter, "__ocml_log_f32",
"__ocml_log_f64");
patterns.insert<OpToFuncCallLowering<Log10Op>>(
converter, "__ocml_log10_f32", "__ocml_log10_f64");
patterns.insert<OpToFuncCallLowering<Log2Op>>(converter, "__ocml_log2_f32",
"__ocml_log2_f64");
patterns.insert<OpToFuncCallLowering<TanhOp>>(converter, "__ocml_tanh_f32",
"__ocml_tanh_f64");
ConversionTarget target(getContext());
target.addLegalDialect<LLVM::LLVMDialect, ROCDL::ROCDLDialect>();
OwningRewritePatternList patterns;
// Apply the GPU rewrite patterns greedily first, then clear the list so it
// can be reused for the conversion patterns below.
populateGpuRewritePatterns(m.getContext(), patterns);
applyPatternsAndFoldGreedily(m, patterns);
patterns.clear();
populateVectorToLLVMConversionPatterns(converter, patterns);
populateStdToLLVMConversionPatterns(converter, patterns);
populateGpuToROCDLConversionPatterns(converter, patterns);
// GPU dialect ops, FuncOp and the listed LLVM math ops are marked illegal;
// ROCDL ops are legal.
LLVMConversionTarget target(getContext());
target.addIllegalDialect<gpu::GPUDialect>();
target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::FAbsOp, LLVM::FCeilOp,
LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op>();
target.addIllegalOp<FuncOp>();
target.addLegalDialect<ROCDL::ROCDLDialect>();
// TODO(whchung): Remove once we support replacing non-root ops.
target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
// Any op left illegal after partial conversion fails the pass.
if (failed(applyPartialConversion(m, target, patterns, &converter)))
signalPassFailure();
}
@ -80,6 +69,36 @@ public:
} // anonymous namespace
/// Appends to `patterns` the lowerings that take GPU dialect ops, and the
/// standard math ops handled through OpToFuncCallLowering, to ROCDL. All
/// patterns are constructed with `converter`.
void mlir::populateGpuToROCDLConversionPatterns(
    LLVMTypeConverter &converter, OwningRewritePatternList &patterns) {
  // Index and dimension queries: one ROCDL op per x/y/z component.
  using ThreadIdLowering =
      GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                  ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>;
  using BlockDimLowering =
      GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                  ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>;
  using BlockIdLowering =
      GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
                                  ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>;
  using GridDimLowering =
      GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
                                  ROCDL::GridDimYOp, ROCDL::GridDimZOp>;
  // Function lowering instantiated with 5 as its template argument.
  using FuncLowering = GPUFuncOpLowering<5>;

  patterns.insert<ThreadIdLowering, BlockDimLowering, BlockIdLowering,
                  GridDimLowering, FuncLowering, GPUReturnOpLowering>(
      converter);

  // Math ops: each pattern receives an `_f32` and an `_f64` function name.
  patterns.insert<OpToFuncCallLowering<AbsFOp>>(
      converter, "__ocml_fabs_f32", "__ocml_fabs_f64");
  patterns.insert<OpToFuncCallLowering<CeilFOp>>(
      converter, "__ocml_ceil_f32", "__ocml_ceil_f64");
  patterns.insert<OpToFuncCallLowering<CosOp>>(
      converter, "__ocml_cos_f32", "__ocml_cos_f64");
  patterns.insert<OpToFuncCallLowering<ExpOp>>(
      converter, "__ocml_exp_f32", "__ocml_exp_f64");
  patterns.insert<OpToFuncCallLowering<LogOp>>(
      converter, "__ocml_log_f32", "__ocml_log_f64");
  patterns.insert<OpToFuncCallLowering<Log10Op>>(
      converter, "__ocml_log10_f32", "__ocml_log10_f64");
  patterns.insert<OpToFuncCallLowering<Log2Op>>(
      converter, "__ocml_log2_f32", "__ocml_log2_f64");
  patterns.insert<OpToFuncCallLowering<TanhOp>>(
      converter, "__ocml_tanh_f32", "__ocml_tanh_f64");
}
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass() {
return std::make_unique<LowerGpuOpsToROCDLOpsPass>();

View File

@ -0,0 +1,231 @@
// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s
// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s
// Test: a private attribution is lowered to an alloca wrapped in a memref
// descriptor. The two conversions are expected to differ only in the address
// space of the allocated pointer (default for one, addrspace(5) for the other).
gpu.module @kernel {
// NVVM-LABEL: llvm.func @private
// ROCDL-LABEL: llvm.func @private
gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) {
// Allocate private memory inside the function.
// NVVM: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64
// NVVM: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float*">
// ROCDL: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64
// ROCDL: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*">
// Populate the memref descriptor.
// NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
// NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
// NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
// NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
// NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
// ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(5)*, float addrspace(5)*, i64, [1 x i64], [1 x i64] }">
// ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
// ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
// ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
// ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
// "Store" lowering should work just as any other memref, only check that
// we emit some core instructions.
// NOTE(review): the extractvalue checks below re-bind descr6 with a `:.*`
// pattern instead of referencing the value captured above, so they accept any
// operand -- confirm whether a plain reference was intended.
// NVVM: llvm.extractvalue %[[descr6:.*]]
// NVVM: llvm.getelementptr
// NVVM: llvm.store
// ROCDL: llvm.extractvalue %[[descr6:.*]]
// ROCDL: llvm.getelementptr
// ROCDL: llvm.store
%c0 = constant 0 : index
store %arg0, %arg1[%c0] : memref<4xf32, 5>
"terminator"() : () -> ()
}
}
// -----
// Test: a 1-D workgroup attribution becomes an internal global in address
// space 3; identical output is expected from both conversions.
gpu.module @kernel {
// Workgroup buffers are allocated as globals.
// NVVM: llvm.mlir.global internal @[[buffer:.*]]()
// NVVM-SAME: addr_space = 3
// NVVM-SAME: !llvm<"[4 x float]">
// ROCDL: llvm.mlir.global internal @[[buffer:.*]]()
// ROCDL-SAME: addr_space = 3
// ROCDL-SAME: !llvm<"[4 x float]">
// NVVM-LABEL: llvm.func @workgroup
// NVVM-SAME: {
// ROCDL-LABEL: llvm.func @workgroup
// ROCDL-SAME: {
gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, 3>) {
// Get the address of the first element in the global array.
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
// NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*">
// NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
// NVVM-SAME: !llvm<"float addrspace(3)*">
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
// ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*">
// ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
// ROCDL-SAME: !llvm<"float addrspace(3)*">
// Populate the memref descriptor.
// NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }">
// NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
// NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
// NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
// NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
// ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }">
// ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
// ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
// ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
// ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
// "Store" lowering should work just as any other memref, only check that
// we emit some core instructions.
// NVVM: llvm.extractvalue %[[descr6:.*]]
// NVVM: llvm.getelementptr
// NVVM: llvm.store
// ROCDL: llvm.extractvalue %[[descr6:.*]]
// ROCDL: llvm.getelementptr
// ROCDL: llvm.store
%c0 = constant 0 : index
store %arg0, %arg1[%c0] : memref<4xf32, 3>
"terminator"() : () -> ()
}
}
// -----
// Test: a 3-D workgroup attribution (4x2x6) is backed by a single flat
// 48-element global. The descriptor is expected to receive 4, 2, 6 at
// position 3 (matching the shape) and 12, 6, 1 at position 4.
gpu.module @kernel {
// Check that the total size was computed correctly.
// NVVM: llvm.mlir.global internal @[[buffer:.*]]()
// NVVM-SAME: addr_space = 3
// NVVM-SAME: !llvm<"[48 x float]">
// ROCDL: llvm.mlir.global internal @[[buffer:.*]]()
// ROCDL-SAME: addr_space = 3
// ROCDL-SAME: !llvm<"[48 x float]">
// NVVM-LABEL: llvm.func @workgroup3d
// ROCDL-LABEL: llvm.func @workgroup3d
gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, 3>) {
// Get the address of the first element in the global array.
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
// NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*">
// NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
// NVVM-SAME: !llvm<"float addrspace(3)*">
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
// ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*">
// ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
// ROCDL-SAME: !llvm<"float addrspace(3)*">
// Populate the memref descriptor.
// NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }">
// NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
// NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
// NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// NVVM: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64
// NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
// NVVM: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64
// NVVM: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
// NVVM: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
// NVVM: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
// NVVM: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
// NVVM: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
// NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
// NVVM: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
// ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }">
// ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
// ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
// ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// ROCDL: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64
// ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
// ROCDL: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64
// ROCDL: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
// ROCDL: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
// ROCDL: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
// ROCDL: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
// ROCDL: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
// ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
// ROCDL: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
%c0 = constant 0 : index
store %arg0, %arg1[%c0,%c0,%c0] : memref<4x2x6xf32, 3>
"terminator"() : () -> ()
}
}
// -----
// Test: a function with two workgroup and two private attributions gets one
// global per workgroup buffer and one alloca per private buffer.
gpu.module @kernel {
// Check that several buffers are defined.
// NVVM: llvm.mlir.global internal @[[buffer1:.*]]()
// NVVM-SAME: !llvm<"[1 x float]">
// NVVM: llvm.mlir.global internal @[[buffer2:.*]]()
// NVVM-SAME: !llvm<"[2 x float]">
// ROCDL: llvm.mlir.global internal @[[buffer1:.*]]()
// ROCDL-SAME: !llvm<"[1 x float]">
// ROCDL: llvm.mlir.global internal @[[buffer2:.*]]()
// ROCDL-SAME: !llvm<"[2 x float]">
// NVVM-LABEL: llvm.func @multiple
// ROCDL-LABEL: llvm.func @multiple
gpu.func @multiple(%arg0: f32)
workgroup(%arg1: memref<1xf32, 3>, %arg2: memref<2xf32, 3>)
private(%arg3: memref<3xf32, 5>, %arg4: memref<4xf32, 5>) {
// Workgroup buffers.
// NVVM: llvm.mlir.addressof @[[buffer1]]
// NVVM: llvm.mlir.addressof @[[buffer2]]
// ROCDL: llvm.mlir.addressof @[[buffer1]]
// ROCDL: llvm.mlir.addressof @[[buffer2]]
// Private buffers.
// The expected alloca result type is a default-address-space pointer for one
// conversion and an addrspace(5) pointer for the other.
// NVVM: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
// NVVM: llvm.alloca %[[c3]] x !llvm.float : (!llvm.i64) -> !llvm<"float*">
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
// NVVM: llvm.alloca %[[c4]] x !llvm.float : (!llvm.i64) -> !llvm<"float*">
// ROCDL: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
// ROCDL: llvm.alloca %[[c3]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*">
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
// ROCDL: llvm.alloca %[[c4]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*">
%c0 = constant 0 : index
store %arg0, %arg1[%c0] : memref<1xf32, 3>
store %arg0, %arg2[%c0] : memref<2xf32, 3>
store %arg0, %arg3[%c0] : memref<3xf32, 5>
store %arg0, %arg4[%c0] : memref<4xf32, 5>
"terminator"() : () -> ()
}
}

View File

@ -1,145 +0,0 @@
// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s
// Test: a private attribution is lowered to an alloca with a
// default-address-space pointer, wrapped in a memref descriptor.
gpu.module @kernel {
// CHECK-LABEL: llvm.func @private
gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) {
// Allocate private memory inside the function.
// CHECK: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64
// CHECK: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float*">
// Populate the memref descriptor.
// CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
// CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
// CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
// CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
// CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
// "Store" lowering should work just as any other memref, only check that
// we emit some core instructions.
// CHECK: llvm.extractvalue %[[descr6:.*]]
// CHECK: llvm.getelementptr
// CHECK: llvm.store
%c0 = constant 0 : index
store %arg0, %arg1[%c0] : memref<4xf32, 5>
"terminator"() : () -> ()
}
}
// -----
// Test: a 1-D workgroup attribution becomes an internal global in address
// space 3, and the function body takes the address of its first element.
gpu.module @kernel {
// Workgroup buffers are allocated as globals.
// CHECK: llvm.mlir.global internal @[[buffer:.*]]()
// CHECK-SAME: addr_space = 3
// CHECK-SAME: !llvm<"[4 x float]">
// CHECK-LABEL: llvm.func @workgroup
// CHECK-SAME: {
gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, 3>) {
// Get the address of the first element in the global array.
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
// CHECK: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*">
// CHECK: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
// CHECK-SAME: !llvm<"float addrspace(3)*">
// Populate the memref descriptor.
// CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }">
// CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
// CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
// CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
// CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
// "Store" lowering should work just as any other memref, only check that
// we emit some core instructions.
// CHECK: llvm.extractvalue %[[descr6:.*]]
// CHECK: llvm.getelementptr
// CHECK: llvm.store
%c0 = constant 0 : index
store %arg0, %arg1[%c0] : memref<4xf32, 3>
"terminator"() : () -> ()
}
}
// -----
// Test case: a gpu.func with a 3-D workgroup attribution
// (memref<4x2x6xf32, 3>). The directives below expect the backing global to be
// a flat [48 x float] array (4 * 2 * 6 = 48) in address space 3, and a rank-3
// memref descriptor built on top of it.
gpu.module @kernel {
// Check that the total size was computed correctly.
// CHECK: llvm.mlir.global internal @[[buffer:.*]]()
// CHECK-SAME: addr_space = 3
// CHECK-SAME: !llvm<"[48 x float]">
// CHECK-LABEL: llvm.func @workgroup3d
gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, 3>) {
// Get the address of the first element in the global array.
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
// CHECK: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*">
// CHECK: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
// CHECK-SAME: !llvm<"float addrspace(3)*">
// Populate the memref descriptor.
// The values inserted at [3, 0..2] are 4, 2, 6, matching the memref shape;
// the values inserted at [4, 0..2] are 12, 6, 1, which equal the row-major
// strides of that shape. The insertions are expected interleaved per
// dimension ([3, i] then [4, i]). `c0` and `c6` are each bound twice below;
// a later binding replaces the earlier one.
// CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }">
// CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
// CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
// CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
// CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
// CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
// CHECK: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64
// CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
// CHECK: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64
// CHECK: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
// CHECK: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
// CHECK: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
// CHECK: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
// CHECK: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
// CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
// CHECK: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
%c0 = constant 0 : index
store %arg0, %arg1[%c0,%c0,%c0] : memref<4x2x6xf32, 3>
"terminator"() : () -> ()
}
}
// -----
// Test case: a gpu.func that combines two workgroup attributions
// (memref<1xf32, 3>, memref<2xf32, 3>) with two private attributions
// (memref<3xf32, 5>, memref<4xf32, 5>). The directives below expect one
// internal global per workgroup buffer and one llvm.alloca per private buffer,
// in declaration order.
gpu.module @kernel {
// Check that several buffers are defined.
// CHECK: llvm.mlir.global internal @[[buffer1:.*]]()
// CHECK-SAME: !llvm<"[1 x float]">
// CHECK: llvm.mlir.global internal @[[buffer2:.*]]()
// CHECK-SAME: !llvm<"[2 x float]">
// CHECK-LABEL: llvm.func @multiple
gpu.func @multiple(%arg0: f32)
workgroup(%arg1: memref<1xf32, 3>, %arg2: memref<2xf32, 3>)
private(%arg3: memref<3xf32, 5>, %arg4: memref<4xf32, 5>) {
// Workgroup buffers.
// CHECK: llvm.mlir.addressof @[[buffer1]]
// CHECK: llvm.mlir.addressof @[[buffer2]]
// Private buffers.
// Only the element count and element type of each alloca are matched here;
// the result pointer type is left unconstrained.
// CHECK: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
// CHECK: llvm.alloca %[[c3]] x !llvm.float
// CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
// CHECK: llvm.alloca %[[c4]] x !llvm.float
%c0 = constant 0 : index
store %arg0, %arg1[%c0] : memref<1xf32, 3>
store %arg0, %arg2[%c0] : memref<2xf32, 3>
store %arg0, %arg3[%c0] : memref<3xf32, 5>
store %arg0, %arg4[%c0] : memref<4xf32, 5>
"terminator"() : () -> ()
}
}

View File

@ -1,9 +1,10 @@
// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s --dump-input-on-failure
gpu.module @kernel_module {
gpu.module @test_module {
// CHECK-LABEL: func @gpu_index_ops()
func @gpu_index_ops()
attributes { gpu.kernel } {
-> (index, index, index, index, index, index,
index, index, index, index, index, index) {
// CHECK: rocdl.workitem.id.x : !llvm.i32
%tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
// CHECK: rocdl.workitem.id.y : !llvm.i32
@ -32,68 +33,71 @@ gpu.module @kernel_module {
// CHECK: rocdl.grid.dim.z : !llvm.i32
%gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
std.return
std.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
%bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ
: index, index, index, index, index, index,
index, index, index, index, index, index
}
}
// -----
gpu.module @kernel_module {
gpu.module @test_module {
// CHECK: llvm.func @__ocml_fabs_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_fabs_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_fabs
func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) {
func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.absf %arg_f32 : f32
// CHECK: llvm.call @__ocml_fabs_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.absf %arg_f64 : f64
// CHECK: llvm.call @__ocml_fabs_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
std.return
std.return %result32, %result64 : f32, f64
}
}
// -----
gpu.module @kernel_module {
gpu.module @test_module {
// CHECK: llvm.func @__ocml_ceil_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_ceil_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_ceil
func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) {
func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.ceilf %arg_f32 : f32
// CHECK: llvm.call @__ocml_ceil_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.ceilf %arg_f64 : f64
// CHECK: llvm.call @__ocml_ceil_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
std.return
std.return %result32, %result64 : f32, f64
}
}
// -----
gpu.module @kernel_module {
gpu.module @test_module {
// CHECK: llvm.func @__ocml_cos_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_cos_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_cos
func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) {
func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.cos %arg_f32 : f32
// CHECK: llvm.call @__ocml_cos_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.cos %arg_f64 : f64
// CHECK: llvm.call @__ocml_cos_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
std.return
std.return %result32, %result64 : f32, f64
}
}
// -----
gpu.module @kernel_module {
gpu.module @test_module {
// CHECK: llvm.func @__ocml_exp_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_exp_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_exp
func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) {
func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%exp_f32 = std.exp %arg_f32 : f32
// CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result_f32 = std.exp %exp_f32 : f32
%result32 = std.exp %exp_f32 : f32
// CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.exp %arg_f64 : f64
// CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
std.return
std.return %result32, %result64 : f32, f64
}
}
@ -101,20 +105,20 @@ gpu.module @kernel_module {
// -----
// Test that we properly handle an operation with a SymbolTable other than the module op.
gpu.module @kernel_module {
gpu.module @test_module {
"test.symbol_scope"() ({
// CHECK: test.symbol_scope
// CHECK: llvm.func @__ocml_exp_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_exp_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_exp
func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) {
func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%exp_f32 = std.exp %arg_f32 : f32
// CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result_f32 = std.exp %exp_f32 : f32
%result32 = std.exp %exp_f32 : f32
// CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.exp %arg_f64 : f64
// CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
std.return
std.return %result32, %result64 : f32, f64
}
"test.finish" () : () -> ()
}) : () -> ()
@ -122,60 +126,60 @@ gpu.module @kernel_module {
// -----
gpu.module @kernel_module {
gpu.module @test_module {
// CHECK: llvm.func @__ocml_log_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_log_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_log
func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) {
func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.log %arg_f32 : f32
// CHECK: llvm.call @__ocml_log_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.log %arg_f64 : f64
// CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
std.return
std.return %result32, %result64 : f32, f64
}
}
// -----
gpu.module @kernel_module {
gpu.module @test_module {
// CHECK: llvm.func @__ocml_log10_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_log10_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_log10
func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) {
func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.log10 %arg_f32 : f32
// CHECK: llvm.call @__ocml_log10_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.log10 %arg_f64 : f64
// CHECK: llvm.call @__ocml_log10_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
std.return
std.return %result32, %result64 : f32, f64
}
}
// -----
gpu.module @kernel_module {
gpu.module @test_module {
// CHECK: llvm.func @__ocml_log2_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_log2_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_log2
func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) {
func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.log2 %arg_f32 : f32
// CHECK: llvm.call @__ocml_log2_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.log2 %arg_f64 : f64
// CHECK: llvm.call @__ocml_log2_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
std.return
std.return %result32, %result64 : f32, f64
}
}
// -----
gpu.module @kernel_module {
gpu.module @test_module {
// CHECK: llvm.func @__ocml_tanh_f32(!llvm.float) -> !llvm.float
// CHECK: llvm.func @__ocml_tanh_f64(!llvm.double) -> !llvm.double
// CHECK-LABEL: func @gpu_tanh
func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) {
func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
%result32 = std.tanh %arg_f32 : f32
// CHECK: llvm.call @__ocml_tanh_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
%result64 = std.tanh %arg_f64 : f64
// CHECK: llvm.call @__ocml_tanh_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
std.return
std.return %result32, %result64 : f32, f64
}
}