[mlir][nvvm][rocdl] refactor NVVM and ROCDL dialect. NFC.
- Extract common logic between -convert-gpu-to-nvvm and -convert-gpu-to-rocdl.
- Cope with the fact that alloca operates on different address spaces in NVVM and ROCDL.
- Modernize unit tests for the ROCDL dialect.

Differential Revision: https://reviews.llvm.org/D79021
commit 9ad5e57316 (parent 291d24838f)
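The heart of the change is visible in the hunks below: the GPUFuncOp / ReturnOp lowering moves out of the NVVM pass into a shared GPUCommon header and becomes a class template over the address space used for `llvm.alloca`. A minimal sketch of how the two backends register the shared pattern after this patch (names follow the code in this diff; the surrounding pass boilerplate is elided):

    #include "../GPUCommon/GPUOpsLowering.h"

    // NVVM: private memory attributions are alloca'ed in the default
    // address space (0), since NVVM does not support addrspace(5) allocas.
    patterns.insert<GPUFuncOpLowering<0>, GPUReturnOpLowering>(converter);

    // ROCDL: private memory buffers live in AMDGPU address space 5.
    patterns.insert<GPUFuncOpLowering<5>, GPUReturnOpLowering>(converter);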
@@ -11,11 +11,19 @@
#include <memory>

namespace mlir {
class LLVMTypeConverter;
class OwningRewritePatternList;

template <typename OpT>
class OperationPass;

namespace gpu {
class GPUModuleOp;
} // namespace gpu
template <typename OpT> class OperationPass;

/// Collect a set of patterns to convert from the GPU dialect to ROCDL.
void populateGpuToROCDLConversionPatterns(LLVMTypeConverter &converter,
                                          OwningRewritePatternList &patterns);

/// Creates a pass that lowers GPU dialect operations to ROCDL counterparts.
std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
@@ -0,0 +1,171 @@
//===- GPUOpsLowering.h - GPU FuncOp / ReturnOp lowering -------*- C++ -*--===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#ifndef MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
#define MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_

#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/Builders.h"

namespace mlir {

template <unsigned AllocaAddrSpace>
struct GPUFuncOpLowering : ConvertToLLVMPattern {
  explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter)
      : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(),
                             typeConverter.getDialect()->getContext(),
                             typeConverter) {}

  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    assert(operands.empty() && "func op is not expected to have operands");
    auto gpuFuncOp = cast<gpu::GPUFuncOp>(op);
    Location loc = gpuFuncOp.getLoc();

    SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
    for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
      Value attribution = en.value();

      auto type = attribution.getType().dyn_cast<MemRefType>();
      assert(type && type.hasStaticShape() && "unexpected type in attribution");

      uint64_t numElements = type.getNumElements();

      auto elementType = typeConverter.convertType(type.getElementType())
                             .template cast<LLVM::LLVMType>();
      auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
      std::string name = std::string(
          llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
      auto globalOp = rewriter.create<LLVM::GlobalOp>(
          gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
          LLVM::Linkage::Internal, name, /*value=*/Attribute(),
          gpu::GPUDialect::getWorkgroupAddressSpace());
      workgroupBuffers.push_back(globalOp);
    }

    // Rewrite the original GPU function to an LLVM function.
    auto funcType = typeConverter.convertType(gpuFuncOp.getType())
                        .template cast<LLVM::LLVMType>()
                        .getPointerElementTy();

    // Remap proper input types.
    TypeConverter::SignatureConversion signatureConversion(
        gpuFuncOp.front().getNumArguments());
    typeConverter.convertFunctionSignature(
        gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);

    // Create the new function operation. Only copy those attributes that are
    // not specific to function modeling.
    SmallVector<NamedAttribute, 4> attributes;
    for (const auto &attr : gpuFuncOp.getAttrs()) {
      if (attr.first == SymbolTable::getSymbolAttrName() ||
          attr.first == impl::getTypeAttrName() ||
          attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
        continue;
      attributes.push_back(attr);
    }
    auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
        gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
        LLVM::Linkage::External, attributes);

    {
      // Insert operations that correspond to converted workgroup and private
      // memory attributions to the body of the function. This must operate on
      // the original function, before the body region is inlined in the new
      // function to maintain the relation between block arguments and the
      // parent operation that assigns their semantics.
      OpBuilder::InsertionGuard guard(rewriter);

      // Rewrite workgroup memory attributions to addresses of global buffers.
      rewriter.setInsertionPointToStart(&gpuFuncOp.front());
      unsigned numProperArguments = gpuFuncOp.getNumArguments();
      auto i32Type = LLVM::LLVMType::getInt32Ty(typeConverter.getDialect());

      Value zero = nullptr;
      if (!workgroupBuffers.empty())
        zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
                                                 rewriter.getI32IntegerAttr(0));
      for (auto en : llvm::enumerate(workgroupBuffers)) {
        LLVM::GlobalOp global = en.value();
        Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
        auto elementType = global.getType().getArrayElementType();
        Value memory = rewriter.create<LLVM::GEPOp>(
            loc, elementType.getPointerTo(global.addr_space().getZExtValue()),
            address, ArrayRef<Value>{zero, zero});

        // Build a memref descriptor pointing to the buffer to plug with the
        // existing memref infrastructure. This may use more registers than
        // otherwise necessary given that memref sizes are fixed, but we can try
        // and canonicalize that away later.
        Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
        auto type = attribution.getType().cast<MemRefType>();
        auto descr = MemRefDescriptor::fromStaticShape(
            rewriter, loc, typeConverter, type, memory);
        signatureConversion.remapInput(numProperArguments + en.index(), descr);
      }

      // Rewrite private memory attributions to alloca'ed buffers.
      unsigned numWorkgroupAttributions =
          gpuFuncOp.getNumWorkgroupAttributions();
      auto int64Ty = LLVM::LLVMType::getInt64Ty(typeConverter.getDialect());
      for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
        Value attribution = en.value();
        auto type = attribution.getType().cast<MemRefType>();
        assert(type && type.hasStaticShape() &&
               "unexpected type in attribution");

        // Explicitly drop memory space when lowering private memory
        // attributions since NVVM models it as `alloca`s in the default
        // memory space and does not support `alloca`s with addrspace(5).
        auto ptrType = typeConverter.convertType(type.getElementType())
                           .template cast<LLVM::LLVMType>()
                           .getPointerTo(AllocaAddrSpace);
        Value numElements = rewriter.create<LLVM::ConstantOp>(
            gpuFuncOp.getLoc(), int64Ty,
            rewriter.getI64IntegerAttr(type.getNumElements()));
        Value allocated = rewriter.create<LLVM::AllocaOp>(
            gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
        auto descr = MemRefDescriptor::fromStaticShape(
            rewriter, loc, typeConverter, type, allocated);
        signatureConversion.remapInput(
            numProperArguments + numWorkgroupAttributions + en.index(), descr);
      }
    }

    // Move the region to the new function, update the entry block signature.
    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                                llvmFuncOp.end());
    rewriter.applySignatureConversion(&llvmFuncOp.getBody(),
                                      signatureConversion);

    rewriter.eraseOp(gpuFuncOp);
    return success();
  }
};

struct GPUReturnOpLowering : public ConvertToLLVMPattern {
  GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
      : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(),
                             typeConverter.getDialect()->getContext(),
                             typeConverter) {}

  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands);
    return success();
  }
};

} // namespace mlir

#endif // MLIR_CONVERSION_GPUCOMMON_GPUOPSLOWERING_H_
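For orientation, this is the kind of input the shared pattern above handles; the example is adapted from the memory-attribution test added further down in this diff (function name and shapes are illustrative):

    gpu.module @kernel {
      // One workgroup attribution (shared memory, address space 3) and one
      // private attribution (address space 5). The lowering turns the former
      // into an llvm.mlir.global in addrspace(3) and the latter into an
      // llvm.alloca in the address space given by GPUFuncOpLowering's
      // template parameter (0 for NVVM, 5 for ROCDL).
      gpu.func @attributions(%arg0: f32)
          workgroup(%arg1: memref<4xf32, 3>)
          private(%arg2: memref<4xf32, 5>) {
        %c0 = constant 0 : index
        store %arg0, %arg1[%c0] : memref<4xf32, 3>
        store %arg0, %arg2[%c0] : memref<4xf32, 5>
        gpu.return
      }
    }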
@@ -21,6 +21,7 @@
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include "../PassDetail.h"
@@ -88,155 +89,6 @@ struct GPUShuffleOpLowering : public ConvertToLLVMPattern {
  }
};

struct GPUFuncOpLowering : ConvertToLLVMPattern {
  explicit GPUFuncOpLowering(LLVMTypeConverter &typeConverter)
      : ConvertToLLVMPattern(gpu::GPUFuncOp::getOperationName(),
                             typeConverter.getDialect()->getContext(),
                             typeConverter) {}

  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    assert(operands.empty() && "func op is not expected to have operands");
    auto gpuFuncOp = cast<gpu::GPUFuncOp>(op);
    Location loc = gpuFuncOp.getLoc();

    SmallVector<LLVM::GlobalOp, 3> workgroupBuffers;
    workgroupBuffers.reserve(gpuFuncOp.getNumWorkgroupAttributions());
    for (auto en : llvm::enumerate(gpuFuncOp.getWorkgroupAttributions())) {
      Value attribution = en.value();

      auto type = attribution.getType().dyn_cast<MemRefType>();
      assert(type && type.hasStaticShape() && "unexpected type in attribution");

      uint64_t numElements = type.getNumElements();

      auto elementType = typeConverter.convertType(type.getElementType())
                             .cast<LLVM::LLVMType>();
      auto arrayType = LLVM::LLVMType::getArrayTy(elementType, numElements);
      std::string name = std::string(
          llvm::formatv("__wg_{0}_{1}", gpuFuncOp.getName(), en.index()));
      auto globalOp = rewriter.create<LLVM::GlobalOp>(
          gpuFuncOp.getLoc(), arrayType, /*isConstant=*/false,
          LLVM::Linkage::Internal, name, /*value=*/Attribute(),
          gpu::GPUDialect::getWorkgroupAddressSpace());
      workgroupBuffers.push_back(globalOp);
    }

    // Rewrite the original GPU function to an LLVM function.
    auto funcType = typeConverter.convertType(gpuFuncOp.getType())
                        .cast<LLVM::LLVMType>()
                        .getPointerElementTy();

    // Remap proper input types.
    TypeConverter::SignatureConversion signatureConversion(
        gpuFuncOp.front().getNumArguments());
    typeConverter.convertFunctionSignature(
        gpuFuncOp.getType(), /*isVariadic=*/false, signatureConversion);

    // Create the new function operation. Only copy those attributes that are
    // not specific to function modeling.
    SmallVector<NamedAttribute, 4> attributes;
    for (const auto &attr : gpuFuncOp.getAttrs()) {
      if (attr.first == SymbolTable::getSymbolAttrName() ||
          attr.first == impl::getTypeAttrName() ||
          attr.first == gpu::GPUFuncOp::getNumWorkgroupAttributionsAttrName())
        continue;
      attributes.push_back(attr);
    }
    auto llvmFuncOp = rewriter.create<LLVM::LLVMFuncOp>(
        gpuFuncOp.getLoc(), gpuFuncOp.getName(), funcType,
        LLVM::Linkage::External, attributes);

    {
      // Insert operations that correspond to converted workgroup and private
      // memory attributions to the body of the function. This must operate on
      // the original function, before the body region is inlined in the new
      // function to maintain the relation between block arguments and the
      // parent operation that assigns their semantics.
      OpBuilder::InsertionGuard guard(rewriter);

      // Rewrite workgroup memory attributions to addresses of global buffers.
      rewriter.setInsertionPointToStart(&gpuFuncOp.front());
      unsigned numProperArguments = gpuFuncOp.getNumArguments();
      auto i32Type = LLVM::LLVMType::getInt32Ty(typeConverter.getDialect());

      Value zero = nullptr;
      if (!workgroupBuffers.empty())
        zero = rewriter.create<LLVM::ConstantOp>(loc, i32Type,
                                                 rewriter.getI32IntegerAttr(0));
      for (auto en : llvm::enumerate(workgroupBuffers)) {
        LLVM::GlobalOp global = en.value();
        Value address = rewriter.create<LLVM::AddressOfOp>(loc, global);
        auto elementType = global.getType().getArrayElementType();
        Value memory = rewriter.create<LLVM::GEPOp>(
            loc, elementType.getPointerTo(global.addr_space().getZExtValue()),
            address, ArrayRef<Value>{zero, zero});

        // Build a memref descriptor pointing to the buffer to plug with the
        // existing memref infrastructure. This may use more registers than
        // otherwise necessary given that memref sizes are fixed, but we can try
        // and canonicalize that away later.
        Value attribution = gpuFuncOp.getWorkgroupAttributions()[en.index()];
        auto type = attribution.getType().cast<MemRefType>();
        auto descr = MemRefDescriptor::fromStaticShape(
            rewriter, loc, typeConverter, type, memory);
        signatureConversion.remapInput(numProperArguments + en.index(), descr);
      }

      // Rewrite private memory attributions to alloca'ed buffers.
      unsigned numWorkgroupAttributions =
          gpuFuncOp.getNumWorkgroupAttributions();
      auto int64Ty = LLVM::LLVMType::getInt64Ty(typeConverter.getDialect());
      for (auto en : llvm::enumerate(gpuFuncOp.getPrivateAttributions())) {
        Value attribution = en.value();
        auto type = attribution.getType().cast<MemRefType>();
        assert(type && type.hasStaticShape() &&
               "unexpected type in attribution");

        // Explicitly drop memory space when lowering private memory
        // attributions since NVVM models it as `alloca`s in the default
        // memory space and does not support `alloca`s with addrspace(5).
        auto ptrType = typeConverter.convertType(type.getElementType())
                           .cast<LLVM::LLVMType>()
                           .getPointerTo();
        Value numElements = rewriter.create<LLVM::ConstantOp>(
            gpuFuncOp.getLoc(), int64Ty,
            rewriter.getI64IntegerAttr(type.getNumElements()));
        Value allocated = rewriter.create<LLVM::AllocaOp>(
            gpuFuncOp.getLoc(), ptrType, numElements, /*alignment=*/0);
        auto descr = MemRefDescriptor::fromStaticShape(
            rewriter, loc, typeConverter, type, allocated);
        signatureConversion.remapInput(
            numProperArguments + numWorkgroupAttributions + en.index(), descr);
      }
    }

    // Move the region to the new function, update the entry block signature.
    rewriter.inlineRegionBefore(gpuFuncOp.getBody(), llvmFuncOp.getBody(),
                                llvmFuncOp.end());
    rewriter.applySignatureConversion(&llvmFuncOp.getBody(),
                                      signatureConversion);

    rewriter.eraseOp(gpuFuncOp);
    return success();
  }
};

struct GPUReturnOpLowering : public ConvertToLLVMPattern {
  GPUReturnOpLowering(LLVMTypeConverter &typeConverter)
      : ConvertToLLVMPattern(gpu::ReturnOp::getOperationName(),
                             typeConverter.getDialect()->getContext(),
                             typeConverter) {}

  LogicalResult
  matchAndRewrite(Operation *op, ArrayRef<Value> operands,
                  ConversionPatternRewriter &rewriter) const override {
    rewriter.replaceOpWithNewOp<LLVM::ReturnOp>(op, operands);
    return success();
  }
};

/// Import the GPU Ops to NVVM Patterns.
#include "GPUToNVVM.cpp.inc"
@@ -300,8 +152,11 @@ void mlir::populateGpuToNVVMConversionPatterns(
                                  NVVM::BlockIdYOp, NVVM::BlockIdZOp>,
      GPUIndexIntrinsicOpLowering<gpu::GridDimOp, NVVM::GridDimXOp,
                                  NVVM::GridDimYOp, NVVM::GridDimZOp>,
      GPUShuffleOpLowering, GPUFuncOpLowering, GPUReturnOpLowering>(
          converter);
      GPUShuffleOpLowering, GPUReturnOpLowering,
      // Explicitly drop memory space when lowering private memory
      // attributions since NVVM models it as `alloca`s in the default
      // memory space and does not support `alloca`s with addrspace(5).
      GPUFuncOpLowering<0>>(converter);
  patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__nv_fabsf",
                                                "__nv_fabs");
  patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__nv_ceilf",
@@ -14,11 +14,16 @@
#include "mlir/Conversion/GPUToROCDL/GPUToROCDLPass.h"

#include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h"
#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h"
#include "mlir/Dialect/GPU/GPUDialect.h"
#include "mlir/Dialect/GPU/Passes.h"
#include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
#include "mlir/Dialect/Vector/VectorOps.h"
#include "mlir/Pass/Pass.h"
#include "mlir/Transforms/DialectConversion.h"
#include "llvm/Support/FormatVariadic.h"

#include "../GPUCommon/GPUOpsLowering.h"
#include "../GPUCommon/IndexIntrinsicsOpLowering.h"
#include "../GPUCommon/OpToFuncCallLowering.h"
#include "../PassDetail.h"
@@ -38,41 +43,25 @@ public:
  void runOnOperation() override {
    gpu::GPUModuleOp m = getOperation();

    OwningRewritePatternList patterns;
    LLVMTypeConverter converter(m.getContext());
    populateStdToLLVMConversionPatterns(converter, patterns);
    patterns.insert<
        GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                    ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>,
        GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                    ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
        GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
                                    ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
        GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
                                    ROCDL::GridDimYOp, ROCDL::GridDimZOp>>(
            converter);
    patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__ocml_fabs_f32",
                                                  "__ocml_fabs_f64");
    patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__ocml_ceil_f32",
                                                   "__ocml_ceil_f64");
    patterns.insert<OpToFuncCallLowering<CosOp>>(converter, "__ocml_cos_f32",
                                                 "__ocml_cos_f64");
    patterns.insert<OpToFuncCallLowering<ExpOp>>(converter, "__ocml_exp_f32",
                                                 "__ocml_exp_f64");
    patterns.insert<OpToFuncCallLowering<LogOp>>(converter, "__ocml_log_f32",
                                                 "__ocml_log_f64");
    patterns.insert<OpToFuncCallLowering<Log10Op>>(
        converter, "__ocml_log10_f32", "__ocml_log10_f64");
    patterns.insert<OpToFuncCallLowering<Log2Op>>(converter, "__ocml_log2_f32",
                                                  "__ocml_log2_f64");
    patterns.insert<OpToFuncCallLowering<TanhOp>>(converter, "__ocml_tanh_f32",
                                                  "__ocml_tanh_f64");

    ConversionTarget target(getContext());
    target.addLegalDialect<LLVM::LLVMDialect, ROCDL::ROCDLDialect>();
    OwningRewritePatternList patterns;

    populateGpuRewritePatterns(m.getContext(), patterns);
    applyPatternsAndFoldGreedily(m, patterns);
    patterns.clear();

    populateVectorToLLVMConversionPatterns(converter, patterns);
    populateStdToLLVMConversionPatterns(converter, patterns);
    populateGpuToROCDLConversionPatterns(converter, patterns);
    LLVMConversionTarget target(getContext());
    target.addIllegalDialect<gpu::GPUDialect>();
    target.addIllegalOp<LLVM::CosOp, LLVM::ExpOp, LLVM::FAbsOp, LLVM::FCeilOp,
                        LLVM::LogOp, LLVM::Log10Op, LLVM::Log2Op>();
    target.addIllegalOp<FuncOp>();
    target.addLegalDialect<ROCDL::ROCDLDialect>();
    // TODO(whchung): Remove once we support replacing non-root ops.
    target.addLegalOp<gpu::YieldOp, gpu::GPUModuleOp, gpu::ModuleEndOp>();
    if (failed(applyPartialConversion(m, target, patterns, &converter)))
      signalPassFailure();
  }
@@ -80,6 +69,36 @@ public:

} // anonymous namespace

void mlir::populateGpuToROCDLConversionPatterns(
    LLVMTypeConverter &converter, OwningRewritePatternList &patterns) {
  patterns.insert<
      GPUIndexIntrinsicOpLowering<gpu::ThreadIdOp, ROCDL::ThreadIdXOp,
                                  ROCDL::ThreadIdYOp, ROCDL::ThreadIdZOp>,
      GPUIndexIntrinsicOpLowering<gpu::BlockDimOp, ROCDL::BlockDimXOp,
                                  ROCDL::BlockDimYOp, ROCDL::BlockDimZOp>,
      GPUIndexIntrinsicOpLowering<gpu::BlockIdOp, ROCDL::BlockIdXOp,
                                  ROCDL::BlockIdYOp, ROCDL::BlockIdZOp>,
      GPUIndexIntrinsicOpLowering<gpu::GridDimOp, ROCDL::GridDimXOp,
                                  ROCDL::GridDimYOp, ROCDL::GridDimZOp>,
      GPUFuncOpLowering<5>, GPUReturnOpLowering>(converter);
  patterns.insert<OpToFuncCallLowering<AbsFOp>>(converter, "__ocml_fabs_f32",
                                                "__ocml_fabs_f64");
  patterns.insert<OpToFuncCallLowering<CeilFOp>>(converter, "__ocml_ceil_f32",
                                                 "__ocml_ceil_f64");
  patterns.insert<OpToFuncCallLowering<CosOp>>(converter, "__ocml_cos_f32",
                                               "__ocml_cos_f64");
  patterns.insert<OpToFuncCallLowering<ExpOp>>(converter, "__ocml_exp_f32",
                                               "__ocml_exp_f64");
  patterns.insert<OpToFuncCallLowering<LogOp>>(converter, "__ocml_log_f32",
                                               "__ocml_log_f64");
  patterns.insert<OpToFuncCallLowering<Log10Op>>(converter, "__ocml_log10_f32",
                                                 "__ocml_log10_f64");
  patterns.insert<OpToFuncCallLowering<Log2Op>>(converter, "__ocml_log2_f32",
                                                "__ocml_log2_f64");
  patterns.insert<OpToFuncCallLowering<TanhOp>>(converter, "__ocml_tanh_f32",
                                                "__ocml_tanh_f64");
}

std::unique_ptr<OperationPass<gpu::GPUModuleOp>>
mlir::createLowerGpuOpsToROCDLOpsPass() {
  return std::make_unique<LowerGpuOpsToROCDLOpsPass>();
@@ -0,0 +1,231 @@
|
|||
// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck --check-prefix=NVVM %s
|
||||
// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-rocdl --split-input-file %s | FileCheck --check-prefix=ROCDL %s
|
||||
|
||||
gpu.module @kernel {
|
||||
// NVVM-LABEL: llvm.func @private
|
||||
gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) {
|
||||
// Allocate private memory inside the function.
|
||||
// NVVM: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64
|
||||
// NVVM: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float*">
|
||||
|
||||
// ROCDL: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64
|
||||
// ROCDL: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*">
|
||||
|
||||
// Populate the memref descriptor.
|
||||
// NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
|
||||
// NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
|
||||
// NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
|
||||
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
|
||||
// NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
|
||||
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
|
||||
// NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
|
||||
// NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
|
||||
// NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
|
||||
|
||||
// ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(5)*, float addrspace(5)*, i64, [1 x i64], [1 x i64] }">
|
||||
// ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
|
||||
// ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
|
||||
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
|
||||
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
|
||||
// ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
|
||||
|
||||
// "Store" lowering should work just as any other memref, only check that
|
||||
// we emit some core instructions.
|
||||
// NVVM: llvm.extractvalue %[[descr6:.*]]
|
||||
// NVVM: llvm.getelementptr
|
||||
// NVVM: llvm.store
|
||||
|
||||
// ROCDL: llvm.extractvalue %[[descr6:.*]]
|
||||
// ROCDL: llvm.getelementptr
|
||||
// ROCDL: llvm.store
|
||||
%c0 = constant 0 : index
|
||||
store %arg0, %arg1[%c0] : memref<4xf32, 5>
|
||||
|
||||
"terminator"() : () -> ()
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel {
|
||||
// Workgroup buffers are allocated as globals.
|
||||
// NVVM: llvm.mlir.global internal @[[buffer:.*]]()
|
||||
// NVVM-SAME: addr_space = 3
|
||||
// NVVM-SAME: !llvm<"[4 x float]">
|
||||
|
||||
// ROCDL: llvm.mlir.global internal @[[buffer:.*]]()
|
||||
// ROCDL-SAME: addr_space = 3
|
||||
// ROCDL-SAME: !llvm<"[4 x float]">
|
||||
|
||||
// NVVM-LABEL: llvm.func @workgroup
|
||||
// NVVM-SAME: {
|
||||
|
||||
// ROCDL-LABEL: llvm.func @workgroup
|
||||
// ROCDL-SAME: {
|
||||
gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, 3>) {
|
||||
// Get the address of the first element in the global array.
|
||||
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
|
||||
// NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*">
|
||||
// NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
|
||||
// NVVM-SAME: !llvm<"float addrspace(3)*">
|
||||
|
||||
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
|
||||
// ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*">
|
||||
// ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
|
||||
// ROCDL-SAME: !llvm<"float addrspace(3)*">
|
||||
|
||||
// Populate the memref descriptor.
|
||||
// NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }">
|
||||
// NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
|
||||
// NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
|
||||
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
|
||||
// NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
|
||||
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
|
||||
// NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
|
||||
// NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
|
||||
// NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
|
||||
|
||||
// ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }">
|
||||
// ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
|
||||
// ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
|
||||
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
|
||||
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
|
||||
// ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
|
||||
|
||||
// "Store" lowering should work just as any other memref, only check that
|
||||
// we emit some core instructions.
|
||||
// NVVM: llvm.extractvalue %[[descr6:.*]]
|
||||
// NVVM: llvm.getelementptr
|
||||
// NVVM: llvm.store
|
||||
|
||||
// ROCDL: llvm.extractvalue %[[descr6:.*]]
|
||||
// ROCDL: llvm.getelementptr
|
||||
// ROCDL: llvm.store
|
||||
%c0 = constant 0 : index
|
||||
store %arg0, %arg1[%c0] : memref<4xf32, 3>
|
||||
|
||||
"terminator"() : () -> ()
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel {
|
||||
// Check that the total size was computed correctly.
|
||||
// NVVM: llvm.mlir.global internal @[[buffer:.*]]()
|
||||
// NVVM-SAME: addr_space = 3
|
||||
// NVVM-SAME: !llvm<"[48 x float]">
|
||||
|
||||
// ROCDL: llvm.mlir.global internal @[[buffer:.*]]()
|
||||
// ROCDL-SAME: addr_space = 3
|
||||
// ROCDL-SAME: !llvm<"[48 x float]">
|
||||
|
||||
// NVVM-LABEL: llvm.func @workgroup3d
|
||||
// ROCDL-LABEL: llvm.func @workgroup3d
|
||||
gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, 3>) {
|
||||
// Get the address of the first element in the global array.
|
||||
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
|
||||
// NVVM: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*">
|
||||
// NVVM: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
|
||||
// NVVM-SAME: !llvm<"float addrspace(3)*">
|
||||
|
||||
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
|
||||
// ROCDL: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*">
|
||||
// ROCDL: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
|
||||
// ROCDL-SAME: !llvm<"float addrspace(3)*">
|
||||
|
||||
// Populate the memref descriptor.
|
||||
// NVVM: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }">
|
||||
// NVVM: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
|
||||
// NVVM: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
|
||||
// NVVM: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
|
||||
// NVVM: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
|
||||
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
|
||||
// NVVM: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
|
||||
// NVVM: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64
|
||||
// NVVM: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
|
||||
// NVVM: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64
|
||||
// NVVM: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
|
||||
// NVVM: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
|
||||
// NVVM: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
|
||||
// NVVM: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
|
||||
// NVVM: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
|
||||
// NVVM: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
|
||||
// NVVM: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
|
||||
|
||||
// ROCDL: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }">
|
||||
// ROCDL: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
|
||||
// ROCDL: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
|
||||
// ROCDL: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
|
||||
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
|
||||
// ROCDL: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
|
||||
// ROCDL: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
|
||||
// ROCDL: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
|
||||
// ROCDL: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
|
||||
// ROCDL: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
|
||||
// ROCDL: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
|
||||
|
||||
%c0 = constant 0 : index
|
||||
store %arg0, %arg1[%c0,%c0,%c0] : memref<4x2x6xf32, 3>
|
||||
"terminator"() : () -> ()
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel {
|
||||
// Check that several buffers are defined.
|
||||
// NVVM: llvm.mlir.global internal @[[buffer1:.*]]()
|
||||
// NVVM-SAME: !llvm<"[1 x float]">
|
||||
// NVVM: llvm.mlir.global internal @[[buffer2:.*]]()
|
||||
// NVVM-SAME: !llvm<"[2 x float]">
|
||||
|
||||
// ROCDL: llvm.mlir.global internal @[[buffer1:.*]]()
|
||||
// ROCDL-SAME: !llvm<"[1 x float]">
|
||||
// ROCDL: llvm.mlir.global internal @[[buffer2:.*]]()
|
||||
// ROCDL-SAME: !llvm<"[2 x float]">
|
||||
|
||||
// NVVM-LABEL: llvm.func @multiple
|
||||
// ROCDL-LABEL: llvm.func @multiple
|
||||
gpu.func @multiple(%arg0: f32)
|
||||
workgroup(%arg1: memref<1xf32, 3>, %arg2: memref<2xf32, 3>)
|
||||
private(%arg3: memref<3xf32, 5>, %arg4: memref<4xf32, 5>) {
|
||||
|
||||
// Workgroup buffers.
|
||||
// NVVM: llvm.mlir.addressof @[[buffer1]]
|
||||
// NVVM: llvm.mlir.addressof @[[buffer2]]
|
||||
|
||||
// ROCDL: llvm.mlir.addressof @[[buffer1]]
|
||||
// ROCDL: llvm.mlir.addressof @[[buffer2]]
|
||||
|
||||
// Private buffers.
|
||||
// NVVM: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
|
||||
// NVVM: llvm.alloca %[[c3]] x !llvm.float : (!llvm.i64) -> !llvm<"float*">
|
||||
// NVVM: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
|
||||
// NVVM: llvm.alloca %[[c4]] x !llvm.float : (!llvm.i64) -> !llvm<"float*">
|
||||
|
||||
// ROCDL: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
|
||||
// ROCDL: llvm.alloca %[[c3]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*">
|
||||
// ROCDL: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
|
||||
// ROCDL: llvm.alloca %[[c4]] x !llvm.float : (!llvm.i64) -> !llvm<"float addrspace(5)*">
|
||||
|
||||
%c0 = constant 0 : index
|
||||
store %arg0, %arg1[%c0] : memref<1xf32, 3>
|
||||
store %arg0, %arg2[%c0] : memref<2xf32, 3>
|
||||
store %arg0, %arg3[%c0] : memref<3xf32, 5>
|
||||
store %arg0, %arg4[%c0] : memref<4xf32, 5>
|
||||
"terminator"() : () -> ()
|
||||
}
|
||||
}
|
|
@@ -1,145 +0,0 @@
|
|||
// RUN: mlir-opt -allow-unregistered-dialect --convert-gpu-to-nvvm --split-input-file %s | FileCheck %s
|
||||
|
||||
gpu.module @kernel {
|
||||
// CHECK-LABEL: llvm.func @private
|
||||
gpu.func @private(%arg0: f32) private(%arg1: memref<4xf32, 5>) {
|
||||
// Allocate private memory inside the function.
|
||||
// CHECK: %[[size:.*]] = llvm.mlir.constant(4 : i64) : !llvm.i64
|
||||
// CHECK: %[[raw:.*]] = llvm.alloca %[[size]] x !llvm.float : (!llvm.i64) -> !llvm<"float*">
|
||||
|
||||
// Populate the memref descriptor.
|
||||
// CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float*, float*, i64, [1 x i64], [1 x i64] }">
|
||||
// CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
|
||||
// CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
|
||||
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
|
||||
// CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
|
||||
// CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
|
||||
// CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
|
||||
// CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
|
||||
// CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
|
||||
|
||||
// "Store" lowering should work just as any other memref, only check that
|
||||
// we emit some core instructions.
|
||||
// CHECK: llvm.extractvalue %[[descr6:.*]]
|
||||
// CHECK: llvm.getelementptr
|
||||
// CHECK: llvm.store
|
||||
%c0 = constant 0 : index
|
||||
store %arg0, %arg1[%c0] : memref<4xf32, 5>
|
||||
|
||||
"terminator"() : () -> ()
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel {
|
||||
// Workgroup buffers are allocated as globals.
|
||||
// CHECK: llvm.mlir.global internal @[[buffer:.*]]()
|
||||
// CHECK-SAME: addr_space = 3
|
||||
// CHECK-SAME: !llvm<"[4 x float]">
|
||||
|
||||
// CHECK-LABEL: llvm.func @workgroup
|
||||
// CHECK-SAME: {
|
||||
gpu.func @workgroup(%arg0: f32) workgroup(%arg1: memref<4xf32, 3>) {
|
||||
// Get the address of the first element in the global array.
|
||||
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
|
||||
// CHECK: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[4 x float] addrspace(3)*">
|
||||
// CHECK: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
|
||||
// CHECK-SAME: !llvm<"float addrspace(3)*">
|
||||
|
||||
// Populate the memref descriptor.
|
||||
// CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [1 x i64], [1 x i64] }">
|
||||
// CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
|
||||
// CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
|
||||
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
|
||||
// CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
|
||||
// CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
|
||||
// CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
|
||||
// CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
|
||||
// CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c1]], %[[descr5]][4, 0]
|
||||
|
||||
// "Store" lowering should work just as any other memref, only check that
|
||||
// we emit some core instructions.
|
||||
// CHECK: llvm.extractvalue %[[descr6:.*]]
|
||||
// CHECK: llvm.getelementptr
|
||||
// CHECK: llvm.store
|
||||
%c0 = constant 0 : index
|
||||
store %arg0, %arg1[%c0] : memref<4xf32, 3>
|
||||
|
||||
"terminator"() : () -> ()
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel {
|
||||
// Check that the total size was computed correctly.
|
||||
// CHECK: llvm.mlir.global internal @[[buffer:.*]]()
|
||||
// CHECK-SAME: addr_space = 3
|
||||
// CHECK-SAME: !llvm<"[48 x float]">
|
||||
|
||||
// CHECK-LABEL: llvm.func @workgroup3d
|
||||
gpu.func @workgroup3d(%arg0: f32) workgroup(%arg1: memref<4x2x6xf32, 3>) {
|
||||
// Get the address of the first element in the global array.
|
||||
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : i32) : !llvm.i32
|
||||
// CHECK: %[[addr:.*]] = llvm.mlir.addressof @[[buffer]] : !llvm<"[48 x float] addrspace(3)*">
|
||||
// CHECK: %[[raw:.*]] = llvm.getelementptr %[[addr]][%[[c0]], %[[c0]]]
|
||||
// CHECK-SAME: !llvm<"float addrspace(3)*">
|
||||
|
||||
// Populate the memref descriptor.
|
||||
// CHECK: %[[descr1:.*]] = llvm.mlir.undef : !llvm<"{ float addrspace(3)*, float addrspace(3)*, i64, [3 x i64], [3 x i64] }">
|
||||
// CHECK: %[[descr2:.*]] = llvm.insertvalue %[[raw]], %[[descr1]][0]
|
||||
// CHECK: %[[descr3:.*]] = llvm.insertvalue %[[raw]], %[[descr2]][1]
|
||||
// CHECK: %[[c0:.*]] = llvm.mlir.constant(0 : index) : !llvm.i64
|
||||
// CHECK: %[[descr4:.*]] = llvm.insertvalue %[[c0]], %[[descr3]][2]
|
||||
// CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : index) : !llvm.i64
|
||||
// CHECK: %[[descr5:.*]] = llvm.insertvalue %[[c4]], %[[descr4]][3, 0]
|
||||
// CHECK: %[[c12:.*]] = llvm.mlir.constant(12 : index) : !llvm.i64
|
||||
// CHECK: %[[descr6:.*]] = llvm.insertvalue %[[c12]], %[[descr5]][4, 0]
|
||||
// CHECK: %[[c2:.*]] = llvm.mlir.constant(2 : index) : !llvm.i64
|
||||
// CHECK: %[[descr7:.*]] = llvm.insertvalue %[[c2]], %[[descr6]][3, 1]
|
||||
// CHECK: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
|
||||
// CHECK: %[[descr8:.*]] = llvm.insertvalue %[[c6]], %[[descr7]][4, 1]
|
||||
// CHECK: %[[c6:.*]] = llvm.mlir.constant(6 : index) : !llvm.i64
|
||||
// CHECK: %[[descr9:.*]] = llvm.insertvalue %[[c6]], %[[descr8]][3, 2]
|
||||
// CHECK: %[[c1:.*]] = llvm.mlir.constant(1 : index) : !llvm.i64
|
||||
// CHECK: %[[descr10:.*]] = llvm.insertvalue %[[c1]], %[[descr9]][4, 2]
|
||||
|
||||
%c0 = constant 0 : index
|
||||
store %arg0, %arg1[%c0,%c0,%c0] : memref<4x2x6xf32, 3>
|
||||
"terminator"() : () -> ()
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel {
|
||||
// Check that several buffers are defined.
|
||||
// CHECK: llvm.mlir.global internal @[[buffer1:.*]]()
|
||||
// CHECK-SAME: !llvm<"[1 x float]">
|
||||
// CHECK: llvm.mlir.global internal @[[buffer2:.*]]()
|
||||
// CHECK-SAME: !llvm<"[2 x float]">
|
||||
|
||||
// CHECK-LABEL: llvm.func @multiple
|
||||
gpu.func @multiple(%arg0: f32)
|
||||
workgroup(%arg1: memref<1xf32, 3>, %arg2: memref<2xf32, 3>)
|
||||
private(%arg3: memref<3xf32, 5>, %arg4: memref<4xf32, 5>) {
|
||||
|
||||
// Workgroup buffers.
|
||||
// CHECK: llvm.mlir.addressof @[[buffer1]]
|
||||
// CHECK: llvm.mlir.addressof @[[buffer2]]
|
||||
|
||||
// Private buffers.
|
||||
// CHECK: %[[c3:.*]] = llvm.mlir.constant(3 : i64)
|
||||
// CHECK: llvm.alloca %[[c3]] x !llvm.float
|
||||
// CHECK: %[[c4:.*]] = llvm.mlir.constant(4 : i64)
|
||||
// CHECK: llvm.alloca %[[c4]] x !llvm.float
|
||||
|
||||
%c0 = constant 0 : index
|
||||
store %arg0, %arg1[%c0] : memref<1xf32, 3>
|
||||
store %arg0, %arg2[%c0] : memref<2xf32, 3>
|
||||
store %arg0, %arg3[%c0] : memref<3xf32, 5>
|
||||
store %arg0, %arg4[%c0] : memref<4xf32, 5>
|
||||
"terminator"() : () -> ()
|
||||
}
|
||||
}
|
|
@@ -1,9 +1,10 @@
|
|||
// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s
|
||||
// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file | FileCheck %s --dump-input-on-failure
|
||||
|
||||
gpu.module @kernel_module {
|
||||
gpu.module @test_module {
|
||||
// CHECK-LABEL: func @gpu_index_ops()
|
||||
func @gpu_index_ops()
|
||||
attributes { gpu.kernel } {
|
||||
-> (index, index, index, index, index, index,
|
||||
index, index, index, index, index, index) {
|
||||
// CHECK: rocdl.workitem.id.x : !llvm.i32
|
||||
%tIdX = "gpu.thread_id"() {dimension = "x"} : () -> (index)
|
||||
// CHECK: rocdl.workitem.id.y : !llvm.i32
|
||||
|
@@ -32,68 +33,71 @@ gpu.module @kernel_module {
|
|||
// CHECK: rocdl.grid.dim.z : !llvm.i32
|
||||
%gDimZ = "gpu.grid_dim"() {dimension = "z"} : () -> (index)
|
||||
|
||||
std.return
|
||||
std.return %tIdX, %tIdY, %tIdZ, %bDimX, %bDimY, %bDimZ,
|
||||
%bIdX, %bIdY, %bIdZ, %gDimX, %gDimY, %gDimZ
|
||||
: index, index, index, index, index, index,
|
||||
index, index, index, index, index, index
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel_module {
|
||||
gpu.module @test_module {
|
||||
// CHECK: llvm.func @__ocml_fabs_f32(!llvm.float) -> !llvm.float
|
||||
// CHECK: llvm.func @__ocml_fabs_f64(!llvm.double) -> !llvm.double
|
||||
// CHECK-LABEL: func @gpu_fabs
|
||||
func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) {
|
||||
func @gpu_fabs(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
|
||||
%result32 = std.absf %arg_f32 : f32
|
||||
// CHECK: llvm.call @__ocml_fabs_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
|
||||
%result64 = std.absf %arg_f64 : f64
|
||||
// CHECK: llvm.call @__ocml_fabs_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
|
||||
std.return
|
||||
std.return %result32, %result64 : f32, f64
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel_module {
|
||||
gpu.module @test_module {
|
||||
// CHECK: llvm.func @__ocml_ceil_f32(!llvm.float) -> !llvm.float
|
||||
// CHECK: llvm.func @__ocml_ceil_f64(!llvm.double) -> !llvm.double
|
||||
// CHECK-LABEL: func @gpu_ceil
|
||||
func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) {
|
||||
func @gpu_ceil(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
|
||||
%result32 = std.ceilf %arg_f32 : f32
|
||||
// CHECK: llvm.call @__ocml_ceil_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
|
||||
%result64 = std.ceilf %arg_f64 : f64
|
||||
// CHECK: llvm.call @__ocml_ceil_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
|
||||
std.return
|
||||
std.return %result32, %result64 : f32, f64
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel_module {
|
||||
gpu.module @test_module {
|
||||
// CHECK: llvm.func @__ocml_cos_f32(!llvm.float) -> !llvm.float
|
||||
// CHECK: llvm.func @__ocml_cos_f64(!llvm.double) -> !llvm.double
|
||||
// CHECK-LABEL: func @gpu_cos
|
||||
func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) {
|
||||
func @gpu_cos(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
|
||||
%result32 = std.cos %arg_f32 : f32
|
||||
// CHECK: llvm.call @__ocml_cos_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
|
||||
%result64 = std.cos %arg_f64 : f64
|
||||
// CHECK: llvm.call @__ocml_cos_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
|
||||
std.return
|
||||
std.return %result32, %result64 : f32, f64
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
gpu.module @kernel_module {
|
||||
gpu.module @test_module {
|
||||
// CHECK: llvm.func @__ocml_exp_f32(!llvm.float) -> !llvm.float
|
||||
// CHECK: llvm.func @__ocml_exp_f64(!llvm.double) -> !llvm.double
|
||||
// CHECK-LABEL: func @gpu_exp
|
||||
func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) {
|
||||
func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
|
||||
%exp_f32 = std.exp %arg_f32 : f32
|
||||
// CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
|
||||
%result_f32 = std.exp %exp_f32 : f32
|
||||
%result32 = std.exp %exp_f32 : f32
|
||||
// CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
|
||||
%result64 = std.exp %arg_f64 : f64
|
||||
// CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
|
||||
std.return
|
||||
std.return %result32, %result64 : f32, f64
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -101,20 +105,20 @@ gpu.module @kernel_module {
|
|||
// -----
|
||||
|
||||
// Test that we handled properly operation with SymbolTable other than module op
|
||||
gpu.module @kernel_module {
|
||||
gpu.module @test_module {
|
||||
"test.symbol_scope"() ({
|
||||
// CHECK: test.symbol_scope
|
||||
// CHECK: llvm.func @__ocml_exp_f32(!llvm.float) -> !llvm.float
|
||||
// CHECK: llvm.func @__ocml_exp_f64(!llvm.double) -> !llvm.double
|
||||
// CHECK-LABEL: func @gpu_exp
|
||||
func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) {
|
||||
func @gpu_exp(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
|
||||
%exp_f32 = std.exp %arg_f32 : f32
|
||||
// CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
|
||||
%result_f32 = std.exp %exp_f32 : f32
|
||||
%result32 = std.exp %exp_f32 : f32
|
||||
// CHECK: llvm.call @__ocml_exp_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
|
||||
%result64 = std.exp %arg_f64 : f64
|
||||
// CHECK: llvm.call @__ocml_exp_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
|
||||
std.return
|
||||
std.return %result32, %result64 : f32, f64
|
||||
}
|
||||
"test.finish" () : () -> ()
|
||||
}) : () -> ()
|
||||
|
@@ -122,60 +126,60 @@ gpu.module @kernel_module {
|
|||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel_module {
|
||||
gpu.module @test_module {
|
||||
// CHECK: llvm.func @__ocml_log_f32(!llvm.float) -> !llvm.float
|
||||
// CHECK: llvm.func @__ocml_log_f64(!llvm.double) -> !llvm.double
|
||||
// CHECK-LABEL: func @gpu_log
|
||||
func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) {
|
||||
func @gpu_log(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
|
||||
%result32 = std.log %arg_f32 : f32
|
||||
// CHECK: llvm.call @__ocml_log_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
|
||||
%result64 = std.log %arg_f64 : f64
|
||||
// CHECK: llvm.call @__ocml_log_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
|
||||
std.return
|
||||
std.return %result32, %result64 : f32, f64
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel_module {
|
||||
gpu.module @test_module {
|
||||
// CHECK: llvm.func @__ocml_log10_f32(!llvm.float) -> !llvm.float
|
||||
// CHECK: llvm.func @__ocml_log10_f64(!llvm.double) -> !llvm.double
|
||||
// CHECK-LABEL: func @gpu_log10
|
||||
func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) {
|
||||
func @gpu_log10(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
|
||||
%result32 = std.log10 %arg_f32 : f32
|
||||
// CHECK: llvm.call @__ocml_log10_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
|
||||
%result64 = std.log10 %arg_f64 : f64
|
||||
// CHECK: llvm.call @__ocml_log10_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
|
||||
std.return
|
||||
std.return %result32, %result64 : f32, f64
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel_module {
|
||||
gpu.module @test_module {
|
||||
// CHECK: llvm.func @__ocml_log2_f32(!llvm.float) -> !llvm.float
|
||||
// CHECK: llvm.func @__ocml_log2_f64(!llvm.double) -> !llvm.double
|
||||
// CHECK-LABEL: func @gpu_log2
|
||||
func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) {
|
||||
func @gpu_log2(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
|
||||
%result32 = std.log2 %arg_f32 : f32
|
||||
// CHECK: llvm.call @__ocml_log2_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
|
||||
%result64 = std.log2 %arg_f64 : f64
|
||||
// CHECK: llvm.call @__ocml_log2_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
|
||||
std.return
|
||||
std.return %result32, %result64 : f32, f64
|
||||
}
|
||||
}
|
||||
|
||||
// -----
|
||||
|
||||
gpu.module @kernel_module {
|
||||
gpu.module @test_module {
|
||||
// CHECK: llvm.func @__ocml_tanh_f32(!llvm.float) -> !llvm.float
|
||||
// CHECK: llvm.func @__ocml_tanh_f64(!llvm.double) -> !llvm.double
|
||||
// CHECK-LABEL: func @gpu_tanh
|
||||
func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) {
|
||||
func @gpu_tanh(%arg_f32 : f32, %arg_f64 : f64) -> (f32, f64) {
|
||||
%result32 = std.tanh %arg_f32 : f32
|
||||
// CHECK: llvm.call @__ocml_tanh_f32(%{{.*}}) : (!llvm.float) -> !llvm.float
|
||||
%result64 = std.tanh %arg_f64 : f64
|
||||
// CHECK: llvm.call @__ocml_tanh_f64(%{{.*}}) : (!llvm.double) -> !llvm.double
|
||||
std.return
|
||||
std.return %result32, %result64 : f32, f64
|
||||
}
|
||||
}