[MLIR][GPU] Add gpu.set_default_device op

This op is added to allow MLIR code running on multi-GPU systems to
select the GPU they want to execute operations on when no GPU is
otherwise specified.

Reviewed By: mehdi_amini

Differential Revision: https://reviews.llvm.org/D119883
This commit is contained in:
Krzysztof Drewniak 2022-02-15 20:23:44 +00:00
parent 9febd1e573
commit 84718d37db
5 changed files with 71 additions and 7 deletions

View File

@ -273,7 +273,7 @@ def GPU_GPUFuncOp : GPU_Op<"func", [
/// Returns the type of this function.
/// FIXME: We should drive this via the ODS `type` param.
FunctionType getType() {
FunctionType getType() {
return getTypeAttr().getValue().cast<FunctionType>();
}
@ -1006,6 +1006,18 @@ def GPU_MemsetOp : GPU_Op<"memset",
let hasFolder = 1;
}
// gpu.set_default_device: selects, by zero-based ordinal, the GPU that
// subsequent GPU operations target when no device is otherwise specified.
// Declared with a MemWrite effect because it mutates runtime state (the
// current default device), so it must not be DCE'd or reordered freely.
def GPU_SetDefaultDeviceOp : GPU_Op<"set_default_device",
[MemoryEffects<[MemWrite]>]>,
Arguments<(ins I32:$devIndex)> {
let summary = "Set default GPU for operations after this by index";
let description = [{
Operation that sets the current default GPU, using a zero-based index
into the set of GPUs on the system. The default GPU setting may be
thread-local.
}];
// Prints as: gpu.set_default_device %idx (attributes first, then operand).
let assemblyFormat = "attr-dict $devIndex";
}
def GPU_SubgroupMmaLoadMatrixOp : GPU_Op<"subgroup_mma_load_matrix",
[MemoryEffects<[MemRead]>]>{

View File

@ -185,6 +185,10 @@ protected:
{llvmPointerType /* void *dst */, llvmInt32Type /* unsigned int value */,
llvmIntPtrType /* intptr_t sizeBytes */,
llvmPointerType /* void *stream */}};
// Builder for calls to the runtime-wrapper entry point backing
// gpu.set_default_device: void mgpuSetDefaultDevice(int32_t).
FunctionCallBuilder setDefaultDeviceCallBuilder = {
"mgpuSetDefaultDevice",
llvmVoidType,
{llvmInt32Type /* int32_t devIndex */}};
};
/// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
@ -342,6 +346,21 @@ private:
matchAndRewrite(gpu::MemsetOp memsetOp, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;
};
/// A rewrite pattern to convert gpu.set_default_device to a GPU runtime call.
/// Currently supports CUDA and ROCm (HIP).
class ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern
: public ConvertOpToGpuRuntimeCallPattern<gpu::SetDefaultDeviceOp> {
public:
ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern(
LLVMTypeConverter &typeConverter)
: ConvertOpToGpuRuntimeCallPattern<gpu::SetDefaultDeviceOp>(
typeConverter) {}
/// Replaces the op with a call to mgpuSetDefaultDevice, forwarding the
/// converted i32 device index. Defined out of line below.
LogicalResult
matchAndRewrite(gpu::SetDefaultDeviceOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override;
};
} // namespace
void GpuToLLVMConversionPass::runOnOperation() {
@ -844,6 +863,15 @@ LogicalResult ConvertMemsetOpToGpuRuntimeCallPattern::matchAndRewrite(
return success();
}
/// Lowers gpu.set_default_device to a call to the mgpuSetDefaultDevice
/// runtime-wrapper function, forwarding the already-converted i32 operand.
/// Always succeeds: the op is unconditionally rewritten.
LogicalResult ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern::matchAndRewrite(
    gpu::SetDefaultDeviceOp op, OpAdaptor adaptor,
    ConversionPatternRewriter &rewriter) const {
  Location loc = op.getLoc();
  setDefaultDeviceCallBuilder.create(loc, rewriter, {adaptor.devIndex()});
  // The op produces no results, so erase it outright; eraseOp states that
  // intent directly instead of "replacing" with an empty value list.
  rewriter.eraseOp(op);
  return success();
}
std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
mlir::createGpuToLLVMConversionPass() {
return std::make_unique<GpuToLLVMConversionPass>();
@ -861,6 +889,7 @@ void mlir::populateGpuToLLVMConversionPatterns(
ConvertHostRegisterOpToGpuRuntimeCallPattern,
ConvertMemcpyOpToGpuRuntimeCallPattern,
ConvertMemsetOpToGpuRuntimeCallPattern,
ConvertSetDefaultDeviceOpToGpuRuntimeCallPattern,
ConvertWaitAsyncOpToGpuRuntimeCallPattern,
ConvertWaitOpToGpuRuntimeCallPattern,
ConvertAsyncYieldToGpuRuntimeCallPattern>(converter);

View File

@ -35,16 +35,20 @@
fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \
}(expr)
// Make the primary context of device 0 current for the duration of the instance
// and restore the previous context on destruction.
// Zero-based ordinal of the GPU used when creating contexts below; updated by
// mgpuSetDefaultDevice. thread_local, so each host thread carries its own
// default device setting.
thread_local static int32_t defaultDevice = 0;
// Make the primary context of the current default device current for the
// duration of the instance and restore the previous context on destruction.
class ScopedContext {
public:
ScopedContext() {
// Static reference to CUDA primary context for device ordinal 0.
// Static reference to CUDA primary context for device ordinal
// defaultDevice.
static CUcontext context = [] {
CUDA_REPORT_IF_ERROR(cuInit(/*flags=*/0));
CUdevice device;
CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/0));
CUDA_REPORT_IF_ERROR(cuDeviceGet(&device, /*ordinal=*/defaultDevice));
CUcontext ctx;
// Note: this does not affect the current context.
CUDA_REPORT_IF_ERROR(cuDevicePrimaryCtxRetain(&ctx, device));
@ -187,3 +191,8 @@ mgpuMemHostRegisterMemRef(int64_t rank, StridedMemRefType<char, 1> *descriptor,
auto *ptr = descriptor->data + descriptor->offset * elementSizeBytes;
mgpuMemHostRegister(ptr, sizeBytes);
}
// Runtime-wrapper entry point for gpu.set_default_device: records the new
// thread-local default device ordinal and makes it current for the CUDA
// runtime API as well.
// NOTE(review): cudaSetDevice is a *runtime*-API call returning cudaError_t,
// while this file otherwise uses driver-API (cu*) calls; confirm
// CUDA_REPORT_IF_ERROR accepts cudaError_t and not only CUresult.
// NOTE(review): ScopedContext caches its primary context in a function-local
// `static`, so contexts created before this call may not observe the new
// defaultDevice — verify that is the intended semantics.
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) {
defaultDevice = device;
CUDA_REPORT_IF_ERROR(cudaSetDevice(device));
}

View File

@ -30,16 +30,18 @@
fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \
}(expr)
// Zero-based ordinal of the GPU used when creating contexts below; updated by
// mgpuSetDefaultDevice. thread_local, so each host thread carries its own
// default device setting.
thread_local static int32_t defaultDevice = 0;
// Sets the `Context` for the duration of the instance and restores the previous
// context on destruction.
class ScopedContext {
public:
ScopedContext() {
// Static reference to HIP primary context for device ordinal 0.
// Static reference to HIP primary context for device ordinal defaultDevice.
static hipCtx_t context = [] {
HIP_REPORT_IF_ERROR(hipInit(/*flags=*/0));
hipDevice_t device;
HIP_REPORT_IF_ERROR(hipDeviceGet(&device, /*ordinal=*/0));
HIP_REPORT_IF_ERROR(hipDeviceGet(&device, /*ordinal=*/defaultDevice));
hipCtx_t ctx;
HIP_REPORT_IF_ERROR(hipDevicePrimaryCtxRetain(&ctx, device));
return ctx;
@ -199,3 +201,8 @@ mgpuMemGetDeviceMemRef1dInt32(int32_t *allocated, int32_t *aligned,
mgpuMemGetDevicePointer(aligned, &devicePtr);
return {devicePtr, devicePtr, offset, {size}, {stride}};
}
// Runtime-wrapper entry point for gpu.set_default_device: records the new
// thread-local default device ordinal and makes it the HIP runtime's current
// device.
// NOTE(review): the CUDA counterpart is tagged MLIR_CUDA_WRAPPERS_EXPORT but
// this one has no export macro — confirm symbol visibility is correct for
// shared-library builds of the ROCm wrappers.
extern "C" void mgpuSetDefaultDevice(int32_t device) {
defaultDevice = device;
HIP_REPORT_IF_ERROR(hipSetDevice(device));
}

View File

@ -252,4 +252,11 @@ module attributes {gpu.container_module} {
gpu.device_async_wait %token {numGroups = 1 : i32}
return
}
// CHECK-LABEL: func @set_default_device
// Round-trip test: gpu.set_default_device with an i32 operand parses and
// prints back unchanged.
func @set_default_device(%arg0: i32) {
// CHECK: gpu.set_default_device
gpu.set_default_device %arg0
return
}
}