forked from OSchip/llvm-project
142 lines
4.6 KiB
C++
142 lines
4.6 KiB
C++
//===-- CUDASaxpy.cpp - Example of CUDA saxpy with StreamExecutor API -----===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This file contains example code demonstrating the usage of the
/// StreamExecutor API. Snippets of this file will be included as code examples
/// in documentation. Taking these examples from a real source file guarantees
/// that the examples will always compile.
///
//===----------------------------------------------------------------------===//
|
|
|
|
#include <algorithm>
|
|
#include <cassert>
|
|
#include <cstdlib>
|
|
#include <vector>
|
|
|
|
#include "streamexecutor/StreamExecutor.h"
|
|
|
|
/// [Example saxpy compiler-generated]
|
|
// Code in this namespace is generated by the compiler (e.g. clang).
|
|
//
|
|
// The name of this namespace may depend on the compiler that generated it, so
|
|
// this is just an example name.
|
|
namespace __compilergen {
|
|
|
|
// Specialization of the streamexecutor::Kernel template class for the parameter
|
|
// types of the saxpy(float A, float *X, float *Y) kernel.
|
|
using SaxpyKernel =
|
|
streamexecutor::Kernel<float, streamexecutor::GlobalDeviceMemory<float>,
|
|
streamexecutor::GlobalDeviceMemory<float>>;
|
|
|
|
// A string containing the PTX code generated by the device compiler for the
|
|
// saxpy kernel. String contents not shown here.
|
|
extern const char *SaxpyPTX;
|
|
|
|
// A global instance of a loader spec that knows how to load the code in the
|
|
// SaxpyPTX string.
|
|
static streamexecutor::MultiKernelLoaderSpec SaxpyLoaderSpec = []() {
|
|
streamexecutor::MultiKernelLoaderSpec Spec;
|
|
Spec.addCUDAPTXInMemory("saxpy", {{{2, 0}, SaxpyPTX}});
|
|
return Spec;
|
|
}();
|
|
|
|
} // namespace __compilergen
|
|
/// [Example saxpy compiler-generated]
|
|
|
|
/// [Example saxpy host PTX]
|
|
// The PTX text for a saxpy kernel.
|
|
const char *__compilergen::SaxpyPTX = R"(
|
|
.version 4.3
|
|
.target sm_20
|
|
.address_size 64
|
|
|
|
.visible .entry saxpy(.param .f32 A, .param .u64 X, .param .u64 Y) {
|
|
.reg .f32 %AValue;
|
|
.reg .f32 %XValue;
|
|
.reg .f32 %YValue;
|
|
.reg .f32 %Result;
|
|
|
|
.reg .b64 %XBaseAddrGeneric;
|
|
.reg .b64 %YBaseAddrGeneric;
|
|
.reg .b64 %XBaseAddrGlobal;
|
|
.reg .b64 %YBaseAddrGlobal;
|
|
.reg .b64 %XAddr;
|
|
.reg .b64 %YAddr;
|
|
.reg .b64 %ThreadByteOffset;
|
|
|
|
.reg .b32 %TID;
|
|
|
|
ld.param.f32 %AValue, [A];
|
|
ld.param.u64 %XBaseAddrGeneric, [X];
|
|
ld.param.u64 %YBaseAddrGeneric, [Y];
|
|
cvta.to.global.u64 %XBaseAddrGlobal, %XBaseAddrGeneric;
|
|
cvta.to.global.u64 %YBaseAddrGlobal, %YBaseAddrGeneric;
|
|
mov.u32 %TID, %tid.x;
|
|
mul.wide.u32 %ThreadByteOffset, %TID, 4;
|
|
add.s64 %XAddr, %ThreadByteOffset, %XBaseAddrGlobal;
|
|
add.s64 %YAddr, %ThreadByteOffset, %YBaseAddrGlobal;
|
|
ld.global.f32 %XValue, [%XAddr];
|
|
ld.global.f32 %YValue, [%YAddr];
|
|
fma.rn.f32 %Result, %AValue, %XValue, %YValue;
|
|
st.global.f32 [%XAddr], %Result;
|
|
ret;
|
|
}
|
|
)";
|
|
/// [Example saxpy host PTX]
|
|
|
|
int main() {
|
|
/// [Example saxpy host main]
|
|
namespace se = ::streamexecutor;
|
|
namespace cg = ::__compilergen;
|
|
|
|
// Create some host data.
|
|
float A = 42.0f;
|
|
std::vector<float> HostX = {0, 1, 2, 3};
|
|
std::vector<float> HostY = {4, 5, 6, 7};
|
|
size_t ArraySize = HostX.size();
|
|
|
|
// Get a device object.
|
|
se::Platform *Platform =
|
|
getOrDie(se::PlatformManager::getPlatformByName("CUDA"));
|
|
if (Platform->getDeviceCount() == 0) {
|
|
return EXIT_FAILURE;
|
|
}
|
|
se::Device Device = getOrDie(Platform->getDevice(0));
|
|
|
|
// Load the kernel onto the device.
|
|
cg::SaxpyKernel Kernel =
|
|
getOrDie(Device.createKernel<cg::SaxpyKernel>(cg::SaxpyLoaderSpec));
|
|
|
|
se::RegisteredHostMemory<float> RegisteredX =
|
|
getOrDie(Device.registerHostMemory<float>(HostX));
|
|
se::RegisteredHostMemory<float> RegisteredY =
|
|
getOrDie(Device.registerHostMemory<float>(HostY));
|
|
|
|
// Allocate memory on the device.
|
|
se::GlobalDeviceMemory<float> X =
|
|
getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
|
|
se::GlobalDeviceMemory<float> Y =
|
|
getOrDie(Device.allocateDeviceMemory<float>(ArraySize));
|
|
|
|
// Run operations on a stream.
|
|
se::Stream Stream = getOrDie(Device.createStream());
|
|
Stream.thenCopyH2D(RegisteredX, X)
|
|
.thenCopyH2D(RegisteredY, Y)
|
|
.thenLaunch(ArraySize, 1, Kernel, A, X, Y)
|
|
.thenCopyD2H(X, RegisteredX);
|
|
// Wait for the stream to complete.
|
|
se::dieIfError(Stream.blockHostUntilDone());
|
|
|
|
// Process output data in HostX.
|
|
std::vector<float> ExpectedX = {4, 47, 90, 133};
|
|
assert(std::equal(ExpectedX.begin(), ExpectedX.end(), HostX.begin()));
|
|
/// [Example saxpy host main]
|
|
}
|