Initial check-in of Acxxel (StreamExecutor renamed)
Summary:
Acxxel is basically a simplified redesign of StreamExecutor.
Here are the major points where Acxxel differs from the current
StreamExecutor design:
* Acxxel doesn't support the kernel and kernel loader types designed for
emission by the compiler to support type-safe kernel launches. For
CUDA, kernels in Acxxel can be seamlessly launched using the standard
CUDA triple-chevron kernel launch syntax that is available with clang
and nvcc. For CUDA and OpenCL, kernel arguments can be passed in the
old-fashioned way, as one array of pointers to arguments and another
array of argument sizes (see the sketch after this message). Although
OpenCL doesn't get a type-safe kernel launch method, it does still get
the benefit of all the memory management wrappers. In the future, clang
may add support for triple-chevron OpenCL kernel launches, or some
other type-safe OpenCL kernel launch method.
* Acxxel does not depend on any other code in LLVM, so it builds
completely independently of LLVM.
The goal is to check in Acxxel and remove StreamExecutor, or perhaps to
remove the old StreamExecutor and rename Acxxel to StreamExecutor.
Either way, Acxxel should be thought of as a new version of
StreamExecutor, not as a separate project.
Reviewers: jlebar, jprice
Subscribers: beanz, mgorny, modocache, parallel_libs-commits
Differential Revision: https://reviews.llvm.org/D25701
llvm-svn: 285111
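
To illustrate the "old-fashioned" argument-passing style, here is a minimal
sketch against the raw CUDA driver API. It shows the underlying pattern such
launches follow, not Acxxel's exact interface; the function name and the
assumption that the kernel handle and device pointers are already set up are
for illustration only (error checking omitted):

  #include <cuda.h>

  // Launch saxpyKernel(float A, float *X, float *Y, int N) via the driver
  // API, passing the arguments as one array of pointers to their values.
  // cuLaunchKernel reads the argument sizes from the kernel image; OpenCL's
  // clSetKernelArg takes an explicit size for each argument instead.
  void launchSaxpyRaw(CUfunction Kernel, CUdeviceptr X, CUdeviceptr Y, int N) {
    float A = 2.0f;
    void *Args[] = {&A, &X, &Y, &N};
    cuLaunchKernel(Kernel,
                   /*gridDim=*/1, 1, 1,
                   /*blockDim=*/static_cast<unsigned>(N), 1, 1,
                   /*sharedMemBytes=*/0,
                   /*stream=*/nullptr,
                   /*kernelParams=*/Args,
                   /*extra=*/nullptr);
  }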

//===--- simple_example.cu - Simple example of using Acxxel ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// This file is a simple example of using Acxxel.
///
//===----------------------------------------------------------------------===//

/// [Example simple saxpy]
#include "acxxel.h"

#include <array>
#include <cstdio>
#include <cstdlib>

// A standard CUDA kernel.
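// It computes X[I] = A * X[I] + Y[I] for each index I, storing the result
// back into X.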
__global__ void saxpyKernel(float A, float *X, float *Y, int N) {
  int I = (blockDim.x * blockIdx.x) + threadIdx.x;
  if (I < N)
    X[I] = A * X[I] + Y[I];
}

// A host library wrapping the CUDA kernel. All Acxxel calls are in here.
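// getValue() and takeValue() unwrap Acxxel's Expected-style return values;
// for brevity, this example does not check them for errors before unwrapping.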
template <size_t N>
void saxpy(float A, std::array<float, N> &X, const std::array<float, N> &Y) {
  // Get the CUDA platform and make a CUDA stream.
  acxxel::Platform *CUDA = acxxel::getCUDAPlatform().getValue();
  acxxel::Stream Stream = CUDA->createStream().takeValue();

  // Allocate space for device arrays.
  auto DeviceX = CUDA->mallocD<float>(N).takeValue();
  auto DeviceY = CUDA->mallocD<float>(N).takeValue();

  // Copy X and Y out to the device.
  Stream.syncCopyHToD(X, DeviceX).syncCopyHToD(Y, DeviceY);

  // Launch the kernel using triple-chevron notation.
  saxpyKernel<<<1, N, 0, Stream>>>(A, DeviceX, DeviceY, N);

  // Copy the results back to the host.
  acxxel::Status Status = Stream.syncCopyDToH(DeviceX, X).takeStatus();

  // Check for any errors.
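  // takeStatus() reads back the status the stream operations above recorded
  // on the stream, so this one check covers the whole sequence.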
  if (Status.isError()) {
    std::fprintf(stderr, "Error performing acxxel saxpy: %s\n",
                 Status.getMessage().c_str());
    std::exit(EXIT_FAILURE);
  }
}
/// [Example simple saxpy]
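
// For comparison, the same computation written directly against the CUDA
// runtime API, with no Acxxel calls.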
/// [Example CUDA simple saxpy]
template <size_t N>
void cudaSaxpy(float A, std::array<float, N> &X, std::array<float, N> &Y) {
  // This size is needed all over the place, so give it a name.
  constexpr size_t Size = N * sizeof(float);

  // Allocate space for device arrays.
  float *DeviceX;
  float *DeviceY;
  cudaMalloc(&DeviceX, Size);
  cudaMalloc(&DeviceY, Size);

  // Copy X and Y out to the device.
  cudaMemcpy(DeviceX, X.data(), Size, cudaMemcpyHostToDevice);
  cudaMemcpy(DeviceY, Y.data(), Size, cudaMemcpyHostToDevice);

  // Launch the kernel using triple-chevron notation.
  saxpyKernel<<<1, N>>>(A, DeviceX, DeviceY, N);

  // Copy the results back to the host.
  cudaMemcpy(X.data(), DeviceX, Size, cudaMemcpyDeviceToHost);

  // Free resources.
  cudaFree(DeviceX);
  cudaFree(DeviceY);

  // Check for any errors.
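  // cudaGetLastError() returns the error state left by the preceding runtime
  // calls and the kernel launch, and resets it to cudaSuccess.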
  cudaError_t Error = cudaGetLastError();
  if (Error) {
    std::fprintf(stderr, "Error performing cudart saxpy: %s\n",
                 cudaGetErrorString(Error));
    std::exit(EXIT_FAILURE);
  }
}
/// [Example CUDA simple saxpy]

template <typename F> void testSaxpy(F &&SaxpyFunction) {
  float A = 2.f;
  std::array<float, 3> X = {{0.f, 1.f, 2.f}};
  std::array<float, 3> Y = {{3.f, 4.f, 5.f}};
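  // Expected elementwise result: A * X + Y = {2*0+3, 2*1+4, 2*2+5}.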
  std::array<float, 3> Expected = {{3.f, 6.f, 9.f}};
  SaxpyFunction(A, X, Y);
  for (int I = 0; I < 3; ++I)
    if (X[I] != Expected[I]) {
      std::fprintf(stderr, "Result mismatch at index %d, %f != %f\n", I, X[I],
                   Expected[I]);
      std::exit(EXIT_FAILURE);
    }
}

int main() {
  testSaxpy(saxpy<3>);
  testSaxpy(cudaSaxpy<3>);
}
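
// A plausible way to build this example with clang's CUDA support; the GPU
// architecture, include path, and link libraries here are assumptions to
// adjust for your setup:
//
//   clang++ simple_example.cu --cuda-gpu-arch=sm_35 \
//       -I<path-to-acxxel-headers> -lcudart -o simple_example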