[OpenMP] Completely remove old device runtime
This patch completely removes the old OpenMP device runtime. Previously, the new runtime was distinguished by the prefix `libomptarget-new-`, while the old runtime was simply called `libomptarget-`. This patch makes the formerly new runtime the only runtime available. The old device runtime project has been deleted entirely, and all references to the `libomptarget-new-` runtime have been replaced with `libomptarget-`.

Reviewed By: JonChesterfield

Differential Revision: https://reviews.llvm.org/D118934
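As a minimal sketch of what the rename means for the driver (not the verbatim driver code; the real logic is in the toolchain hunks below), the per-target bitcode library name is now computed without any "new-" branch:

    // Sketch: device-runtime bitcode library naming after this patch.
    // "sm_70" below is an illustrative GPU arch, not taken from the diff.
    #include <string>
    std::string deviceRTLName(bool IsNVPTX, const std::string &Arch) {
      // Before: "libomptarget-new-nvptx-<arch>.bc" when the new runtime was
      // selected; now there is a single name with no "new-" variant.
      std::string Suffix = (IsNVPTX ? "nvptx-" : "amdgcn-") + Arch;
      return "libomptarget-" + Suffix + ".bc"; // e.g. libomptarget-nvptx-sm_70.bc
    }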
This commit is contained in:
parent 0cc6165d05
commit 034adaf5be
@@ -1203,8 +1203,7 @@ CGOpenMPRuntimeGPU::CGOpenMPRuntimeGPU(CodeGenModule &CGM)
     llvm_unreachable("OpenMP can only handle device code.");

   llvm::OpenMPIRBuilder &OMPBuilder = getOMPBuilder();
-  if (CGM.getLangOpts().OpenMPTargetNewRuntime &&
-      !CGM.getLangOpts().OMPHostIRFile.empty()) {
+  if (!CGM.getLangOpts().OMPHostIRFile.empty()) {
     OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTargetDebug,
                                 "__omp_rtl_debug_kind");
     OMPBuilder.createGlobalFlag(CGM.getLangOpts().OpenMPTeamSubscription,
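For context (a hedged sketch, not part of the diff): createGlobalFlag embeds each -fopenmp-target-* option as a named constant global in the device image, which the device runtime reads at startup. Conceptually, the emitted flag behaves roughly like:

    // Rough C++ analogue of the IR global produced for the debug flag.
    // The exact linkage and visibility come from OpenMPIRBuilder; this is
    // only an illustration of the name/value contract with the runtime.
    extern "C" [[gnu::weak]] const unsigned __omp_rtl_debug_kind = 0;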
@@ -290,11 +290,7 @@ void AMDGPUOpenMPToolChain::addClangTargetOptions(
     return;

   std::string BitcodeSuffix;
-  if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
-                         options::OPT_fno_openmp_target_new_runtime, true))
-    BitcodeSuffix = "new-amdgpu-" + GPUArch;
-  else
-    BitcodeSuffix = "amdgcn-" + GPUArch;
+  BitcodeSuffix = "amdgcn-" + GPUArch;

   addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
                      getTriple());
@@ -5936,13 +5936,6 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                      options::OPT_fno_openmp_cuda_mode, /*Default=*/false))
       CmdArgs.push_back("-fopenmp-cuda-mode");

-    // When in OpenMP offloading mode, enable or disable the new device
-    // runtime.
-    if (Args.hasFlag(options::OPT_fopenmp_target_new_runtime,
-                     options::OPT_fno_openmp_target_new_runtime,
-                     /*Default=*/true))
-      CmdArgs.push_back("-fopenmp-target-new-runtime");
-
     // When in OpenMP offloading mode, enable debugging on the device.
     Args.AddAllArgs(CmdArgs, options::OPT_fopenmp_target_debug_EQ);
     if (Args.hasFlag(options::OPT_fopenmp_target_debug,
@@ -8187,9 +8180,6 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
     StringRef Arch = TCArgs.getLastArgValue(options::OPT_march_EQ);

     std::string BitcodeSuffix;
-    if (TCArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
-                       options::OPT_fno_openmp_target_new_runtime, true))
-      BitcodeSuffix += "new-";
     if (TC->getTriple().isNVPTX())
       BitcodeSuffix += "nvptx-";
     else if (TC->getTriple().isAMDGPU())
@@ -749,11 +749,7 @@ void CudaToolChain::addClangTargetOptions(
     return;

   std::string BitcodeSuffix;
-  if (DriverArgs.hasFlag(options::OPT_fopenmp_target_new_runtime,
-                         options::OPT_fno_openmp_target_new_runtime, true))
-    BitcodeSuffix = "new-nvptx-" + GpuArch.str();
-  else
-    BitcodeSuffix = "nvptx-" + GpuArch.str();
+  BitcodeSuffix = "nvptx-" + GpuArch.str();

   addOpenMPDeviceRTL(getDriver(), DriverArgs, CC1Args, BitcodeSuffix,
                      getTriple());
@@ -3484,9 +3484,6 @@ void CompilerInvocation::GenerateLangArgs(const LangOptions &Opts,
     GenerateArg(Args, OPT_fopenmp_version_EQ, Twine(Opts.OpenMP), SA);
   }

-  if (Opts.OpenMPTargetNewRuntime)
-    GenerateArg(Args, OPT_fopenmp_target_new_runtime, SA);
-
   if (Opts.OpenMPThreadSubscription)
     GenerateArg(Args, OPT_fopenmp_assume_threads_oversubscription, SA);

@@ -3877,9 +3874,6 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
       Opts.OpenMP && Args.hasArg(options::OPT_fopenmp_enable_irbuilder);
   bool IsTargetSpecified =
       Opts.OpenMPIsDevice || Args.hasArg(options::OPT_fopenmp_targets_EQ);
-  Opts.OpenMPTargetNewRuntime =
-      Opts.OpenMPIsDevice &&
-      Args.hasArg(options::OPT_fopenmp_target_new_runtime);

   Opts.ConvergentFunctions = Opts.ConvergentFunctions || Opts.OpenMPIsDevice;

@@ -3927,17 +3921,13 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
   // Set either by a specific value or to a default if not specified.
   if (Opts.OpenMPIsDevice && (Args.hasArg(OPT_fopenmp_target_debug) ||
                               Args.hasArg(OPT_fopenmp_target_debug_EQ))) {
-    if (Opts.OpenMPTargetNewRuntime) {
-      Opts.OpenMPTargetDebug = getLastArgIntValue(
-          Args, OPT_fopenmp_target_debug_EQ, Opts.OpenMPTargetDebug, Diags);
-      if (!Opts.OpenMPTargetDebug && Args.hasArg(OPT_fopenmp_target_debug))
-        Opts.OpenMPTargetDebug = 1;
-    } else {
-      Diags.Report(diag::err_drv_debug_no_new_runtime);
-    }
+    Opts.OpenMPTargetDebug = getLastArgIntValue(
+        Args, OPT_fopenmp_target_debug_EQ, Opts.OpenMPTargetDebug, Diags);
+    if (!Opts.OpenMPTargetDebug && Args.hasArg(OPT_fopenmp_target_debug))
+      Opts.OpenMPTargetDebug = 1;
   }

-  if (Opts.OpenMPIsDevice && Opts.OpenMPTargetNewRuntime) {
+  if (Opts.OpenMPIsDevice) {
     if (Args.hasArg(OPT_fopenmp_assume_teams_oversubscription))
       Opts.OpenMPTeamSubscription = true;
     if (Args.hasArg(OPT_fopenmp_assume_threads_oversubscription))
@@ -1,6 +1,6 @@
 // REQUIRES: x86-registered-target
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib %s 2>&1 \
 // RUN: | FileCheck %s

 // verify the tools invocations
@@ -14,7 +14,7 @@
 // CHECK: clang{{.*}}"-cc1" "-triple" "x86_64-unknown-linux-gnu"{{.*}}"-o" "{{.*}}a-{{.*}}.o" "-x" "ir" "{{.*}}a-{{.*}}.bc"
 // CHECK: ld{{.*}}"-o" "a.out"{{.*}}"{{.*}}amdgpu-openmp-toolchain-{{.*}}.o" "{{.*}}a-{{.*}}.o" "-lomp" "-lomptarget"

-// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \
+// RUN: %clang -ccc-print-phases --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 %s 2>&1 \
 // RUN: | FileCheck --check-prefix=CHECK-PHASES %s
 // phases
 // CHECK-PHASES: 0: input, "{{.*}}amdgpu-openmp-toolchain.c", c, (host-openmp)
@@ -36,13 +36,13 @@
 // CHECK-PHASES: 16: linker, {4, 15}, image, (host-openmp)

 // handling of --libomptarget-amdgcn-bc-path
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIBOMPTARGET
 // CHECK-LIBOMPTARGET: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx803" "-fcuda-is-device" "-mlink-builtin-bitcode"{{.*}}Inputs/hip_dev_lib/libomptarget-amdgcn-gfx803.bc"{{.*}}

-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOGPULIB
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-NOGPULIB
 // CHECK-NOGPULIB-NOT: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-target-cpu" "gfx803" "-fcuda-is-device" "-mlink-builtin-bitcode"{{.*}}libomptarget-amdgcn-gfx803.bc"{{.*}}

-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-PRINT-BINDINGS
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -ccc-print-bindings -save-temps -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-PRINT-BINDINGS
 // CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[INPUT:.*]]"],
 // CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang",{{.*}} output: "[[HOST_BC:.*]]"
 // CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "clang", inputs: ["[[HOST_BC]]"], output: "[[HOST_S:.*]]"
@@ -56,13 +56,13 @@
 // CHECK-PRINT-BINDINGS: "x86_64-unknown-linux-gnu" - "GNU::Linker", inputs: ["[[HOST_O]]", "[[OFFLOAD_O]]"], output:

 // verify the llc is invoked for textual assembly output
-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
 // RUN: | FileCheck %s --check-prefix=CHECK-SAVE-ASM
 // CHECK-SAVE-ASM: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=asm" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906.s"
 // CHECK-SAVE-ASM: llc{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906-linked.bc" "-mtriple=amdgcn-amd-amdhsa" "-mcpu=gfx906" "-filetype=obj" "-o"{{.*}}amdgpu-openmp-toolchain-{{.*}}-gfx906.o"

 // check the handling of -c
-// RUN: %clang -ccc-print-bindings -c --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
+// RUN: %clang -ccc-print-bindings -c --target=x86_64-unknown-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx906 --libomptarget-amdgcn-bc-path=%S/Inputs/hip_dev_lib -save-temps %s 2>&1 \
 // RUN: | FileCheck %s --check-prefix=CHECK-C
 // CHECK-C: "x86_64-unknown-linux-gnu" - "clang",
 // CHECK-C: "x86_64-unknown-linux-gnu" - "clang",{{.*}}output: "[[HOST_BC:.*]]"
@@ -72,8 +72,8 @@
 // CHECK-C: "x86_64-unknown-linux-gnu" - "clang::as"
 // CHECK-C: "x86_64-unknown-linux-gnu" - "offload bundler"

-// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
+// RUN: %clang -### --target=x86_64-unknown-linux-gnu -emit-llvm -S -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-EMIT-LLVM-IR
 // CHECK-EMIT-LLVM-IR: clang{{.*}}"-cc1"{{.*}}"-triple" "amdgcn-amd-amdhsa"{{.*}}"-emit-llvm"

-// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -fno-openmp-target-new-runtime -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -lm --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIB-DEVICE
+// RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -Xopenmp-target=amdgcn-amd-amdhsa -march=gfx803 -lm --rocm-device-lib-path=%S/Inputs/rocm/amdgcn/bitcode %s 2>&1 | FileCheck %s --check-prefix=CHECK-LIB-DEVICE
 // CHECK-LIB-DEVICE: {{.*}}llvm-link{{.*}}ocml.bc"{{.*}}ockl.bc"{{.*}}oclc_daz_opt_on.bc"{{.*}}oclc_unsafe_math_off.bc"{{.*}}oclc_finite_only_off.bc"{{.*}}oclc_correctly_rounded_sqrt_on.bc"{{.*}}oclc_wavefrontsize64_on.bc"{{.*}}oclc_isa_version_803.bc"
@@ -155,43 +155,24 @@
 // RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
 // RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-nvptx-test.bc \
 // RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime -save-temps -no-canonical-prefixes %s 2>&1 \
+// RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
 // RUN: | FileCheck -check-prefix=CHK-BCLIB %s

 /// Specify the directory containing the bitcode lib, check clang picks the right one
 // RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
 // RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget \
 // RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fno-openmp-target-new-runtime -save-temps \
+// RUN: -fopenmp-relocatable-target -save-temps \
 // RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-BCLIB-DIR %s

-/// Check with the new runtime enabled
-// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime \
-// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget/libomptarget-new-nvptx-test.bc \
-// RUN: -save-temps -no-canonical-prefixes %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHK-BCLIB-NEW %s
-
-/// Check with new runtime and specifying the directory
-// RUN: %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
-// RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fopenmp-target-new-runtime \
-// RUN: --libomptarget-nvptx-bc-path=%S/Inputs/libomptarget -save-temps \
-// RUN: -no-canonical-prefixes %s 2>&1 \
-// RUN: | FileCheck -check-prefix=CHK-BCLIB-NEW-DIR %s
-
 /// Create a bogus bitcode library and find it with LIBRARY_PATH
 // RUN: env LIBRARY_PATH=%S/Inputs/libomptarget/subdir %clang -### -fopenmp=libomp -fopenmp-targets=nvptx64-nvidia-cuda \
 // RUN: -Xopenmp-target -march=sm_35 --cuda-path=%S/Inputs/CUDA_102/usr/local/cuda \
-// RUN: -fopenmp-relocatable-target -fno-openmp-target-new-runtime -save-temps \
+// RUN: -fopenmp-relocatable-target -save-temps \
 // RUN: -no-canonical-prefixes %s 2>&1 | FileCheck -check-prefix=CHK-ENV-BCLIB %s

 // CHK-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-nvptx-test.bc
 // CHK-BCLIB-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-nvptx-sm_35.bc
-// CHK-BCLIB-NEW: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget-new-nvptx-test.bc
-// CHK-BCLIB-NEW-DIR: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}libomptarget{{/|\\\\}}libomptarget-new-nvptx-sm_35.bc
 // CHK-ENV-BCLIB: clang{{.*}}-triple{{.*}}nvptx64-nvidia-cuda{{.*}}-mlink-builtin-bitcode{{.*}}subdir{{/|\\\\}}libomptarget-nvptx-sm_35.bc
 // CHK-BCLIB-NOT: {{error:|warning:}}
@@ -204,7 +185,7 @@
 // RUN: -fopenmp-relocatable-target -save-temps -no-canonical-prefixes %s 2>&1 \
 // RUN: | FileCheck -check-prefix=CHK-BCLIB-WARN %s

-// CHK-BCLIB-WARN: no library 'libomptarget-new-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH; use '--libomptarget-nvptx-bc-path' to specify nvptx bitcode library
+// CHK-BCLIB-WARN: no library 'libomptarget-nvptx-sm_35.bc' found in the default clang lib directory or in LIBRARY_PATH; use '--libomptarget-nvptx-bc-path' to specify nvptx bitcode library

 /// ###########################################################################

@ -1,12 +1,12 @@
|
|||
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --global-value-regex "__omp_rtl_"
|
||||
// Test target codegen - host bc file has to be created first.
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-target-debug -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-target-debug=111 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-EQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-new-runtime -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-debug -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-target-debug=111 -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-EQ
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-DEFAULT
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-threads-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-THREADS
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix=CHECK-TEAMS
|
||||
// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-assume-teams-oversubscription -fopenmp-is-device -o - | FileCheck %s --check-prefix=CHECK-RUNTIME
|
||||
// expected-no-diagnostics
|
||||
|
||||
#ifndef HEADER
|
||||
|
|
|
@@ -38,13 +38,11 @@ endif()
 # This is a list of all the targets that are supported/tested right now.
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa-newRTL")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} amdgcn-amd-amdhsa-newDriver")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda")
-set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-newRTL")
 set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-newDriver")

 # Once the plugins for the different targets are validated, they will be added to
@@ -81,7 +79,6 @@ set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${LIBOMP_LIBRARY_DIR}" CACHE STRING

 # Build offloading plugins and device RTLs if they are available.
 add_subdirectory(plugins)
-add_subdirectory(deviceRTLs)
 add_subdirectory(DeviceRTL)
 add_subdirectory(tools)

@@ -180,7 +180,7 @@ function(compileDeviceRTLLibrary target_cpu target_name)
   list(APPEND bc_files ${outfile})
 endforeach()

-set(bclib_name "libomptarget-new-${target_name}-${target_cpu}.bc")
+set(bclib_name "libomptarget-${target_name}-${target_cpu}.bc")

 # Link to a bitcode library.
 add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/linked_${bclib_name}
@@ -212,7 +212,7 @@ function(compileDeviceRTLLibrary target_cpu target_name)

 set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})

-set(bclib_target_name "omptarget-new-${target_name}-${target_cpu}-bc")
+set(bclib_target_name "omptarget-${target_name}-${target_cpu}-bc")

 add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})

@@ -1,14 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build a device RTL for each available machine.
-#
-##===----------------------------------------------------------------------===##
-
-add_subdirectory(amdgcn)
-add_subdirectory(nvptx)
@@ -1,193 +0,0 @@
-##===----------------------------------------------------------------------===##
-#
-# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-##===----------------------------------------------------------------------===##
-#
-# Build the AMDGCN Device RTL bitcode library using clang -ffreestanding
-#
-##===----------------------------------------------------------------------===##
-
-set(LIBOMPTARGET_BUILD_AMDGCN_BCLIB FALSE CACHE BOOL
-  "Can be set to true to enable building this library.")
-
-if (NOT LIBOMPTARGET_BUILD_AMDGCN_BCLIB)
-  libomptarget_say("Not building AMDGCN device RTL: Disabled by LIBOMPTARGET_BUILD_AMDGCN_BCLIB")
-  return()
-endif()
-
-if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
-  libomptarget_say("Not building AMDGCN device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
-  return()
-endif()
-
-
-# Copied from nvptx CMakeLists
-if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
-  set(aux_triple x86_64-unknown-linux-gnu)
-elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
-  set(aux_triple powerpc64le-unknown-linux-gnu)
-elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
-  set(aux_triple aarch64-unknown-linux-gnu)
-else()
-  libomptarget_say("Not building AMDGCN device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
-  return()
-endif()
-
-if (LLVM_DIR)
-  # Builds that use pre-installed LLVM have LLVM_DIR set.
-  find_program(CLANG_TOOL clang PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
-  find_program(LINK_TOOL llvm-link PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
-  find_program(OPT_TOOL opt PATHS ${LLVM_TOOLS_BINARY_DIR} NO_DEFAULT_PATH)
-  if ((NOT CLANG_TOOL) OR (NOT LINK_TOOL) OR (NOT OPT_TOOL))
-    libomptarget_say("Not building AMDGCN device RTL. Missing clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} or opt: ${OPT_TOOL}")
-    return()
-  else()
-    libomptarget_say("Building AMDGCN device RTL. Using clang: ${CLANG_TOOL}, llvm-link: ${LINK_TOOL} and opt: ${OPT_TOOL}")
-  endif()
-elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING AND NOT OPENMP_STANDALONE_BUILD)
-  # LLVM in-tree builds may use CMake target names to discover the tools.
-  set(CLANG_TOOL $<TARGET_FILE:clang>)
-  set(LINK_TOOL $<TARGET_FILE:llvm-link>)
-  set(OPT_TOOL $<TARGET_FILE:opt>)
-  libomptarget_say("Building AMDGCN device RTL. Using clang from in-tree build")
-else()
-  libomptarget_say("Not building AMDGCN device RTL. No appropriate clang found")
-  return()
-endif()
-
-project(omptarget-amdgcn)
-
-add_custom_target(omptarget-amdgcn ALL)
-
-#optimization level
-set(optimization_level 2)
-
-# Activate RTL message dumps if requested by the user.
-if(LIBOMPTARGET_NVPTX_DEBUG)
-  set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g)
-endif()
-
-get_filename_component(devicertl_base_directory
-  ${CMAKE_CURRENT_SOURCE_DIR}
-  DIRECTORY)
-
-set(cuda_sources
-  ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip
-  ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_locks.hip
-  ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip
-  ${devicertl_base_directory}/common/src/cancel.cu
-  ${devicertl_base_directory}/common/src/critical.cu
-  ${devicertl_base_directory}/common/src/data_sharing.cu
-  ${devicertl_base_directory}/common/src/libcall.cu
-  ${devicertl_base_directory}/common/src/loop.cu
-  ${devicertl_base_directory}/common/src/omp_data.cu
-  ${devicertl_base_directory}/common/src/omptarget.cu
-  ${devicertl_base_directory}/common/src/parallel.cu
-  ${devicertl_base_directory}/common/src/reduction.cu
-  ${devicertl_base_directory}/common/src/support.cu
-  ${devicertl_base_directory}/common/src/shuffle.cpp
-  ${devicertl_base_directory}/common/src/sync.cu
-  ${devicertl_base_directory}/common/src/task.cu)
-
-set(h_files
-  ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h
-  ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h
-  ${devicertl_base_directory}/common/debug.h
-  ${devicertl_base_directory}/common/omptarget.h
-  ${devicertl_base_directory}/common/omptargeti.h
-  ${devicertl_base_directory}/common/state-queue.h
-  ${devicertl_base_directory}/common/state-queuei.h
-  ${devicertl_base_directory}/common/support.h)
-
-# for both in-tree and out-of-tree build
-if (NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY)
-  set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR})
-else()
-  set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY})
-endif()
-
-# create gfx bitcode libraries
-set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900 gfx902 gfx906 gfx908 gfx90a gfx1010 gfx1030 gfx1031)
-if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST)
-  set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST})
-endif()
-
-# Prepend -I to each list element
-set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
-list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN PREPEND "-I")
-
-macro(add_cuda_bc_library)
-  set(cu_cmd ${CLANG_TOOL}
-    -xc++
-    -c
-    -mllvm -openmp-opt-disable
-    -std=c++14
-    -ffreestanding
-    -target amdgcn-amd-amdhsa
-    -emit-llvm
-    -Xclang -aux-triple -Xclang ${aux_triple}
-    -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
-    -D__AMDGCN__
-    -Xclang -target-cpu -Xclang ${mcpu}
-    -fvisibility=hidden
-    -Wno-unused-value
-    -nogpulib
-    -O${optimization_level}
-    ${CUDA_DEBUG}
-    -I${CMAKE_CURRENT_SOURCE_DIR}/src
-    -I${devicertl_base_directory}/common/include
-    -I${devicertl_base_directory}
-    -I${devicertl_base_directory}/../include
-    ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_AMDGCN})
-
-  set(bc1_files)
-
-  foreach(file ${ARGN})
-    get_filename_component(fname ${file} NAME_WE)
-    set(bc1_filename ${fname}.${mcpu}.bc)
-
-    add_custom_command(
-      OUTPUT ${bc1_filename}
-      COMMAND ${cu_cmd} ${file} -o ${bc1_filename}
-      DEPENDS ${file} ${h_files})
-
-    list(APPEND bc1_files ${bc1_filename})
-  endforeach()
-
-  add_custom_command(
-    OUTPUT linkout.cuda.${mcpu}.bc
-    COMMAND ${LINK_TOOL} ${bc1_files} -o linkout.cuda.${mcpu}.bc
-    DEPENDS ${bc1_files})
-
-  list(APPEND bc_files linkout.cuda.${mcpu}.bc)
-endmacro()
-
-set(libname "omptarget-amdgcn")
-
-set(toolchain_deps "")
-if(TARGET llvm-link)
-  list(APPEND toolchain_deps llvm-link)
-endif()
-if(TARGET opt)
-  list(APPEND toolchain_deps opt)
-endif()
-
-foreach(mcpu ${mcpus})
-  set(bc_files)
-  add_cuda_bc_library(${cuda_sources})
-
-  set(bc_libname lib${libname}-${mcpu}.bc)
-  add_custom_command(
-    OUTPUT ${bc_libname}
-    COMMAND ${LINK_TOOL} ${bc_files} | ${OPT_TOOL} --always-inline -o ${OUTPUTDIR}/${bc_libname}
-    DEPENDS ${bc_files} ${toolchain_deps})
-
-  add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libname})
-
-  install(FILES ${OUTPUTDIR}/${bc_libname}
-    DESTINATION "${OPENMP_INSTALL_LIBDIR}"
-  )
-endforeach()
@@ -1,19 +0,0 @@
-//===--- amdgcn_interface.h - OpenMP interface definitions ------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _AMDGCN_INTERFACE_H_
-#define _AMDGCN_INTERFACE_H_
-
-#include <stdint.h>
-
-#define EXTERN extern "C"
-typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
-
-EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads();
-
-#endif
@@ -1,34 +0,0 @@
-//===-- amdgcn_locks.hip - AMDGCN OpenMP GPU lock implementation -- HIP -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// A 'thread' maps onto a lane of the wavefront. This means a per-thread lock
-// cannot be implemented - if one thread gets the lock, it can't continue on to
-// the next instruction in order to do anything as the other threads are waiting
-// to take the lock.
-// These functions will be implemented to provide the documented semantics for
-// a SIMD => wavefront mapping once that is implemented.
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/debug.h"
-
-static void warn() {
-  PRINT0(LD_ALL, "Locks are not supported in this thread mapping model");
-}
-
-void __kmpc_impl_init_lock(omp_lock_t *) { warn(); }
-void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); }
-void __kmpc_impl_set_lock(omp_lock_t *) { warn(); }
-void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); }
-int __kmpc_impl_test_lock(omp_lock_t *lock) {
-  warn();
-  return 0;
-}
-
-#pragma omp end declare target
@@ -1,64 +0,0 @@
-//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "target_impl.h"
-
-// Partially derived fom hcc_detail/device_functions.h
-
-// HW_ID Register bit structure
-// WAVE_ID     3:0     Wave buffer slot number. 0-9.
-// SIMD_ID     5:4     SIMD which the wave is assigned to within the CU.
-// PIPE_ID     7:6     Pipeline from which the wave was dispatched.
-// CU_ID       11:8    Compute Unit the wave is assigned to.
-// SH_ID       12      Shader Array (within an SE) the wave is assigned to.
-// SE_ID       14:13   Shader Engine the wave is assigned to.
-// TG_ID       19:16   Thread-group ID
-// VM_ID       23:20   Virtual Memory ID
-// QUEUE_ID    26:24   Queue from which this wave was dispatched.
-// STATE_ID    29:27   State ID (graphics only, not compute).
-// ME_ID       31:30   Micro-engine ID.
-
-enum {
-  HW_ID = 4, // specify that the hardware register to read is HW_ID
-
-  HW_ID_CU_ID_SIZE = 4,   // size of CU_ID field in bits
-  HW_ID_CU_ID_OFFSET = 8, // offset of CU_ID from start of register
-
-  HW_ID_SE_ID_SIZE = 2,    // sizeof SE_ID field in bits
-  HW_ID_SE_ID_OFFSET = 13, // offset of SE_ID from start of register
-};
-
-// The s_getreg_b32 instruction, exposed as an intrinsic, takes a 16 bit
-// immediate and returns a 32 bit value.
-// The encoding of the immediate parameter is:
-// ID           5:0     Which register to read from
-// OFFSET       10:6    Range: 0..31
-// WIDTH        15:11   Range: 1..32
-
-// The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width)
-// where hwreg forms a 16 bit immediate encoded by the assembler thus:
-// uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) {
-//   return (Id << 0) | (Offset << 6) | ((Width - 1) << 11);
-// }
-#define ENCODE_HWREG(WIDTH, OFF, REG) (REG | (OFF << 6) | ((WIDTH - 1) << 11))
-
-// Note: The results can be changed by a context switch
-// Return value in [0 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper
-// bound on how many compute units are available. Some values in this
-// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs.
-
-EXTERN uint32_t __kmpc_impl_smid() {
-  uint32_t cu_id = __builtin_amdgcn_s_getreg(
-      ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID));
-  uint32_t se_id = __builtin_amdgcn_s_getreg(
-      ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID));
-  return (se_id << HW_ID_CU_ID_SIZE) + cu_id;
-}
-
-#pragma omp end declare target
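As a worked example of the ENCODE_HWREG packing in the file deleted above (a sketch based only on the bit layout its comments document; the constant names come from that file): reading the CU_ID field packs ID = HW_ID = 4 into bits 5:0, OFFSET = 8 into bits 10:6, and WIDTH - 1 = 3 into bits 15:11.

    #include <cstdint>
    // Same packing as the deleted macro: REG | (OFF << 6) | ((WIDTH - 1) << 11)
    constexpr uint32_t encode_hwreg(uint32_t width, uint32_t off, uint32_t reg) {
      return reg | (off << 6) | ((width - 1) << 11);
    }
    // CU_ID read: ID=4, OFFSET=8, WIDTH=4 -> 4 + 512 + 6144 = 0x1A04, the
    // 16-bit immediate handed to __builtin_amdgcn_s_getreg.
    static_assert(encode_hwreg(4, 8, 4) == 0x1A04, "CU_ID immediate");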
@@ -1,83 +0,0 @@
-//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Declarations and definitions of target specific functions and constants
-//
-//===----------------------------------------------------------------------===//
-#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H
-#define OMPTARGET_AMDGCN_TARGET_IMPL_H
-
-#ifndef __AMDGCN__
-#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__"
-#endif
-
-#include "amdgcn_interface.h"
-
-#include <stddef.h>
-#include <stdint.h>
-
-// subset of inttypes.h
-#define PRId64 "ld"
-#define PRIu64 "lu"
-
-typedef uint64_t __kmpc_impl_lanemask_t;
-
-#define INLINE inline
-#define NOINLINE __attribute__((noinline))
-#define ALIGN(N) __attribute__((aligned(N)))
-#define PLUGIN_ACCESSIBLE \
-  __attribute__((used))   /* Don't discard values the plugin reads */ \
-  __attribute__((weak))   /* We may have multiple definitions */ \
-  __attribute__((retain)) /* Also needed to keep values alive */ \
-  __attribute__((visibility("protected"))) /* Access via SHT_HASH */ \
-  __attribute__((section(".data")))        /* Not .bss, can write before load */
-
-#include "llvm/Frontend/OpenMP/OMPGridValues.h"
-
-INLINE constexpr const llvm::omp::GV &getGridValue() {
-  return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
-}
-
-////////////////////////////////////////////////////////////////////////////////
-// Kernel options
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-// The following def must match the absolute limit hardwired in the host RTL
-// max number of threads per team
-enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
-enum { WARPSIZE = getGridValue().GV_Warp_Size };
-
-// Maximum number of omp state objects per SM allocated statically in global
-// memory.
-#define OMP_STATE_COUNT 32
-#define MAX_SM 64
-
-#define OMP_ACTIVE_PARALLEL_LEVEL 128
-
-// Data sharing related quantities, need to match what is used in the compiler.
-enum DATA_SHARING_SIZES {
-  // The size reserved for data in a shared memory slot.
-  DS_Slot_Size = getGridValue().GV_Slot_Size,
-  // The slot size that should be reserved for a working warp.
-  DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
-  // The maximum number of warps in use
-  DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
-};
-
-enum : __kmpc_impl_lanemask_t {
-  __kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
-};
-
-// The return code of printf is not checked in the call sites in this library.
-// A call to a function named printf currently hits some special case handling
-// for opencl, which translates to calls that do not presently exist for openmp
-// Therefore, for now, stub out printf while building this library.
-#define printf(...)
-
-#endif
@@ -1,226 +0,0 @@
-//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Definitions of target specific functions
-//
-//===----------------------------------------------------------------------===//
-#pragma omp declare target
-
-#include "common/omptarget.h"
-#include "target_impl.h"
-#include "target_interface.h"
-
-// Implementations initially derived from hcc
-
-// Initialized with a 64-bit mask with bits set in positions less than the
-// thread's lane number in the warp
-EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
-  uint32_t lane = GetLaneId();
-  int64_t ballot = __kmpc_impl_activemask();
-  uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1;
-  return mask & ballot;
-}
-
-// Initialized with a 64-bit mask with bits set in positions greater than the
-// thread's lane number in the warp
-EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
-  uint32_t lane = GetLaneId();
-  if (lane == (WARPSIZE - 1))
-    return 0;
-  uint64_t ballot = __kmpc_impl_activemask();
-  uint64_t mask = (~((uint64_t)0)) << (lane + 1);
-  return mask & ballot;
-}
-
-EXTERN double __kmpc_impl_get_wtick() { return ((double)1E-9); }
-
-EXTERN double __kmpc_impl_get_wtime() {
-  // The intrinsics for measuring time have undocumented frequency
-  // This will probably need to be found by measurement on a number of
-  // architectures. Until then, return 0, which is very inaccurate as a
-  // timer but resolves the undefined symbol at link time.
-  return 0;
-}
-
-// Warp vote function
-EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
-  return __builtin_amdgcn_read_exec();
-}
-
-static void pteam_mem_barrier(uint32_t num_threads, uint32_t *barrier_state) {
-  __atomic_thread_fence(__ATOMIC_ACQUIRE);
-
-  uint32_t num_waves = (num_threads + WARPSIZE - 1) / WARPSIZE;
-
-  // Partial barrier implementation for amdgcn.
-  // Uses two 16 bit unsigned counters. One for the number of waves to have
-  // reached the barrier, and one to count how many times the barrier has been
-  // passed. These are packed in a single atomically accessed 32 bit integer.
-  // Low bits for the number of waves, assumed zero before this call.
-  // High bits to count the number of times the barrier has been passed.
-
-  // precondition: num_waves != 0;
-  // invariant: num_waves * WARPSIZE == num_threads;
-  // precondition: num_waves < 0xffffu;
-
-  // Increment the low 16 bits once, using the lowest active thread.
-  uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
-  bool isLowest = GetLaneId() == lowestActiveThread;
-
-  if (isLowest) {
-    uint32_t load = __atomic_fetch_add(barrier_state, 1,
-                                       __ATOMIC_RELAXED); // commutative
-
-    // Record the number of times the barrier has been passed
-    uint32_t generation = load & 0xffff0000u;
-
-    if ((load & 0x0000ffffu) == (num_waves - 1)) {
-      // Reached num_waves in low bits so this is the last wave.
-      // Set low bits to zero and increment high bits
-      load += 0x00010000u; // wrap is safe
-      load &= 0xffff0000u; // because bits zeroed second
-
-      // Reset the wave counter and release the waiting waves
-      __atomic_store_n(barrier_state, load, __ATOMIC_RELAXED);
-    } else {
-      // more waves still to go, spin until generation counter changes
-      do {
-        __builtin_amdgcn_s_sleep(0);
-        load = __atomic_load_n(barrier_state, __ATOMIC_RELAXED);
-      } while ((load & 0xffff0000u) == generation);
-    }
-  }
-  __atomic_thread_fence(__ATOMIC_RELEASE);
-}
-
-uint32_t __kmpc_L0_Barrier [[clang::loader_uninitialized]];
-#pragma omp allocate(__kmpc_L0_Barrier) allocator(omp_pteam_mem_alloc)
-
-EXTERN void __kmpc_impl_target_init() {
-  // Don't have global ctors, and shared memory is not zero init
-  __atomic_store_n(&__kmpc_L0_Barrier, 0u, __ATOMIC_RELEASE);
-}
-
-EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
-  pteam_mem_barrier(num_threads, &__kmpc_L0_Barrier);
-}
-
-namespace {
-uint32_t get_grid_dim(uint32_t n, uint16_t d) {
-  uint32_t q = n / d;
-  return q + (n > q * d);
-}
-uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
-                           uint16_t group_size) {
-  uint32_t r = grid_size - group_id * group_size;
-  return (r < group_size) ? r : group_size;
-}
-} // namespace
-
-EXTERN int __kmpc_get_hardware_num_blocks() {
-  return get_grid_dim(__builtin_amdgcn_grid_size_x(),
-                      __builtin_amdgcn_workgroup_size_x());
-}
-
-EXTERN int __kmpc_get_hardware_num_threads_in_block() {
-  return get_workgroup_dim(__builtin_amdgcn_workgroup_id_x(),
-                           __builtin_amdgcn_grid_size_x(),
-                           __builtin_amdgcn_workgroup_size_x());
-}
-
-EXTERN unsigned __kmpc_get_warp_size() {
-  return WARPSIZE;
-}
-
-EXTERN unsigned GetWarpId() {
-  return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE;
-}
-EXTERN unsigned GetLaneId() {
-  return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
-}
-
-EXTERN uint32_t __kmpc_amdgcn_gpu_num_threads() {
-  return __kmpc_get_hardware_num_threads_in_block();
-}
-
-// Atomics
-uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
-  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
-}
-uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
-  return __builtin_amdgcn_atomic_inc32(Address, Val, __ATOMIC_SEQ_CST, "");
-}
-uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
-  return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
-}
-
-uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
-  uint32_t R;
-  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
-  return R;
-}
-uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
-  (void)__atomic_compare_exchange(Address, &Compare, &Val, false,
-                                  __ATOMIC_SEQ_CST, __ATOMIC_RELAXED);
-  return Compare;
-}
-
-unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
-                                          unsigned long long Val) {
-  unsigned long long R;
-  __atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
-  return R;
-}
-unsigned long long __kmpc_atomic_add(unsigned long long *Address,
-                                     unsigned long long Val) {
-  return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
-}
-
-// Stub implementations
-// Weak to allow overriding by local versions while comparing different
-// potential implementations
-__attribute__((weak)) EXTERN void *__kmpc_impl_malloc(size_t) {
-  return nullptr;
-}
-__attribute__((weak)) EXTERN void __kmpc_impl_free(void *) {}
-
-EXTERN
-int32_t __llvm_omp_vprintf(const char *Format, void *Arguments, uint32_t) {
-  return -1;
-}
-
-EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
-  lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF));
-  hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32);
-}
-
-EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
-  return (((uint64_t)hi) << 32) | (uint64_t)lo;
-}
-
-EXTERN void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); }
-
-EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
-  // AMDGCN doesn't need to sync threads in a warp
-}
-
-EXTERN void __kmpc_impl_threadfence() {
-  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
-}
-
-EXTERN void __kmpc_impl_threadfence_block() {
-  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
-}
-
-EXTERN void __kmpc_impl_threadfence_system() {
-  __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
-}
-
-// Calls to the AMDGCN layer (assuming 1D layout)
-EXTERN int __kmpc_get_hardware_thread_id_in_block() {
-  return __builtin_amdgcn_workitem_id_x();
-}
-EXTERN int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); }
-
-#pragma omp end declare target
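The deleted pteam_mem_barrier above packs two 16-bit counters (waves arrived, times passed) into one 32-bit word. A minimal host-side analogue of that generation-counter scheme, using std::atomic instead of the GPU atomics (a sketch for illustration only, mirroring the device logic and its assumption that a new round does not begin before the release store):

    #include <atomic>
    #include <cstdint>
    // One word: low 16 bits = arrivals this round, high 16 bits = generation.
    void barrier_wait(std::atomic<uint32_t> &state, uint32_t participants) {
      uint32_t load = state.fetch_add(1, std::memory_order_acq_rel);
      uint32_t generation = load & 0xffff0000u;
      if ((load & 0x0000ffffu) == participants - 1) {
        // Last arrival: zero the arrival count and bump the generation,
        // releasing everyone spinning on the old generation value.
        load += 0x00010000u;
        load &= 0xffff0000u;
        state.store(load, std::memory_order_release);
      } else {
        // Spin until the generation counter changes.
        while ((state.load(std::memory_order_acquire) & 0xffff0000u) ==
               generation) {
        }
      }
    }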
@@ -1,44 +0,0 @@
-//===--------- allocator.h - OpenMP target memory allocator ------- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Macros for allocating variables in different address spaces.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_ALLOCATOR_H
-#define OMPTARGET_ALLOCATOR_H
-
-#if _OPENMP
-// Follows the pattern in interface.h
-// Clang sema checks this type carefully, needs to closely match that from omp.h
-typedef enum omp_allocator_handle_t {
-  omp_null_allocator = 0,
-  omp_default_mem_alloc = 1,
-  omp_large_cap_mem_alloc = 2,
-  omp_const_mem_alloc = 3,
-  omp_high_bw_mem_alloc = 4,
-  omp_low_lat_mem_alloc = 5,
-  omp_cgroup_mem_alloc = 6,
-  omp_pteam_mem_alloc = 7,
-  omp_thread_mem_alloc = 8,
-  KMP_ALLOCATOR_MAX_HANDLE = ~(0U)
-} omp_allocator_handle_t;
-
-#define __PRAGMA(STR) _Pragma(#STR)
-#define OMP_PRAGMA(STR) __PRAGMA(omp STR)
-
-#define SHARED(NAME) \
-  NAME [[clang::loader_uninitialized]]; \
-  OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
-
-#define EXTERN_SHARED(NAME) \
-  NAME; \
-  OMP_PRAGMA(allocate(NAME) allocator(omp_pteam_mem_alloc))
-#endif
-
-#endif // OMPTARGET_ALLOCATOR_H
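For reference, a usage sketch of the deleted SHARED macro (the variable name is illustrative, not from the source). Writing

    static uint32_t SHARED(Cnt);

expands to

    static uint32_t Cnt [[clang::loader_uninitialized]];
    #pragma omp allocate(Cnt) allocator(omp_pteam_mem_alloc)

which places Cnt in team-shared memory without a static initializer — the same pattern the deleted target_impl.hip spells out by hand for __kmpc_L0_Barrier.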
@@ -1,293 +0,0 @@
-//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains debug macros to be used in the application.
-//
-// Usage guide
-//
-//   PRINT0(flag, str)        : if debug flag is on, print (no arguments)
-//   PRINT(flag, str, args)   : if debug flag is on, print (arguments)
-//   DON(flag)                : return true if debug flag is on
-//
-//   ASSERT(flag, cond, str, args): if test flag is on, test the condition
-//                                  if the condition is false, print str+args
-//                                  and assert.
-//          CAUTION: cond may be evaluate twice
-//   AON(flag)                : return true if test flag is on
-//
-//   WARNING(flag, str, args) : if warning flag is on, print the warning
-//   WON(flag)                : return true if warning flag is on
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef _OMPTARGET_NVPTX_DEBUG_H_
-#define _OMPTARGET_NVPTX_DEBUG_H_
-
-#include "target_interface.h"
-
-////////////////////////////////////////////////////////////////////////////////
-// set desired level of debugging
-////////////////////////////////////////////////////////////////////////////////
-
-#define LD_SET_NONE 0ULL /* none */
-#define LD_SET_ALL -1ULL /* all */
-
-// pos 1
-#define LD_SET_LOOP 0x1ULL  /* basic loop */
-#define LD_SET_LOOPD 0x2ULL /* basic loop */
-#define LD_SET_PAR 0x4ULL   /* basic parallel */
-#define LD_SET_PARD 0x8ULL  /* basic parallel */
-
-// pos 2
-#define LD_SET_SYNC 0x10ULL  /* sync info */
-#define LD_SET_SYNCD 0x20ULL /* sync info */
-#define LD_SET_WAIT 0x40ULL  /* state when waiting */
-#define LD_SET_TASK 0x80ULL  /* print task info (high level) */
-
-// pos 3
-#define LD_SET_IO 0x100ULL     /* big region io (excl atomic) */
-#define LD_SET_IOD 0x200ULL    /* big region io (excl atomic) */
-#define LD_SET_ENV 0x400ULL    /* env info */
-#define LD_SET_CANCEL 0x800ULL /* print cancel info */
-
-// pos 4
-#define LD_SET_MEM 0x1000ULL /* malloc / free */
-
-////////////////////////////////////////////////////////////////////////////////
-// set the desired flags to print selected output.
-
-// these are some examples of possible definitions that can be used for
-// debugging.
-//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL)
-//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save
-// on cuda buffer
-//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO)
-//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV)
-//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR)
-
-#ifndef OMPTARGET_NVPTX_DEBUG
-#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE
-#elif OMPTARGET_NVPTX_DEBUG
-#warning debug is used, not good for measurements
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// set desired level of asserts
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-// available flags
-
-#define LT_SET_NONE 0x0 /* unsafe */
-#define LT_SET_SAFETY \
-  0x1 /* check malloc type of stuff, input at creation, cheap */
-#define LT_SET_INPUT 0x2 /* check also all runtime inputs */
-#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */
-
-////////////////////////////////////////////////////////////////////////////////
-// set the desired flags
-
-#ifndef OMPTARGET_NVPTX_TEST
-#if OMPTARGET_NVPTX_DEBUG
-#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY)
-#else
-#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY)
-#endif
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// set desired level of warnings
-////////////////////////////////////////////////////////////////////////////////
-
-////////////////////////////////////////////////////////////////////////////////
-// available flags
-
-#define LW_SET_ALL -1
-#define LW_SET_NONE 0x0
-#define LW_SET_ENV 0x1
-#define LW_SET_INPUT 0x2
-#define LW_SET_FUSSY 0x4
-
-////////////////////////////////////////////////////////////////////////////////
-// set the desired flags
-
-#if OMPTARGET_NVPTX_DEBUG
-#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE)
-#else
-#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY)
-#endif
-
-////////////////////////////////////////////////////////////////////////////////
-// implementation for debug
-////////////////////////////////////////////////////////////////////////////////
-
-#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING
-#include "common/support.h"
-
-template <typename... Arguments>
-NOINLINE static void log(const char *fmt, Arguments... parameters) {
-  printf(fmt, (int)GetBlockIdInKernel(),
-         (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
-         (int)GetLaneId(), parameters...);
-}
-
-#endif
-#if OMPTARGET_NVPTX_TEST
-
-template <typename... Arguments>
-NOINLINE static void check(bool cond, const char *fmt,
-                           Arguments... parameters) {
-  if (!cond) {
-    printf(fmt, (int)GetBlockIdInKernel(),
-           (int)__kmpc_get_hardware_thread_id_in_block(), (int)GetWarpId(),
-           (int)GetLaneId(), parameters...);
-    __builtin_trap();
-  }
-}
-
-NOINLINE static void check(bool cond) {
-  if (!cond)
-    __builtin_trap();
-}
-#endif
-
-// set flags that are tested (inclusion properties)
-
-#define LD_ALL (LD_SET_ALL)
-
-#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD)
-#define LD_LOOPD (LD_SET_LOOPD)
-#define LD_PAR (LD_SET_PAR | LD_SET_PARD)
-#define LD_PARD (LD_SET_PARD)
-
-// pos 2
-#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD)
-#define LD_SYNCD (LD_SET_SYNCD)
-#define LD_WAIT (LD_SET_WAIT)
-#define LD_TASK (LD_SET_TASK)
-
-// pos 3
-#define LD_IO (LD_SET_IO | LD_SET_IOD)
-#define LD_IOD (LD_SET_IOD)
-#define LD_ENV (LD_SET_ENV)
-#define LD_CANCEL (LD_SET_CANCEL)
-
-// pos 4
-#define LD_MEM (LD_SET_MEM)
-
-// implement
-#if OMPTARGET_NVPTX_DEBUG
-
-#define DON(_flag) ((unsigned)(OMPTARGET_NVPTX_DEBUG) & (_flag))
-
-#define PRINT0(_flag, _str) \
-  { \
-    if (omptarget_device_environment.debug_level && DON(_flag)) { \
-      log("<b %2d, t %4d, w %2d, l %2d>: " _str); \
-    } \
-  }
-
-#define PRINT(_flag, _str, _args...) \
-  { \
-    if (omptarget_device_environment.debug_level && DON(_flag)) { \
-      log("<b %2d, t %4d, w %2d, l %2d>: " _str, _args); \
-    } \
-  }
-#else
-
-#define DON(_flag) (0)
-#define PRINT0(flag, str)
-#define PRINT(flag, str, _args...)
-
-#endif
-
-// for printing without worrying about precision, pointers...
-#define P64(_x) ((unsigned long long)(_x))
-
-////////////////////////////////////////////////////////////////////////////////
-// early defs for test
-////////////////////////////////////////////////////////////////////////////////
-
-#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY)
-#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY)
-#define LT_FUSSY (LT_SET_FUSSY)
-
-#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY
-
-#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
-#define ASSERT0(_flag, _cond, _str) \
-  { \
-    if (TON(_flag)) { \
-      check(_cond); \
-    } \
-  }
-#define ASSERT(_flag, _cond, _str, _args...) \
-  { \
-    if (TON(_flag)) { \
-      check(_cond); \
-    } \
-  }
-
-#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT
-
-#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag))
-#define ASSERT0(_flag, _cond, _str) \
-  { \
-    if (TON(_flag)) { \
-      check((_cond), "<b %3d, t %4d, w %2d, l %2d> ASSERT: " _str "\n"); \
-    } \
-  }
-#define ASSERT(_flag, _cond, _str, _args...) \
-  { \
-    if (TON(_flag)) { \
-      check((_cond), "<b %3d, t %4d, w %2d, l %d2> ASSERT: " _str "\n", \
-            _args); \
-    } \
-  }
-
-#else
||||
#define TON(_flag) (0)
|
||||
#define ASSERT0(_flag, _cond, _str)
|
||||
#define ASSERT(_flag, _cond, _str, _args...)
|
||||
|
||||
#endif
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// early defs for warning
|
||||
|
||||
#define LW_ALL (LW_SET_ALL)
|
||||
#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV)
|
||||
#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT)
|
||||
#define LW_FUSSY (LW_SET_FUSSY)
|
||||
|
||||
#if OMPTARGET_NVPTX_WARNING
|
||||
|
||||
#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag))
|
||||
#define WARNING0(_flag, _str) \
|
||||
{ \
|
||||
if (WON(_flag)) { \
|
||||
log("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str); \
|
||||
} \
|
||||
}
|
||||
#define WARNING(_flag, _str, _args...) \
|
||||
{ \
|
||||
if (WON(_flag)) { \
|
||||
log("<b %2d, t %4d, w %2d, l %2d> WARNING: " _str, _args); \
|
||||
} \
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#define WON(_flag) (0)
|
||||
#define WARNING0(_flag, _str)
|
||||
#define WARNING(_flag, _str, _args...)
|
||||
|
||||
#endif
|
||||
|
||||
#endif
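For reference, the runtime sources call these macros in a uniform pattern; a minimal sketch, assuming the flag sets above and a debug-enabled build (the entry point name is hypothetical, the pattern mirrors e.g. cancel.cu later in this diff):

// Hypothetical illustration only: gated trace, assert, and warning calls.
EXTERN int32_t example_entry(kmp_Ident *loc, int32_t tid) {
  PRINT(LD_IO, "call example_entry(tid %d)\n", (int)tid); // gated on LD_IO
  ASSERT0(LT_FUSSY, tid >= 0, "expected non-negative thread id");
  WARNING0(LW_ENV, "example warning, gated on the warning flag set\n");
  return 0;
}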
@@ -1,405 +0,0 @@
case 0:
  ((void (*)(kmp_int32 *, kmp_int32 *))fn)(&global_tid, &bound_tid);
  break;
case 1:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *))fn)(
      &global_tid, &bound_tid,
      args[0]);
  break;
case 2:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1]);
  break;
case 3:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2]);
  break;
case 4:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3]);
  break;
case 5:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4]);
  break;
case 6:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5]);
  break;
case 7:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6]);
  break;
case 8:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7]);
  break;
case 9:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8]);
  break;
case 10:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9]);
  break;
case 11:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10]);
  break;
case 12:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11]);
  break;
case 13:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12]);
  break;
case 14:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13]);
  break;
case 15:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14]);
  break;
case 16:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15]);
  break;
case 17:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16]);
  break;
case 18:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17]);
  break;
case 19:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18]);
  break;
case 20:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19]);
  break;
case 21:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19], args[20]);
  break;
case 22:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19], args[20], args[21]);
  break;
case 23:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19], args[20], args[21], args[22]);
  break;
case 24:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19], args[20], args[21], args[22], args[23]);
  break;
case 25:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19], args[20], args[21], args[22], args[23],
      args[24]);
  break;
case 26:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19], args[20], args[21], args[22], args[23],
      args[24], args[25]);
  break;
case 27:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19], args[20], args[21], args[22], args[23],
      args[24], args[25], args[26]);
  break;
case 28:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19], args[20], args[21], args[22], args[23],
      args[24], args[25], args[26], args[27]);
  break;
case 29:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19], args[20], args[21], args[22], args[23],
      args[24], args[25], args[26], args[27], args[28]);
  break;
case 30:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19], args[20], args[21], args[22], args[23],
      args[24], args[25], args[26], args[27], args[28], args[29]);
  break;
case 31:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19], args[20], args[21], args[22], args[23],
      args[24], args[25], args[26], args[27], args[28], args[29], args[30]);
  break;
case 32:
  ((void (*)(kmp_int32 *, kmp_int32 *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *,
             void *, void *, void *, void *, void *, void *, void *, void *))fn)(
      &global_tid, &bound_tid,
      args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7],
      args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15],
      args[16], args[17], args[18], args[19], args[20], args[21], args[22], args[23],
      args[24], args[25], args[26], args[27], args[28], args[29], args[30], args[31]);
  break;
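The switch above (its `switch` head falls outside this hunk) dispatches an outlined parallel-region body by casting the opaque `fn` pointer to the exact arity and forwarding the packed argument pointers. A hypothetical caller sketch for a two-argument body, assuming the runtime's `kmp_int32` typedef; the names `outlined`, `a`, and `b` are illustrative, not from the diff:

// Hypothetical illustration of how the case-2 path above is exercised.
static void outlined(kmp_int32 *gtid, kmp_int32 *btid, void *pa, void *pb) {
  *(int *)pa += *(int *)pb; // body reads/writes the shared variables
}

void example_dispatch(kmp_int32 global_tid, kmp_int32 bound_tid) {
  int a = 1, b = 2;
  void *args[2] = {&a, &b};      // addresses of the two shared variables
  void (*fn)() = (void (*)())outlined; // stored type-erased, as the runtime does
  // An argument count of 2 selects case 2 above:
  ((void (*)(kmp_int32 *, kmp_int32 *, void *, void *))fn)(
      &global_tid, &bound_tid, args[0], args[1]);
}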
@@ -1,94 +0,0 @@
//===-- target.h ---------- OpenMP device runtime target implementation ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Target region interfaces are simple interfaces designed to allow middle-end
// (=LLVM) passes to analyze and transform the code. To achieve good performance
// it may be required to run the associated passes. However, implementations of
// this interface shall always provide a correct implementation as close to the
// user expected code as possible.
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H
#define LLVM_OPENMP_LIBOMPTARGET_DEVICERTLS_COMMON_TARGET_H

#include <stdint.h>

extern "C" {

/// Forward declaration of the source location identifier "ident".
typedef struct ident ident_t;

/// The target region _kernel_ interface for GPUs
///
/// This deliberately simple interface provides the middle-end (=LLVM) with
/// easier means to reason about the semantic of the code and transform it as
/// well. The runtime calls are therefore also designed to carry sufficient
/// information necessary for optimizations.
///
///
/// Intended usage:
///
/// \code
/// void kernel(...) {
///   ThreadKind = __kmpc_target_init(Ident, /* Mode */ 1,
///                                   /* UseGenericStateMachine */ true,
///                                   /* RequiresFullRuntime */ ... );
///   if (ThreadKind == -1) {
///     // User defined kernel code.
///   }
///   __kmpc_target_deinit(...);
/// }
/// \endcode
///
/// Which can be transformed to:
///
/// \code
/// void kernel(...) {
///   ThreadKind = __kmpc_target_init(Ident, /* Mode */ 1,
///                                   /* UseGenericStateMachine */ false,
///                                   /* RequiresFullRuntime */ ... );
///   if (ThreadKind == -1) {
///     // User defined kernel code.
///   } else {
///     assume(ThreadKind == ThreadId);
///     // Custom, kernel-specific state machine code.
///   }
///   __kmpc_target_deinit(...);
/// }
/// \endcode
///
///
///{

/// Initialization
///
/// Must be called by all threads.
///
/// \param Ident Source location identification, can be NULL.
///
int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
                           bool UseGenericStateMachine,
                           bool RequiresFullRuntime);

/// De-Initialization
///
/// Must be called by the main thread in generic mode, can be called by all
/// threads. Must be called by all threads in SPMD mode.
///
/// In non-SPMD, this function releases the workers trapped in a state machine
/// and also any memory dynamically allocated by the runtime.
///
/// \param Ident Source location identification, can be NULL.
///
void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
                          bool RequiresFullRuntime);

///}
}
#endif
@@ -1,102 +0,0 @@
//===- shuffle.h - OpenMP variants of the shuffle idiom for all targets -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Shuffle function implementations for all supported targets.
//
// Note: We unify the mask type to uint64_t instead of __kmpc_impl_lanemask_t.
//
//===----------------------------------------------------------------------===//

#ifndef LIBOMPTARGET_DEVICERTL_SHUFFLE_H
#define LIBOMPTARGET_DEVICERTL_SHUFFLE_H

#include <stdint.h>

#pragma omp declare target

/// External shuffle API
///
///{

extern "C" {
int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);
}

///}

/// Forward declarations
///
///{
extern "C" {
unsigned GetLaneId();
unsigned __kmpc_get_warp_size();
void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
}
///}

/// Fallback implementations of the shuffle sync idiom.
/// Unavailable at present (would error at link time if used).
///
///{

int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var, int32_t SrcLane);

int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var, uint32_t Delta,
                                   int32_t Width);

///}

/// AMDGCN implementations of the shuffle sync idiom.
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
                                     int32_t SrcLane) {
  int Width = __kmpc_get_warp_size();
  int Self = GetLaneId();
  int Index = SrcLane + (Self & ~(Width - 1));
  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}

inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
                                          uint32_t LaneDelta, int32_t Width) {
  int Self = GetLaneId();
  int Index = Self + LaneDelta;
  Index = (int)(LaneDelta + (Self & (Width - 1))) >= Width ? Self : Index;
  return __builtin_amdgcn_ds_bpermute(Index << 2, Var);
}

#pragma omp end declare variant
///}

/// NVPTX implementations of the shuffle and shuffle sync idiom.
///
///{
#pragma omp begin declare variant match(                                       \
    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})

inline int32_t __kmpc_impl_shfl_sync(uint64_t Mask, int32_t Var,
                                     int32_t SrcLane) {
  return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
}

inline int32_t __kmpc_impl_shfl_down_sync(uint64_t Mask, int32_t Var,
                                          uint32_t Delta, int32_t Width) {
  int32_t T = ((__kmpc_get_warp_size() - Width) << 8) | 0x1f;
  return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
}

#pragma omp end declare variant
///}

#pragma omp end declare target

#endif
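A typical consumer of this header is a warp-level tree reduction: each shuffle step folds the upper half of live lanes into the lower half until lane 0 holds the total. A minimal sketch on top of the declarations above; the all-lanes mask and the helper name are illustrative assumptions, not part of the header:

// Hypothetical warp-sum built on __kmpc_impl_shfl_down_sync.
inline int32_t warp_reduce_sum(int32_t Val) {
  uint64_t Mask = ~0ULL; // assume all lanes participate (illustrative)
  int32_t Width = __kmpc_get_warp_size();
  for (int32_t Offset = Width / 2; Offset > 0; Offset /= 2)
    Val += __kmpc_impl_shfl_down_sync(Mask, Val, Offset, Width);
  return Val; // the full sum is valid in lane 0
}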
@@ -1,282 +0,0 @@
//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//

#ifndef OMPTARGET_H
#define OMPTARGET_H

#include "common/allocator.h"
#include "common/debug.h" // debug
#include "common/state-queue.h"
#include "common/support.h"
#include "interface.h" // interfaces with omp, compiler, and user
#include "target_impl.h"

#define OMPTARGET_NVPTX_VERSION 1.1

// used by the library for the interface with the app
#define DISPATCH_FINISHED 0
#define DISPATCH_NOTFINISHED 1

// used by dynamic scheduling
#define FINISHED 0
#define NOT_FINISHED 1
#define LAST_CHUNK 2

#define BARRIER_COUNTER 0
#define ORDERED_COUNTER 1

// Worker slot type which is initialized with the default worker slot
// size of 4*32 bytes.
struct __kmpc_data_sharing_slot {
  __kmpc_data_sharing_slot *Next;
  __kmpc_data_sharing_slot *Prev;
  void *PrevSlotStackPtr;
  void *DataEnd;
  char Data[DS_Worker_Warp_Slot_Size];
};

////////////////////////////////////////////////////////////////////////////////
// task ICV and (implicit & explicit) task state

class omptarget_nvptx_TaskDescr {
public:
  // methods for flags
  INLINE omp_sched_t GetRuntimeSched() const;
  INLINE void SetRuntimeSched(omp_sched_t sched);
  INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; }
  INLINE int InL2OrHigherParallelRegion() const {
    return items.flags & TaskDescr_InParL2P;
  }
  INLINE int IsParallelConstruct() const {
    return items.flags & TaskDescr_IsParConstr;
  }
  INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
  // methods for other fields
  INLINE uint16_t &ThreadId() { return items.threadId; }
  INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
  INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
  INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
    prev = taskDescr;
  }
  // init & copy
  INLINE void InitLevelZeroTaskDescr();
  INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
  INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
  INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
  INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
  INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
                                   uint16_t tid, uint16_t tnum);
  INLINE void SaveLoopData();
  INLINE void RestoreLoopData() const;

private:
  // bits for flags: (6 used, 2 free)
  //   3 bits (SchedMask) for runtime schedule
  //   1 bit (InPar) if this thread has encountered one or more parallel regions
  //   1 bit (IsParConstr) if ICV for a parallel region (false = explicit task)
  //   1 bit (InParL2+) if this thread has encountered an L2 or higher parallel
  //     region
  static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4);
  static const uint8_t TaskDescr_InPar = 0x10;
  static const uint8_t TaskDescr_IsParConstr = 0x20;
  static const uint8_t TaskDescr_InParL2P = 0x40;

  struct SavedLoopDescr_items {
    int64_t loopUpperBound;
    int64_t nextLowerBound;
    int64_t chunk;
    int64_t stride;
    kmp_sched_t schedule;
  } loopData;

  struct TaskDescr_items {
    uint8_t flags; // 6 bit used (see flag above)
    uint8_t unused;
    uint16_t threadId;         // thread id
    uint64_t runtimeChunkSize; // runtime chunk size
  } items;
  omptarget_nvptx_TaskDescr *prev;
};

// build on kmp
typedef struct omptarget_nvptx_ExplicitTaskDescr {
  omptarget_nvptx_TaskDescr
      taskDescr; // omptarget_nvptx task description (must be first)
  kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last)
} omptarget_nvptx_ExplicitTaskDescr;

////////////////////////////////////////////////////////////////////////////////
// Descriptor of a parallel region (worksharing in general)

class omptarget_nvptx_WorkDescr {

public:
  // access to data
  INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; }

private:
  omptarget_nvptx_TaskDescr masterTaskICV;
};

////////////////////////////////////////////////////////////////////////////////

class omptarget_nvptx_TeamDescr {
public:
  // access to data
  INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
    return &levelZeroTaskDescr;
  }
  INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
    return workDescrForActiveParallel;
  }

  // init
  INLINE void InitTeamDescr();

  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
    worker_rootS[wid].DataEnd =
        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
    // We currently do not have a next slot.
    worker_rootS[wid].Next = 0;
    worker_rootS[wid].Prev = 0;
    worker_rootS[wid].PrevSlotStackPtr = 0;
    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
  }

private:
  omptarget_nvptx_TaskDescr
      levelZeroTaskDescr; // icv for team master initial thread
  omptarget_nvptx_WorkDescr
      workDescrForActiveParallel; // one, ONLY for the active par

  ALIGN(16)
  __kmpc_data_sharing_slot worker_rootS[DS_Max_Warp_Number];
};

////////////////////////////////////////////////////////////////////////////////
// thread private data (struct of arrays for better coalescing)
// tid refers here to the global thread id
// we do not support multiple concurrent kernels at this time
class omptarget_nvptx_ThreadPrivateContext {
public:
  // task
  INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) {
    return &levelOneTaskDescr[tid];
  }
  INLINE void SetTopLevelTaskDescr(int tid,
                                   omptarget_nvptx_TaskDescr *taskICV) {
    topTaskDescr[tid] = taskICV;
  }
  INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const;
  // schedule (for dispatch)
  INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; }
  INLINE int64_t &Chunk(int tid) { return chunk[tid]; }
  INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; }
  INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; }
  INLINE int64_t &Stride(int tid) { return stride[tid]; }

  INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; }

  INLINE void InitThreadPrivateContext(int tid);
  INLINE uint64_t &Cnt() { return cnt; }

private:
  // team context for this team
  omptarget_nvptx_TeamDescr teamContext;
  // task ICV for implicit threads in the only parallel region
  omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM];
  // pointer where to find the current task ICV (top of the stack)
  omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM];
  // schedule (for dispatch)
  kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for
  int64_t chunk[MAX_THREADS_PER_TEAM];
  int64_t loopUpperBound[MAX_THREADS_PER_TEAM];
  // state for dispatch with dyn/guided OR static (never use both at a time)
  int64_t nextLowerBound[MAX_THREADS_PER_TEAM];
  int64_t stride[MAX_THREADS_PER_TEAM];
  uint64_t cnt;
};

/// Memory manager for statically allocated memory.
class omptarget_nvptx_SimpleMemoryManager {
private:
  struct MemDataTy {
    volatile unsigned keys[OMP_STATE_COUNT];
  } MemData[MAX_SM] ALIGN(128);

  INLINE static uint32_t hash(unsigned key) {
    return key & (OMP_STATE_COUNT - 1);
  }

public:
  INLINE void Release();
  INLINE const void *Acquire(const void *buf, size_t size);
};

////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// global data tables
////////////////////////////////////////////////////////////////////////////////

extern omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
extern uint32_t EXTERN_SHARED(usedMemIdx);
extern uint32_t EXTERN_SHARED(usedSlotIdx);
#if _OPENMP
extern uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
#else
extern uint8_t EXTERN_SHARED(parallelLevel)[MAX_THREADS_PER_TEAM / WARPSIZE];
#endif
extern uint16_t EXTERN_SHARED(threadLimit);
extern uint16_t EXTERN_SHARED(threadsInTeam);
extern uint16_t EXTERN_SHARED(nThreads);
extern omptarget_nvptx_ThreadPrivateContext *
    EXTERN_SHARED(omptarget_nvptx_threadPrivateContext);

extern int8_t EXTERN_SHARED(execution_param);
extern void *EXTERN_SHARED(ReductionScratchpadPtr);

////////////////////////////////////////////////////////////////////////////////
// work function (outlined parallel/simd functions) and arguments.
// needed for L1 parallelism only.
////////////////////////////////////////////////////////////////////////////////

typedef void *omptarget_nvptx_WorkFn;
extern omptarget_nvptx_WorkFn EXTERN_SHARED(omptarget_nvptx_workFn);

////////////////////////////////////////////////////////////////////////////////
// get private data structures
////////////////////////////////////////////////////////////////////////////////

INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor();
INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor();
INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode);
INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId);

////////////////////////////////////////////////////////////////////////////////
// inlined implementation
////////////////////////////////////////////////////////////////////////////////

INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __builtin_ffs(x); }
INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __builtin_popcount(x); }
INLINE uint32_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); }
INLINE uint32_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); }

#include "common/omptargeti.h"

#endif
@@ -1,223 +0,0 @@
//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the declarations of all library macros, types,
// and functions.
//
//===----------------------------------------------------------------------===//

////////////////////////////////////////////////////////////////////////////////
// Task Descriptor
////////////////////////////////////////////////////////////////////////////////

INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const {
  // sched starts from 1..4; encode it as 0..3; so add 1 here
  uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1;
  return (omp_sched_t)rc;
}

INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
  // sched starts from 1..4; encode it as 0..3; so sub 1 here
  uint8_t val = ((uint8_t)sched) - 1;
  // clear current sched
  items.flags &= ~TaskDescr_SchedMask;
  // set new sched
  items.flags |= val;
}

INLINE void omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
  // slow method
  // flag:
  //   default sched is static,
  //   dyn is off (unused now anyway, but may need to sample from host ?)
  //   not in parallel

  items.flags = 0;
  items.threadId = 0;         // is master
  items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
}

// This is called when all threads are started together in SPMD mode.
// OMP directives include target parallel, target distribute parallel for, etc.
INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  // slow method
  // flag:
  //   default sched is static,
  //   dyn is off (unused now anyway, but may need to sample from host ?)
  //   in L1 parallel

  items.flags = TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
  items.threadId =
      __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
                                                // called for 1st level)
  items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
  prev = parentTaskDescr;
}

INLINE void omptarget_nvptx_TaskDescr::CopyData(
    omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  items = sourceTaskDescr->items;
}

INLINE void
omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) {
  CopyData(sourceTaskDescr);
  prev = sourceTaskDescr->prev;
}

INLINE void omptarget_nvptx_TaskDescr::CopyParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  CopyData(parentTaskDescr);
  prev = parentTaskDescr;
}

INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
    omptarget_nvptx_TaskDescr *parentTaskDescr) {
  CopyParent(parentTaskDescr);
  items.flags = items.flags & ~TaskDescr_IsParConstr;
  ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task");
}

INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
    omptarget_nvptx_TaskDescr *masterTaskDescr) {
  CopyParent(masterTaskDescr);
  // overwrite specific items;
  items.flags |=
      TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
}

INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
    omptarget_nvptx_TaskDescr *workTaskDescr) {
  Copy(workTaskDescr);
  //
  // overwrite specific items;
  //
  // The threadID should be __kmpc_get_hardware_thread_id_in_block() %
  // GetMasterThreadID(). This is so that the serial master (first lane in the
  // master warp) gets a threadId of 0. However, we know that this function is
  // always called in a parallel region where only workers are active. The
  // serial master thread never enters this region. When a parallel region is
  // executed serially, the threadId is set to 0 elsewhere and the
  // kmpc_serialized_* functions are called, which never activate this region.
  items.threadId =
      __kmpc_get_hardware_thread_id_in_block(); // get ids from cuda (only
                                                // called for 1st level)
}

INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
    omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
  CopyParent(parentTaskDescr);
  items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
  items.threadId = tid;
}

INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() {
  loopData.loopUpperBound =
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId);
  loopData.nextLowerBound =
      omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId);
  loopData.schedule =
      omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId);
  loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId);
  loopData.stride =
      omptarget_nvptx_threadPrivateContext->Stride(items.threadId);
}

INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const {
  omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk;
  omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) =
      loopData.loopUpperBound;
  omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) =
      loopData.nextLowerBound;
  omptarget_nvptx_threadPrivateContext->Stride(items.threadId) =
      loopData.stride;
  omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) =
      loopData.schedule;
}

////////////////////////////////////////////////////////////////////////////////
// Thread Private Context
////////////////////////////////////////////////////////////////////////////////

INLINE omptarget_nvptx_TaskDescr *
omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const {
  ASSERT0(
      LT_FUSSY, tid < MAX_THREADS_PER_TEAM,
      "Getting top level, tid is larger than allocated data structure size");
  return topTaskDescr[tid];
}

INLINE void
omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
  // levelOneTaskDescr is init when starting the parallel region
  // top task descr is NULL (team master version will be fixed separately)
  topTaskDescr[tid] = NULL;
  // the following don't need to be init here; they are init when using dyn
  // sched
  // current_Event, events_Number, chunk, num_Iterations, schedule
}

////////////////////////////////////////////////////////////////////////////////
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////

INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
  levelZeroTaskDescr.InitLevelZeroTaskDescr();
}

////////////////////////////////////////////////////////////////////////////////
// Get private data structure for thread
////////////////////////////////////////////////////////////////////////////////

// Utility routines for CUDA threads
INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() {
  return omptarget_nvptx_threadPrivateContext->TeamContext();
}

INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() {
  omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
  return currTeamDescr.WorkDescr();
}

INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) {
  return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
}

INLINE omptarget_nvptx_TaskDescr *
getMyTopTaskDescriptor(bool isSPMDExecutionMode) {
  return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock());
}

////////////////////////////////////////////////////////////////////////////////
// Memory management runtime functions.
////////////////////////////////////////////////////////////////////////////////

INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
          "SlotIdx is too big or uninitialized.");
  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
          "MemIdx is too big or uninitialized.");
  MemDataTy &MD = MemData[usedSlotIdx];
  __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u);
}

INLINE const void *
omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf, size_t size) {
  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
          "SlotIdx is too big or uninitialized.");
  const unsigned sm = usedSlotIdx;
  MemDataTy &MD = MemData[sm];
  unsigned i = hash(GetBlockIdInKernel());
  while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) {
    i = hash(i + 1);
  }
  usedSlotIdx = sm;
  usedMemIdx = i;
  return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
}
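The schedule accessors above pack the OpenMP schedule kinds 1..4 into the low three flag bits as 0..3. A round-trip sketch, assuming `omp_sched_static == 1` as defined by the OpenMP API (the function wrapper is purely illustrative):

// Hypothetical round trip through the 3-bit schedule field.
inline void sched_round_trip_example() {
  omptarget_nvptx_TaskDescr td;
  td.InitLevelZeroTaskDescr();          // flags = 0, threadId = 0, chunk = 1
  td.SetRuntimeSched(omp_sched_static); // stores (1 - 1) = 0 in SchedMask bits
  omp_sched_t s = td.GetRuntimeSched(); // reads 0, returns 0 + 1 == omp_sched_static
  (void)s;
}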
@@ -1,31 +0,0 @@
//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Interface to be used in the implementation of OpenMP cancel.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/debug.h"
#include "interface.h"

EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
                                        int32_t cancelVal) {
  PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal);
  // disabled
  return 0;
}

EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
                             int32_t cancelVal) {
  PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal);
  // disabled
  return 0;
}

#pragma omp end declare target
@@ -1,31 +0,0 @@
//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of critical with KMPC interface
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/debug.h"
#include "interface.h"

EXTERN
void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
                     kmp_CriticalName *lck) {
  PRINT0(LD_IO, "call to kmpc_critical()\n");
  omp_set_lock((omp_lock_t *)lck);
}

EXTERN
void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
                         kmp_CriticalName *lck) {
  PRINT0(LD_IO, "call to kmpc_end_critical()\n");
  omp_unset_lock((omp_lock_t *)lck);
}

#pragma omp end declare target
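For context, the compiler lowers a `critical` construct to a paired call into these two entry points around the protected body; a sketch of the generated shape (the lock variable and function name are illustrative, not taken from the diff):

// Roughly the shape emitted for: #pragma omp critical
static kmp_CriticalName crit_lock; // zero-initialized lock storage
void lowered_critical_region(kmp_Ident *loc, int32_t gtid) {
  __kmpc_critical(loc, gtid, &crit_lock);   // acquires the underlying omp lock
  // ... body of the critical region ...
  __kmpc_end_critical(loc, gtid, &crit_lock); // releases it
}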
@@ -1,194 +0,0 @@
//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This file contains the implementation of data sharing environments
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#pragma omp declare target
|
||||
|
||||
#include "common/omptarget.h"
|
||||
#include "target/shuffle.h"
|
||||
#include "target_impl.h"
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Runtime functions for trunk data sharing scheme.
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
static constexpr unsigned MinBytes = 8;
|
||||
|
||||
static constexpr unsigned Alignment = 8;
|
||||
|
||||
/// External symbol to access dynamic shared memory.
|
||||
extern unsigned char DynamicSharedBuffer[] __attribute__((aligned(Alignment)));
|
||||
#pragma omp allocate(DynamicSharedBuffer) allocator(omp_pteam_mem_alloc)
|
||||
|
||||
EXTERN void *__kmpc_get_dynamic_shared() { return DynamicSharedBuffer; }
|
||||
|
||||
EXTERN void *llvm_omp_get_dynamic_shared() {
|
||||
return __kmpc_get_dynamic_shared();
|
||||
}
|
||||
|
||||
template <unsigned BPerThread, unsigned NThreads = MAX_THREADS_PER_TEAM>
|
||||
struct alignas(32) ThreadStackTy {
|
||||
static constexpr unsigned BytesPerThread = BPerThread;
|
||||
static constexpr unsigned NumThreads = NThreads;
|
||||
static constexpr unsigned NumWarps = (NThreads + WARPSIZE - 1) / WARPSIZE;
|
||||
|
||||
unsigned char Data[NumThreads][BytesPerThread];
|
||||
unsigned char Usage[NumThreads];
|
||||
};
|
||||
|
||||
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes * 8, 1> MainSharedStack;
|
||||
#pragma omp allocate(MainSharedStack) allocator(omp_pteam_mem_alloc)
|
||||
|
||||
[[clang::loader_uninitialized]] ThreadStackTy<MinBytes,
|
||||
MAX_THREADS_PER_TEAM / 4>
|
||||
WorkerSharedStack;
|
||||
#pragma omp allocate(WorkerSharedStack) allocator(omp_pteam_mem_alloc)
|
||||
|
||||
EXTERN void *__kmpc_alloc_shared(size_t Bytes) {
|
||||
size_t AlignedBytes = Bytes + (Bytes % MinBytes);
|
||||
int TID = __kmpc_get_hardware_thread_id_in_block();
|
||||
if (__kmpc_is_generic_main_thread(TID)) {
|
||||
// Main thread alone, use shared memory if space is available.
|
||||
if (MainSharedStack.Usage[0] + AlignedBytes <= MainSharedStack.BytesPerThread) {
|
||||
void *Ptr = &MainSharedStack.Data[0][MainSharedStack.Usage[0]];
|
||||
MainSharedStack.Usage[0] += AlignedBytes;
|
||||
return Ptr;
|
||||
}
|
||||
} else if (TID < WorkerSharedStack.NumThreads) {
|
||||
if (WorkerSharedStack.Usage[TID] + AlignedBytes <= WorkerSharedStack.BytesPerThread) {
|
||||
void *Ptr = &WorkerSharedStack.Data[TID][WorkerSharedStack.Usage[TID]];
|
||||
WorkerSharedStack.Usage[TID] += AlignedBytes;
|
||||
return Ptr;
|
||||
}
|
||||
}
|
||||
// Fallback to malloc
|
||||
return SafeMalloc(Bytes, "AllocGlobalFallback");
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes) {
|
||||
size_t AlignedBytes = Bytes + (Bytes % MinBytes);
|
||||
int TID = __kmpc_get_hardware_thread_id_in_block();
|
||||
if (__kmpc_is_generic_main_thread(TID)) {
|
||||
if (Ptr >= &MainSharedStack.Data[0][0] &&
|
||||
Ptr < &MainSharedStack.Data[MainSharedStack.NumThreads][0]) {
|
||||
MainSharedStack.Usage[0] -= AlignedBytes;
|
||||
return;
|
||||
}
|
||||
} else if (TID < WorkerSharedStack.NumThreads) {
|
||||
if (Ptr >= &WorkerSharedStack.Data[0][0] &&
|
||||
Ptr < &WorkerSharedStack.Data[WorkerSharedStack.NumThreads][0]) {
|
||||
int TID = __kmpc_get_hardware_thread_id_in_block();
|
||||
WorkerSharedStack.Usage[TID] -= AlignedBytes;
|
||||
return;
|
||||
}
|
||||
}
|
||||
SafeFree(Ptr, "FreeGlobalFallback");
|
||||
}
|
||||
EXTERN void __kmpc_data_sharing_init_stack() {
  for (unsigned i = 0; i < MainSharedStack.NumWarps; ++i)
    MainSharedStack.Usage[i] = 0;
  for (unsigned i = 0; i < WorkerSharedStack.NumThreads; ++i)
    WorkerSharedStack.Usage[i] = 0;
}

/// Allocate storage in shared memory to communicate arguments from the main
/// thread to the workers in generic mode. If we exceed
/// NUM_SHARED_VARIABLES_IN_SHARED_MEM we will malloc space for communication.
#define NUM_SHARED_VARIABLES_IN_SHARED_MEM 64

[[clang::loader_uninitialized]] static void
    *SharedMemVariableSharingSpace[NUM_SHARED_VARIABLES_IN_SHARED_MEM];
#pragma omp allocate(SharedMemVariableSharingSpace)                            \
    allocator(omp_pteam_mem_alloc)
[[clang::loader_uninitialized]] static void **SharedMemVariableSharingSpacePtr;
#pragma omp allocate(SharedMemVariableSharingSpacePtr)                         \
    allocator(omp_pteam_mem_alloc)

// Begin a data sharing context. Maintain a list of references to shared
// variables. This list of references to shared variables will be passed
// to one or more threads.
// In L0 data sharing this is called by the master thread.
// In L1 data sharing this is called by the active warp's master thread.
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) {
  if (nArgs <= NUM_SHARED_VARIABLES_IN_SHARED_MEM) {
    SharedMemVariableSharingSpacePtr = &SharedMemVariableSharingSpace[0];
  } else {
    SharedMemVariableSharingSpacePtr =
        (void **)SafeMalloc(nArgs * sizeof(void *), "new extended args");
  }
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}

// End a data sharing context. There is no need to have a list of refs
// to shared variables because the context in which those variables were
// shared has now ended. This should clean up the list of references only,
// without affecting the actual global storage of the variables.
// In L0 data sharing this is called by the master thread.
// In L1 data sharing this is called by the active warp's master thread.
EXTERN void __kmpc_end_sharing_variables() {
  if (SharedMemVariableSharingSpacePtr != &SharedMemVariableSharingSpace[0])
    SafeFree(SharedMemVariableSharingSpacePtr, "new extended args");
}

// This function will return a list of references to global variables. This
// is how the workers will get a reference to the globalized variable. The
// members of this list will be passed to the outlined parallel function,
// preserving the order.
// Called by all workers.
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) {
  *GlobalArgs = SharedMemVariableSharingSpacePtr;
}
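
The three entry points above implement a publish/consume protocol for parallel-region arguments: the master publishes a pointer list, workers read it, and the master tears it down. A minimal host-side C++ mock of that protocol follows; the names are hypothetical stand-ins for the device-shared globals above.

#include <cstddef>
#include <cstdio>
#include <cstdlib>

constexpr size_t NumSharedSlots = 64; // mirrors NUM_SHARED_VARIABLES_IN_SHARED_MEM

static void *SharingSpace[NumSharedSlots];
static void **SharingSpacePtr = nullptr;

// Small argument lists live in the fixed buffer; large ones go to the heap.
void BeginSharingVariables(void ***GlobalArgs, size_t NArgs) {
  SharingSpacePtr =
      (NArgs <= NumSharedSlots)
          ? &SharingSpace[0]
          : static_cast<void **>(std::malloc(NArgs * sizeof(void *)));
  *GlobalArgs = SharingSpacePtr;
}

void GetSharedVariables(void ***GlobalArgs) { *GlobalArgs = SharingSpacePtr; }

// Only the heap fallback needs cleanup; the fixed buffer is reused as-is.
void EndSharingVariables() {
  if (SharingSpacePtr != &SharingSpace[0])
    std::free(SharingSpacePtr);
}

int main() {
  int X = 42;
  void **Args;
  BeginSharingVariables(&Args, 1); // "master" publishes one argument
  Args[0] = &X;
  void **WorkerArgs;
  GetSharedVariables(&WorkerArgs); // a "worker" picks the list up
  std::printf("shared value: %d\n", *static_cast<int *>(WorkerArgs[0]));
  EndSharingVariables();
}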

// This function is used to init the static memory manager. This manager is
// used to manage statically allocated global memory. This memory is allocated
// by the compiler and used to correctly implement globalization of the
// variables in target, teams and distribute regions.
EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                          const void *buf, size_t size,
                                          int16_t is_shared,
                                          const void **frame) {
  if (is_shared) {
    *frame = buf;
    return;
  }
  if (isSPMDExecutionMode) {
    if (__kmpc_get_hardware_thread_id_in_block() == 0) {
      *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
    }
    __kmpc_impl_syncthreads();
    return;
  }
  ASSERT0(LT_FUSSY,
          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
          "Must be called only in the target master thread.");
  *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
  __kmpc_impl_threadfence();
}

EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                              int16_t is_shared) {
  if (is_shared)
    return;
  if (isSPMDExecutionMode) {
    __kmpc_impl_syncthreads();
    if (__kmpc_get_hardware_thread_id_in_block() == 0) {
      omptarget_nvptx_simpleMemoryManager.Release();
    }
    return;
  }
  __kmpc_impl_threadfence();
  ASSERT0(LT_FUSSY,
          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
          "Must be called only in the target master thread.");
  omptarget_nvptx_simpleMemoryManager.Release();
}

#pragma omp end declare target
@ -1,359 +0,0 @@
//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the OpenMP runtime functions that can be
// invoked by the user in an OpenMP region
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target_impl.h"

EXTERN double omp_get_wtick(void) {
  double rc = __kmpc_impl_get_wtick();
  PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc);
  return rc;
}

EXTERN double omp_get_wtime(void) {
  double rc = __kmpc_impl_get_wtime();
  PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc);
  return rc;
}

EXTERN void omp_set_num_threads(int num) {
  // Ignore it for SPMD mode.
  if (__kmpc_is_spmd_exec_mode())
    return;
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
  PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
  if (num <= 0) {
    WARNING0(LW_INPUT, "expected positive num; ignore\n");
  } else if (parallelLevel[GetWarpId()] == 0) {
    nThreads = num;
  }
}

EXTERN int omp_get_num_threads(void) {
  int rc = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
  PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
  return rc;
}

EXTERN int omp_get_max_threads(void) {
  if (parallelLevel[GetWarpId()] > 0)
    // We're already in a parallel region.
    return 1; // default is 1 thread avail
  // Not currently in a parallel region, return what was set.
  int rc = 1;
  if (parallelLevel[GetWarpId()] == 0)
    rc = nThreads;
  ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
  PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
  return rc;
}

EXTERN int omp_get_thread_limit(void) {
  if (__kmpc_is_spmd_exec_mode())
    return __kmpc_get_hardware_num_threads_in_block();
  int rc = threadLimit;
  PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc);
  return rc;
}

EXTERN int omp_get_thread_num() {
  int rc = GetOmpThreadId();
  PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc);
  return rc;
}

EXTERN int omp_get_num_procs(void) {
  int rc = GetNumberOfProcsInDevice(__kmpc_is_spmd_exec_mode());
  PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc);
  return rc;
}

EXTERN int omp_in_parallel(void) {
  int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
  PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc);
  return rc;
}

EXTERN int omp_in_final(void) {
  // Treat all tasks as final... The spec may expect the runtime to keep
  // track more precisely whether a task was actively set as final by the
  // user. This is not explicitly specified; we treat it as if the runtime
  // can actively decide to turn a non-final task into a final one.
  int rc = 1;
  PRINT(LD_IO, "call omp_in_final() returns %d\n", rc);
  return rc;
}

EXTERN void omp_set_dynamic(int flag) {
  PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag);
}

EXTERN int omp_get_dynamic(void) {
  int rc = 0;
  PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc);
  return rc;
}

EXTERN void omp_set_nested(int flag) {
  PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n",
        flag);
}

EXTERN int omp_get_nested(void) {
  int rc = 0;
  PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc);
  return rc;
}

EXTERN void omp_set_max_active_levels(int level) {
  PRINT(LD_IO,
        "call omp_set_max_active_levels(%d) is ignored (no nested support)\n",
        level);
}

EXTERN int omp_get_max_active_levels(void) {
  int rc = 1;
  PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc);
  return rc;
}

EXTERN int omp_get_level(void) {
  int level = __kmpc_parallel_level();
  PRINT(LD_IO, "call omp_get_level() returns %d\n", level);
  return level;
}

EXTERN int omp_get_active_level(void) {
  int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0;
  PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level);
  return level;
}

EXTERN int omp_get_ancestor_thread_num(int level) {
  if (__kmpc_is_spmd_exec_mode())
    return level == 1 ? __kmpc_get_hardware_thread_id_in_block() : 0;
  int rc = -1;
  // If level is 0, or no parallel region is active, return 0.
  unsigned parLevel = parallelLevel[GetWarpId()];
  if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
    int totLevel = omp_get_level();
    if (level <= totLevel) {
      omptarget_nvptx_TaskDescr *currTaskDescr =
          getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
      int steps = totLevel - level;
      PRINT(LD_IO, "backtrack %d steps\n", steps);
      ASSERT0(LT_FUSSY, currTaskDescr,
              "do not expect fct to be called in a non-active thread");
      do {
        if (DON(LD_IOD)) {
          // Print the current state.
          omp_sched_t sched = currTaskDescr->GetRuntimeSched();
          PRINT(LD_ALL,
                "task descr %s %d: %s, in par %d, rt sched %d,"
                " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n",
                "ancestor", steps,
                (currTaskDescr->IsParallelConstruct() ? "par" : "task"),
                (int)currTaskDescr->InParallelRegion(), (int)sched,
                currTaskDescr->RuntimeChunkSize(),
                (int)currTaskDescr->ThreadId(), (int)threadsInTeam,
                (int)nThreads);
        }

        if (currTaskDescr->IsParallelConstruct()) {
          // Found the level.
          if (!steps) {
            rc = currTaskDescr->ThreadId();
            break;
          }
          steps--;
        }
        currTaskDescr = currTaskDescr->GetPrevTaskDescr();
      } while (currTaskDescr);
      ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
    }
  } else if (level == 0 ||
             (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
              level <= parLevel) ||
             (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
              level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
    rc = 0;
  }
  PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level,
        rc);
  return rc;
}
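
The level checks above all decode the same encoding of parallelLevel: the low bits count nested regions, and a flag value is added once an active (multi-thread) region is entered. A small C++ illustration of that decoding; the flag value 128 is an assumption standing in for OMP_ACTIVE_PARALLEL_LEVEL, which is defined elsewhere in this runtime.

#include <cstdio>

constexpr unsigned ActiveFlag = 128; // assumption: OMP_ACTIVE_PARALLEL_LEVEL

// True while at least one active (multi-thread) region is on the stack,
// matching the `parallelLevel[...] > OMP_ACTIVE_PARALLEL_LEVEL` tests above.
bool InActiveParallel(unsigned ParLevel) { return ParLevel > ActiveFlag; }

// Nesting depth with the flag stripped, including serialized regions.
unsigned NestingDepth(unsigned ParLevel) {
  return InActiveParallel(ParLevel) ? ParLevel - ActiveFlag : ParLevel;
}

int main() {
  unsigned Level = 0;
  Level += 1 + ActiveFlag; // enter an active region with >1 threads
  Level += 1;              // enter a nested, serialized region
  std::printf("active: %d, depth incl. serialized: %u\n",
              InActiveParallel(Level), NestingDepth(Level));
}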

EXTERN int omp_get_team_size(int level) {
  if (__kmpc_is_spmd_exec_mode())
    return level == 1 ? __kmpc_get_hardware_num_threads_in_block() : 1;
  int rc = -1;
  unsigned parLevel = parallelLevel[GetWarpId()];
  // If level is 0, or no parallel region is active, return 1.
  if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
    rc = threadsInTeam;
  } else if (level == 0 ||
             (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
              level <= parLevel) ||
             (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
              level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
    rc = 1;
  }
  PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc);
  return rc;
}

EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) {
  if (isRuntimeUninitialized()) {
    ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
            "Expected SPMD mode only with uninitialized runtime.");
    *kind = omp_sched_static;
    *modifier = 1;
  } else {
    omptarget_nvptx_TaskDescr *currTaskDescr =
        getMyTopTaskDescriptor(__kmpc_is_spmd_exec_mode());
    *kind = currTaskDescr->GetRuntimeSched();
    *modifier = currTaskDescr->RuntimeChunkSize();
  }
  PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n",
        (int)*kind, *modifier);
}

EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) {
  PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind,
        modifier);
  if (isRuntimeUninitialized()) {
    ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
            "Expected SPMD mode only with uninitialized runtime.");
    return;
  }
  if (kind >= omp_sched_static && kind < omp_sched_auto) {
    omptarget_nvptx_TaskDescr *currTaskDescr =
        getMyTopTaskDescriptor(__kmpc_is_spmd_exec_mode());
    currTaskDescr->SetRuntimeSched(kind);
    currTaskDescr->RuntimeChunkSize() = modifier;
    PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n",
          (int)currTaskDescr->GetRuntimeSched(),
          currTaskDescr->RuntimeChunkSize());
  }
}

EXTERN omp_proc_bind_t omp_get_proc_bind(void) {
  PRINT0(LD_IO, "call omp_get_proc_bind() returns true, regardless of state\n");
  return omp_proc_bind_true;
}

EXTERN int omp_get_num_places(void) {
  PRINT0(LD_IO, "call omp_get_num_places() returns 0\n");
  return 0;
}

EXTERN int omp_get_place_num_procs(int place_num) {
  PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n");
  return 0;
}

EXTERN void omp_get_place_proc_ids(int place_num, int *ids) {
  PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n");
}

EXTERN int omp_get_place_num(void) {
  PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n");
  return 0;
}

EXTERN int omp_get_partition_num_places(void) {
  PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n");
  return 0;
}

EXTERN void omp_get_partition_place_nums(int *place_nums) {
  PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n");
}

EXTERN int omp_get_cancellation(void) {
  int rc = 0;
  PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc);
  return rc;
}

EXTERN void omp_set_default_device(int deviceId) {
  PRINT0(LD_IO, "call omp_set_default_device() is undef on device\n");
}

EXTERN int omp_get_default_device(void) {
  PRINT0(LD_IO,
         "call omp_get_default_device() is undef on device, returns 0\n");
  return 0;
}

EXTERN int omp_get_num_devices(void) {
  PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n");
  return 0;
}

EXTERN int omp_get_num_teams(void) {
  int rc = GetNumberOfOmpTeams();
  PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc);
  return rc;
}

EXTERN int omp_get_team_num() {
  int rc = GetOmpTeamId();
  PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc);
  return rc;
}

// Unspecified on the device.
EXTERN int omp_get_initial_device(void) {
  PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n");
  return 0;
}

// Unused for now.
EXTERN int omp_get_max_task_priority(void) {
  PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n");
  return 0;
}

////////////////////////////////////////////////////////////////////////////////
// locks
////////////////////////////////////////////////////////////////////////////////

EXTERN void omp_init_lock(omp_lock_t *lock) {
  __kmpc_impl_init_lock(lock);
  PRINT0(LD_IO, "call omp_init_lock()\n");
}

EXTERN void omp_destroy_lock(omp_lock_t *lock) {
  __kmpc_impl_destroy_lock(lock);
  PRINT0(LD_IO, "call omp_destroy_lock()\n");
}

EXTERN void omp_set_lock(omp_lock_t *lock) {
  __kmpc_impl_set_lock(lock);
  PRINT0(LD_IO, "call omp_set_lock()\n");
}

EXTERN void omp_unset_lock(omp_lock_t *lock) {
  __kmpc_impl_unset_lock(lock);
  PRINT0(LD_IO, "call omp_unset_lock()\n");
}

EXTERN int omp_test_lock(omp_lock_t *lock) {
  int rc = __kmpc_impl_test_lock(lock);
  PRINT(LD_IO, "call omp_test_lock() return %d\n", rc);
  return rc;
}

#pragma omp end declare target
@ -1,813 +0,0 @@
//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of the KMPC interface
// for the loop construct plus other worksharing constructs that use the same
// interface as loops.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target/shuffle.h"
#include "target_impl.h"

////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// template class that encapsulates all the helper functions
//
// T is the loop iteration type (32 | 64) (unsigned | signed)
// ST is the signed version of T
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////

template <typename T, typename ST> class omptarget_nvptx_LoopSupport {
public:
  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling with chunk

  // Generic implementation of OMP loop scheduling with static policy
  /*! \brief Calculate initial bounds for static loop and stride
   * @param[in] loc location in code of the call (not used here)
   * @param[in] global_tid global thread id
   * @param[in] schedtype type of scheduling (see omptarget-nvptx.h)
   * @param[in] plastiter pointer to last iteration
   * @param[in,out] pointer to loop lower bound. It will contain the value of
   * the lower bound of the first chunk
   * @param[in,out] pointer to loop upper bound. It will contain the value of
   * the upper bound of the first chunk
   * @param[in,out] pointer to loop stride. It will contain the value of the
   * stride between two successive chunks executed by the same thread
   * @param[in] loop increment bump
   * @param[in] chunk size
   */

  // helper function for static chunk
  INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride,
                                    ST chunk, T entityId, T numberOfEntities) {
    // Each thread executes multiple chunks, all of the same size, except
    // the last one.

    // Distance between two successive chunks.
    stride = numberOfEntities * chunk;
    lb = lb + entityId * chunk;
    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    // Say ub' is the beginning of the last chunk. Then whoever has a
    // lower bound plus a multiple of the increment equal to ub' is
    // the last one.
    T beginningLastChunk = inputUb - (inputUb % chunk);
    last = ((beginningLastChunk - lb) % stride) == 0;
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Loop with static scheduling without chunk

  // helper function for static no chunk
  INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride,
                                      ST &chunk, T entityId,
                                      T numberOfEntities) {
    // No chunk size specified. Each thread or warp gets at most one
    // chunk; chunks are all almost of equal size.
    T loopSize = ub - lb + 1;

    chunk = loopSize / numberOfEntities;
    T leftOver = loopSize - chunk * numberOfEntities;

    if (entityId < leftOver) {
      chunk++;
      lb = lb + entityId * chunk;
    } else {
      lb = lb + entityId * chunk + leftOver;
    }

    T inputUb = ub;
    ub = lb + chunk - 1; // Clang uses i <= ub
    last = lb <= inputUb && inputUb <= ub;
    stride = loopSize; // make sure we only do 1 chunk per warp
  }
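
  // To see the chunk math in action, here is a host-only re-instantiation of
  // the two helpers above for plain int, followed by a worked example
  // (10 iterations across 4 threads). It is a checking aid for the reader,
  // not part of the runtime, and keeps the body of each helper verbatim.
  //
  //   #include <cstdio>
  //
  //   void ForStaticChunkInt(int &Last, int &LB, int &UB, int &Stride,
  //                          int Chunk, int EntityId, int NumberOfEntities) {
  //     Stride = NumberOfEntities * Chunk;
  //     LB = LB + EntityId * Chunk;
  //     int InputUB = UB;
  //     UB = LB + Chunk - 1; // bounds are inclusive
  //     int BeginningLastChunk = InputUB - (InputUB % Chunk);
  //     Last = ((BeginningLastChunk - LB) % Stride) == 0;
  //   }
  //
  //   void ForStaticNoChunkInt(int &Last, int &LB, int &UB, int &Stride,
  //                            int &Chunk, int EntityId,
  //                            int NumberOfEntities) {
  //     int LoopSize = UB - LB + 1;
  //     Chunk = LoopSize / NumberOfEntities;
  //     int LeftOver = LoopSize - Chunk * NumberOfEntities;
  //     if (EntityId < LeftOver) {
  //       Chunk++;
  //       LB = LB + EntityId * Chunk;
  //     } else {
  //       LB = LB + EntityId * Chunk + LeftOver;
  //     }
  //     int InputUB = UB;
  //     UB = LB + Chunk - 1;
  //     Last = LB <= InputUB && InputUB <= UB;
  //     Stride = LoopSize;
  //   }
  //
  //   int main() {
  //     // 10 iterations (0..9), 4 threads, chunk 2: thread 0 owns [0,1] and
  //     // later (via the stride of 8) the final chunk [8,9], so Last == 1.
  //     for (int Tid = 0; Tid < 4; ++Tid) {
  //       int Last, LB = 0, UB = 9, Stride;
  //       ForStaticChunkInt(Last, LB, UB, Stride, /*Chunk=*/2, Tid, 4);
  //       std::printf("chunked tid %d: [%d,%d] stride %d last %d\n", Tid, LB,
  //                   UB, Stride, Last);
  //     }
  //     // No chunk: threads get 3,3,2,2 iterations; thread 3 ends at 9.
  //     for (int Tid = 0; Tid < 4; ++Tid) {
  //       int Last, LB = 0, UB = 9, Stride, Chunk = 0;
  //       ForStaticNoChunkInt(Last, LB, UB, Stride, Chunk, Tid, 4);
  //       std::printf("nochunk tid %d: [%d,%d] chunk %d last %d\n", Tid, LB,
  //                   UB, Chunk, Last);
  //     }
  //   }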

  ////////////////////////////////////////////////////////////////////////////////
  // Support for Static Init

  INLINE static void for_static_init(int32_t gtid, int32_t schedtype,
                                     int32_t *plastiter, T *plower, T *pupper,
                                     ST *pstride, ST chunk,
                                     bool IsSPMDExecutionMode) {
    // When IsRuntimeUninitialized is true, we assume that the caller is
    // in an L0 parallel region and that all worker threads participate.

    // Assume we are in a teams region or that we use a single block
    // per target region.
    ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode);

    // All warps that are in excess of the maximum requested do
    // not execute the loop.
    PRINT(LD_LOOP,
          "OMP Thread %d: schedule type %d, chunk size = %lld, mytid "
          "%d, num tids %d\n",
          (int)gtid, (int)schedtype, (long long)chunk, (int)gtid,
          (int)numberOfActiveOMPThreads);
    ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads,
            "current thread is not needed here; error");

    // copy
    int lastiter = 0;
    T lb = *plower;
    T ub = *pupper;
    ST stride = *pstride;
    // init
    switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) {
    case kmp_sched_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        break;
      }
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_static_balanced_chunk: {
      if (chunk > 0) {
        // round up to make sure the chunk is enough to cover all iterations
        T tripCount = ub - lb + 1; // +1 because ub is inclusive
        T span = (tripCount + numberOfActiveOMPThreads - 1) /
                 numberOfActiveOMPThreads;
        // perform chunk adjustment
        chunk = (span + chunk - 1) & ~(chunk - 1);

        ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
        T oldUb = ub;
        ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
        if (ub > oldUb)
          ub = oldUb;
        break;
      }
    } // note: if chunk <= 0, use nochunk
    case kmp_sched_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid,
                       numberOfActiveOMPThreads);
      break;
    }
    case kmp_sched_distr_static_chunk: {
      if (chunk > 0) {
        ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
                       GetNumberOfOmpTeams());
        break;
      } // note: if chunk <= 0, use nochunk
    }
    case kmp_sched_distr_static_nochunk: {
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(),
                       GetNumberOfOmpTeams());
      break;
    }
    case kmp_sched_distr_static_chunk_sched_static_chunkone: {
      ForStaticChunk(lastiter, lb, ub, stride, chunk,
                     numberOfActiveOMPThreads * GetOmpTeamId() + gtid,
                     GetNumberOfOmpTeams() * numberOfActiveOMPThreads);
      break;
    }
    default: {
      ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype);
      PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n",
            (int)schedtype);
      ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid,
                     numberOfActiveOMPThreads);
      break;
    }
    }
    // copy back
    *plastiter = lastiter;
    *plower = lb;
    *pupper = ub;
    *pstride = stride;
    PRINT(LD_LOOP,
          "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last "
          "%d\n",
          (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(),
          (long long)(*plower), (long long)(*pupper), (long long)(*pstride),
          (int)lastiter);
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch Init

  INLINE static int OrderedSchedule(kmp_sched_t schedule) {
    return schedule >= kmp_sched_ordered_first &&
           schedule <= kmp_sched_ordered_last;
  }

  INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId,
                                   kmp_sched_t schedule, T lb, T ub, ST st,
                                   ST chunk) {
    if (isRuntimeUninitialized()) {
      // In SPMD mode there is no need to check the parallelism level -
      // dynamic scheduling may appear only in L2 parallel regions with
      // lightweight runtime.
      ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode.");
      return;
    }
    int tid = GetLogicalThreadIdInBlock();
    omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
    T tnum = GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
    T tripCount = ub - lb + 1; // +1 because ub is inclusive
    ASSERT0(LT_FUSSY, threadId < tnum,
            "current thread is not needed here; error");

    /* Currently we just ignore the monotonic and non-monotonic modifiers
     * (the compiler isn't producing them yet anyway).
     * When it is, we'll want to look at them somewhere here and use that
     * information to add to our schedule choice. We shouldn't need to pass
     * them on, they merely affect which schedule we can legally choose for
     * various dynamic cases (in particular, whether or not a stealing scheme
     * is legal).
     */
    schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule);

    // Process schedule.
    if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) {
      if (OrderedSchedule(schedule))
        __kmpc_barrier(loc, threadId);
      PRINT(LD_LOOP,
            "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n",
            (long)tnum, (long long)tripCount, (int)schedule);
      schedule = kmp_sched_static_chunk;
      chunk = tripCount; // one thread gets the whole loop
    } else if (schedule == kmp_sched_runtime) {
      // process runtime
      omp_sched_t rtSched = currTaskDescr->GetRuntimeSched();
      chunk = currTaskDescr->RuntimeChunkSize();
      switch (rtSched) {
      case omp_sched_static: {
        if (chunk > 0)
          schedule = kmp_sched_static_chunk;
        else
          schedule = kmp_sched_static_nochunk;
        break;
      }
      case omp_sched_auto: {
        schedule = kmp_sched_static_chunk;
        chunk = 1;
        break;
      }
      case omp_sched_dynamic:
      case omp_sched_guided: {
        schedule = kmp_sched_dynamic;
        break;
      }
      }
      PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule,
            (long long)chunk);
    } else if (schedule == kmp_sched_auto) {
      schedule = kmp_sched_static_chunk;
      chunk = 1;
      PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule,
            (long long)chunk);
    } else {
      PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule,
            (long long)chunk);
      ASSERT(LT_FUSSY,
             schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
             "unknown schedule %d & chunk %lld\n", (int)schedule,
             (long long)chunk);
    }

    // init schedules
    if (schedule == kmp_sched_static_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // save ub
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
      PRINT(LD_LOOP,
            "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
            ", next lower bound = %llu, stride = %llu\n",
            (int)tnum,
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
    } else if (schedule == kmp_sched_static_balanced_chunk) {
      ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value");
      // save sched state
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // save ub
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      // round up to make sure the chunk is enough to cover all iterations
      T span = (tripCount + tnum - 1) / tnum;
      // perform chunk adjustment
      chunk = (span + chunk - 1) & ~(chunk - 1);

      T oldUb = ub;
      ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb.");
      if (ub > oldUb)
        ub = oldUb;
      // save computed params
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
      PRINT(LD_LOOP,
            "dispatch init (static chunk) : num threads = %d, ub = %" PRId64
            ", next lower bound = %llu, stride = %llu\n",
            (int)tnum,
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
    } else if (schedule == kmp_sched_static_nochunk) {
      ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value");
      // save sched state
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      // save ub
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      // compute static chunk
      ST stride;
      int lastiter = 0;
      ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum);
      // save computed params
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      omptarget_nvptx_threadPrivateContext->Stride(tid) = stride;
      PRINT(LD_LOOP,
            "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64
            ", next lower bound = %llu, stride = %llu\n",
            (int)tnum,
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride(
                tid));
    } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) {
      // save data
      omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule;
      if (chunk < 1)
        chunk = 1;
      omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk;
      omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub;
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb;
      __kmpc_barrier(loc, threadId);
      if (tid == 0) {
        omptarget_nvptx_threadPrivateContext->Cnt() = 0;
        __kmpc_impl_threadfence_block();
      }
      __kmpc_barrier(loc, threadId);
      PRINT(LD_LOOP,
            "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64
            ", chunk %" PRIu64 "\n",
            (int)tnum,
            (unsigned long long)
                omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
            omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid),
            omptarget_nvptx_threadPrivateContext->Chunk(tid));
    }
  }

  ////////////////////////////////////////////////////////////////////////////////
  // Support for dispatch next

  INLINE static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val,
                                 int leader) {
    uint32_t lo, hi;
    __kmpc_impl_unpack(val, lo, hi);
    hi = __kmpc_impl_shfl_sync(active, hi, leader);
    lo = __kmpc_impl_shfl_sync(active, lo, leader);
    return __kmpc_impl_pack(lo, hi);
  }

  INLINE static uint64_t NextIter() {
    __kmpc_impl_lanemask_t active = __kmpc_impl_activemask();
    uint32_t leader = __kmpc_impl_ffs(active) - 1;
    uint32_t change = __kmpc_impl_popc(active);
    __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
    unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
    uint64_t warp_res;
    if (rank == 0) {
      warp_res = __kmpc_atomic_add(
          (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(),
          (unsigned long long)change);
    }
    warp_res = Shuffle(active, warp_res, leader);
    return warp_res + rank;
  }
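
  // NextIter above claims loop iterations with one atomic add per warp
  // rather than one per lane: the leader adds popc(active), and each lane
  // derives its own iteration from its rank within the active mask. A
  // host-side C++ simulation of that aggregation follows, using GCC/Clang
  // bit builtins in place of the __kmpc_impl_* lane intrinsics; it is an
  // illustrative sketch, not runtime code.
  //
  //   #include <atomic>
  //   #include <cstdint>
  //   #include <cstdio>
  //
  //   static std::atomic<uint64_t> Cnt{0};
  //
  //   void WarpNextIter(uint32_t Active, uint64_t Results[32]) {
  //     // In the real code the rank-0 lane (the leader) performs this one
  //     // atomic add and shuffles the old value to the other lanes.
  //     uint32_t Change = __builtin_popcount(Active); // # of active lanes
  //     uint64_t Base = Cnt.fetch_add(Change);        // leader's single RMW
  //     for (int Lane = 0; Lane < 32; ++Lane) {
  //       if (!(Active & (1u << Lane)))
  //         continue;
  //       uint32_t LaneMaskLT = (1u << Lane) - 1; // lanes below this one
  //       uint32_t Rank = __builtin_popcount(Active & LaneMaskLT);
  //       Results[Lane] = Base + Rank; // each lane claims a distinct iter
  //     }
  //   }
  //
  //   int main() {
  //     uint64_t R[32] = {};
  //     WarpNextIter(0b1011, R); // lanes 0, 1, and 3 active
  //     std::printf("lane0=%llu lane1=%llu lane3=%llu next Cnt=%llu\n",
  //                 (unsigned long long)R[0], (unsigned long long)R[1],
  //                 (unsigned long long)R[3],
  //                 (unsigned long long)Cnt.load());
  //   }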

  INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize,
                                     T loopLowerBound, T loopUpperBound) {
    T N = NextIter();
    lb = loopLowerBound + N * chunkSize;
    ub = lb + chunkSize - 1; // Clang uses i <= ub

    // 3 result cases:
    //  a. lb and ub < loopUpperBound --> NOT_FINISHED
    //  b. lb < loopUpperBound and ub >= loopUpperBound: last chunk -->
    //     NOT_FINISHED
    //  c. lb and ub >= loopUpperBound: empty chunk --> FINISHED
    // a.
    if (lb <= loopUpperBound && ub < loopUpperBound) {
      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n",
            (long long)lb, (long long)ub, (long long)loopUpperBound);
      return NOT_FINISHED;
    }
    // b.
    if (lb <= loopUpperBound) {
      PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n",
            (long long)lb, (long long)ub, (long long)loopUpperBound);
      ub = loopUpperBound;
      return LAST_CHUNK;
    }
    // c. if we are here, we are in case 'c'
    lb = loopUpperBound + 2;
    ub = loopUpperBound + 1;
    PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb,
          (long long)ub, (long long)loopUpperBound);
    return FINISHED;
  }
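
  // A tiny host check of the three-way classification DynamicNextChunk
  // documents above, under the same inclusive-bound convention; the enum
  // values are stand-ins for the runtime's constants.
  //
  //   #include <cstdio>
  //
  //   enum { NOT_FINISHED, LAST_CHUNK, FINISHED };
  //
  //   int Classify(long LB, long UB, long LoopUB) {
  //     if (LB <= LoopUB && UB < LoopUB)
  //       return NOT_FINISHED; // full chunk inside the loop
  //     if (LB <= LoopUB)
  //       return LAST_CHUNK;   // chunk straddles the end: clip ub
  //     return FINISHED;       // chunk begins past the end: nothing left
  //   }
  //
  //   int main() {
  //     std::printf("%d %d %d\n",
  //                 Classify(0, 3, 9),    // NOT_FINISHED
  //                 Classify(8, 11, 9),   // LAST_CHUNK
  //                 Classify(12, 15, 9)); // FINISHED
  //   }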

  INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast,
                                  T *plower, T *pupper, ST *pstride) {
    if (isRuntimeUninitialized()) {
      // In SPMD mode there is no need to check the parallelism level -
      // dynamic scheduling may appear only in L2 parallel regions with
      // lightweight runtime.
      ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(), "Expected non-SPMD mode.");
      if (*plast)
        return DISPATCH_FINISHED;
      *plast = 1;
      return DISPATCH_NOTFINISHED;
    }
    // ID of a thread in its own warp.

    // Automatically selects thread or warp ID based on selected
    // implementation.
    int tid = GetLogicalThreadIdInBlock();
    ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
            "current thread is not needed here; error");
    // retrieve schedule
    kmp_sched_t schedule =
        omptarget_nvptx_threadPrivateContext->ScheduleType(tid);

    // xxx reduce to one
    if (schedule == kmp_sched_static_chunk ||
        schedule == kmp_sched_static_nochunk) {
      T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid);
      T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid);
      // finished?
      if (myLb > ub) {
        PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n",
              (long long)myLb, (long long)ub);
        return DISPATCH_FINISHED;
      }
      // not finished, save current bounds
      ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid);
      *plower = myLb;
      T myUb = myLb + chunk - 1; // Clang uses i <= ub
      if (myUb > ub)
        myUb = ub;
      *pupper = myUb;
      *plast = (int32_t)(myUb == ub);

      // increment next lower bound by the stride
      ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid);
      omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride;
      PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n",
            (long long)*plower, (long long)*pupper);
      return DISPATCH_NOTFINISHED;
    }
    ASSERT0(LT_FUSSY,
            schedule == kmp_sched_dynamic || schedule == kmp_sched_guided,
            "bad sched");
    T myLb, myUb;
    int finished = DynamicNextChunk(
        myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid),
        omptarget_nvptx_threadPrivateContext->NextLowerBound(tid),
        omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid));

    if (finished == FINISHED)
      return DISPATCH_FINISHED;

    // not finished (either not finished or last chunk)
    *plast = (int32_t)(finished == LAST_CHUNK);
    *plower = myLb;
    *pupper = myUb;
    *pstride = 1;

    PRINT(LD_LOOP,
          "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
          "last %d\n",
          (int)GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode()),
          (int)GetNumberOfWorkersInTeam(), (long long)*plower,
          (long long)*pupper, (long long)*pstride, (int)*plast);
    return DISPATCH_NOTFINISHED;
  }

  INLINE static void dispatch_fini() {
    // nothing
  }

  ////////////////////////////////////////////////////////////////////////////////
  // end of template class that encapsulates all the helper functions
  ////////////////////////////////////////////////////////////////////////////////
};

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (dyn loops)
////////////////////////////////////////////////////////////////////////////////

// init
EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid,
                                   int32_t schedule, int32_t lb, int32_t ub,
                                   int32_t st, int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_dispatch_init_4\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}

EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid,
                                    int32_t schedule, uint32_t lb, uint32_t ub,
                                    int32_t st, int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}

EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid,
                                   int32_t schedule, int64_t lb, int64_t ub,
                                   int64_t st, int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_dispatch_init_8\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}

EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid,
                                    int32_t schedule, uint64_t lb, uint64_t ub,
                                    int64_t st, int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_init(
      loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk);
}

// next
EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                  int32_t *p_lb, int32_t *p_ub, int32_t *p_st) {
  PRINT0(LD_IO, "call kmpc_dispatch_next_4\n");
  return omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st);
}

EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                   uint32_t *p_lb, uint32_t *p_ub,
                                   int32_t *p_st) {
  PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n");
  return omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st);
}

EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                  int64_t *p_lb, int64_t *p_ub, int64_t *p_st) {
  PRINT0(LD_IO, "call kmpc_dispatch_next_8\n");
  return omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st);
}

EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, int32_t *p_last,
                                   uint64_t *p_lb, uint64_t *p_ub,
                                   int64_t *p_st) {
  PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n");
  return omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_next(
      loc, tid, p_last, p_lb, p_ub, p_st);
}

// fini
EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::dispatch_fini();
}

EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::dispatch_fini();
}

EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::dispatch_fini();
}

EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::dispatch_fini();
}

////////////////////////////////////////////////////////////////////////////////
// KMP interface implementation (static loops)
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      __kmpc_is_spmd_exec_mode());
}

EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4u\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      __kmpc_is_spmd_exec_mode());
}

EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
                                     int32_t schedtype, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      __kmpc_is_spmd_exec_mode());
}

EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
                                      int32_t schedtype, int32_t *plastiter,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8u\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      __kmpc_is_spmd_exec_mode());
}

EXTERN void __kmpc_distribute_static_init_4(kmp_Ident *loc, int32_t global_tid,
                                            int32_t schedtype,
                                            int32_t *plastiter, int32_t *plower,
                                            int32_t *pupper, int32_t *pstride,
                                            int32_t incr, int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_distribute_static_init_4\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      __kmpc_is_spmd_exec_mode());
}

EXTERN void __kmpc_distribute_static_init_4u(kmp_Ident *loc, int32_t global_tid,
                                             int32_t schedtype,
                                             int32_t *plastiter,
                                             uint32_t *plower, uint32_t *pupper,
                                             int32_t *pstride, int32_t incr,
                                             int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_distribute_static_init_4u\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      __kmpc_is_spmd_exec_mode());
}

EXTERN void __kmpc_distribute_static_init_8(kmp_Ident *loc, int32_t global_tid,
                                            int32_t schedtype,
                                            int32_t *plastiter, int64_t *plower,
                                            int64_t *pupper, int64_t *pstride,
                                            int64_t incr, int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_distribute_static_init_8\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      __kmpc_is_spmd_exec_mode());
}

EXTERN void __kmpc_distribute_static_init_8u(kmp_Ident *loc, int32_t global_tid,
                                             int32_t schedtype,
                                             int32_t *plastiter,
                                             uint64_t *plower, uint64_t *pupper,
                                             int64_t *pstride, int64_t incr,
                                             int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_distribute_static_init_8u\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      __kmpc_is_spmd_exec_mode());
}

EXTERN
void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                          int32_t schedtype, int32_t *plastiter,
                                          int32_t *plower, int32_t *pupper,
                                          int32_t *pstride, int32_t incr,
                                          int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/true);
}

EXTERN
void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                           int32_t schedtype,
                                           int32_t *plastiter, uint32_t *plower,
                                           uint32_t *pupper, int32_t *pstride,
                                           int32_t incr, int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/true);
}

EXTERN
void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                          int32_t schedtype, int32_t *plastiter,
                                          int64_t *plower, int64_t *pupper,
                                          int64_t *pstride, int64_t incr,
                                          int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/true);
}

EXTERN
void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                           int32_t schedtype,
                                           int32_t *plastiter, uint64_t *plower,
                                           uint64_t *pupper, int64_t *pstride,
                                           int64_t incr, int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/true);
}

EXTERN
void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, int32_t global_tid,
                                             int32_t schedtype,
                                             int32_t *plastiter,
                                             int32_t *plower, int32_t *pupper,
                                             int32_t *pstride, int32_t incr,
                                             int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n");
  omptarget_nvptx_LoopSupport<int32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/false);
}

EXTERN
void __kmpc_for_static_init_4u_simple_generic(
    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
    uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
    int32_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n");
  omptarget_nvptx_LoopSupport<uint32_t, int32_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/false);
}

EXTERN
void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, int32_t global_tid,
                                             int32_t schedtype,
                                             int32_t *plastiter,
                                             int64_t *plower, int64_t *pupper,
                                             int64_t *pstride, int64_t incr,
                                             int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n");
  omptarget_nvptx_LoopSupport<int64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/false);
}

EXTERN
void __kmpc_for_static_init_8u_simple_generic(
    kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter,
    uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
    int64_t chunk) {
  PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n");
  omptarget_nvptx_LoopSupport<uint64_t, int64_t>::for_static_init(
      global_tid, schedtype, plastiter, plower, pupper, pstride, chunk,
      /*IsSPMDExecutionMode=*/false);
}

EXTERN void __kmpc_distribute_static_fini(kmp_Ident *loc, int32_t global_tid) {
  PRINT0(LD_IO, "call kmpc_distribute_static_fini\n");
}

EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) {
  PRINT0(LD_IO, "call kmpc_for_static_fini\n");
}

#pragma omp end declare target
@ -1,65 +0,0 @@
//===------------ omp_data.cu - OpenMP GPU objects --------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the data objects used on the GPU device.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/allocator.h"
#include "common/omptarget.h"

////////////////////////////////////////////////////////////////////////////////
// global device environment
////////////////////////////////////////////////////////////////////////////////

PLUGIN_ACCESSIBLE
DeviceEnvironmentTy omptarget_device_environment;

////////////////////////////////////////////////////////////////////////////////
// global data holding OpenMP state information
////////////////////////////////////////////////////////////////////////////////

// OpenMP will try to call its ctor if we don't add the attribute explicitly.
[[clang::loader_uninitialized]] omptarget_nvptx_Queue<
    omptarget_nvptx_ThreadPrivateContext, OMP_STATE_COUNT>
    omptarget_nvptx_device_State[MAX_SM];

omptarget_nvptx_SimpleMemoryManager omptarget_nvptx_simpleMemoryManager;
uint32_t SHARED(usedMemIdx);
uint32_t SHARED(usedSlotIdx);

// SHARED doesn't work with arrays, so we add the attribute explicitly.
[[clang::loader_uninitialized]] uint8_t
    parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
#pragma omp allocate(parallelLevel) allocator(omp_pteam_mem_alloc)
uint16_t SHARED(threadLimit);
uint16_t SHARED(threadsInTeam);
uint16_t SHARED(nThreads);
// Pointer to this team's OpenMP state object
omptarget_nvptx_ThreadPrivateContext *
    SHARED(omptarget_nvptx_threadPrivateContext);

////////////////////////////////////////////////////////////////////////////////
// The team master sets the outlined parallel function in this variable to
// communicate with the workers. Since it is in shared memory, there is one
// copy of these variables for each kernel, instance, and team.
////////////////////////////////////////////////////////////////////////////////
omptarget_nvptx_WorkFn SHARED(omptarget_nvptx_workFn);

////////////////////////////////////////////////////////////////////////////////
// OpenMP kernel execution parameters
////////////////////////////////////////////////////////////////////////////////
int8_t SHARED(execution_param);

////////////////////////////////////////////////////////////////////////////////
// Scratchpad for teams reduction.
////////////////////////////////////////////////////////////////////////////////
void *SHARED(ReductionScratchpadPtr);

#pragma omp end declare target
@ -1,259 +0,0 @@
//===--- omptarget.cu - OpenMP GPU initialization ---------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the initialization code for the GPU
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "common/support.h"
#include "target_impl.h"

////////////////////////////////////////////////////////////////////////////////
// global data tables
////////////////////////////////////////////////////////////////////////////////

extern omptarget_nvptx_Queue<omptarget_nvptx_ThreadPrivateContext,
                             OMP_STATE_COUNT>
    omptarget_nvptx_device_State[MAX_SM];

////////////////////////////////////////////////////////////////////////////////
// init entry points
////////////////////////////////////////////////////////////////////////////////

static void __kmpc_generic_kernel_init() {
  PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n",
        OMPTARGET_NVPTX_VERSION);

  if (GetLaneId() == 0)
    parallelLevel[GetWarpId()] = 0;

  int threadIdInBlock = __kmpc_get_hardware_thread_id_in_block();
  if (threadIdInBlock != GetMasterThreadID())
    return;

  setExecutionParameters(OMP_TGT_EXEC_MODE_GENERIC,
                         OMP_TGT_RUNTIME_INITIALIZED);
  ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(),
          "__kmpc_kernel_init() must be called by team master warp only!");
  PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n");

  // Get a state object from the queue.
  int slot = __kmpc_impl_smid() % MAX_SM;
  usedSlotIdx = slot;
  omptarget_nvptx_threadPrivateContext =
      omptarget_nvptx_device_State[slot].Dequeue();

  // init thread private
  int threadId = 0;
  omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId);

  // init team context
  omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
  currTeamDescr.InitTeamDescr();
  // This thread will start execution... it has to update its task ICV
  // to point to the level zero task ICV. That ICV was initialized in
  // InitTeamDescr().
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTeamDescr.LevelZeroTaskDescr());

  // set number of threads and thread limit in team to started value
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  nThreads = GetNumberOfWorkersInTeam();
  threadLimit = nThreads;

  __kmpc_data_sharing_init_stack();
  __kmpc_impl_target_init();
}

static void __kmpc_generic_kernel_deinit() {
  PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n");
  // Enqueue omp state object for use by another team.
  int slot = usedSlotIdx;
  omptarget_nvptx_device_State[slot].Enqueue(
      omptarget_nvptx_threadPrivateContext);
  // Done with work. Kill the workers.
  omptarget_nvptx_workFn = 0;
}

static void __kmpc_spmd_kernel_init(bool RequiresFullRuntime) {
  PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n");

  setExecutionParameters(OMP_TGT_EXEC_MODE_SPMD,
                         RequiresFullRuntime ? OMP_TGT_RUNTIME_INITIALIZED
                                             : OMP_TGT_RUNTIME_UNINITIALIZED);
  int threadId = __kmpc_get_hardware_thread_id_in_block();
  if (threadId == 0) {
    usedSlotIdx = __kmpc_impl_smid() % MAX_SM;
  }

  if (GetLaneId() == 0) {
    parallelLevel[GetWarpId()] =
        1 + (__kmpc_get_hardware_num_threads_in_block() > 1
                 ? OMP_ACTIVE_PARALLEL_LEVEL
                 : 0);
  }

  __kmpc_data_sharing_init_stack();
  if (!RequiresFullRuntime)
    return;

  //
  // Team Context Initialization.
  //
  // In SPMD mode there is no master thread, so use any CUDA thread for team
  // context initialization.
  if (threadId == 0) {
    // Get a state object from the queue.
    omptarget_nvptx_threadPrivateContext =
        omptarget_nvptx_device_State[usedSlotIdx].Dequeue();

    omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
    omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
    // init team context
    currTeamDescr.InitTeamDescr();
  }
  __kmpc_impl_syncthreads();

  omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();

  //
  // Initialize task descr for each thread.
  //
  omptarget_nvptx_TaskDescr *newTaskDescr =
      omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
  ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
  newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr());
  // install new top descriptor
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                             newTaskDescr);

  // init thread private from init value
  int ThreadLimit = GetNumberOfProcsInTeam(/* IsSPMD */ true);
  PRINT(LD_PAR,
        "thread will execute parallel region with id %d in a team of "
        "%d threads\n",
        (int)newTaskDescr->ThreadId(), (int)ThreadLimit);
}

static void __kmpc_spmd_kernel_deinit(bool RequiresFullRuntime) {
  // We're not going to pop the task descr stack of each thread since
  // there are no more parallel regions in SPMD mode.
  if (!RequiresFullRuntime)
    return;

  __kmpc_impl_syncthreads();
  int threadId = __kmpc_get_hardware_thread_id_in_block();
  if (threadId == 0) {
    // Enqueue omp state object for use by another team.
    int slot = usedSlotIdx;
    omptarget_nvptx_device_State[slot].Enqueue(
        omptarget_nvptx_threadPrivateContext);
  }
}

// Return true if the current target region is executed in SPMD mode.
// NOTE: This function has to return 1 for SPMD mode, and 0 for generic mode.
// That's because `__kmpc_parallel_51` checks if it's already in a parallel
// region by comparison between the parallel level and the return value of
// this function.
EXTERN int8_t __kmpc_is_spmd_exec_mode() {
  return (execution_param & OMP_TGT_EXEC_MODE_SPMD) == OMP_TGT_EXEC_MODE_SPMD;
}

EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid) {
  return !__kmpc_is_spmd_exec_mode() && __kmpc_is_generic_main_thread_id(Tid);
}
|
||||
|
||||
NOINLINE EXTERN int8_t __kmpc_is_generic_main_thread_id(kmp_int32 Tid) {
|
||||
return GetMasterThreadID() == Tid;
|
||||
}
|
||||
|
||||
EXTERN bool __kmpc_kernel_parallel(void**WorkFn);
|
||||
|
||||
static void __kmpc_target_region_state_machine(ident_t *Ident) {
|
||||
|
||||
int TId = __kmpc_get_hardware_thread_id_in_block();
|
||||
do {
|
||||
void* WorkFn = 0;
|
||||
|
||||
// Wait for the signal that we have a new work function.
|
||||
__kmpc_barrier_simple_spmd(Ident, TId);
|
||||
|
||||
|
||||
// Retrieve the work function from the runtime.
|
||||
bool IsActive = __kmpc_kernel_parallel(&WorkFn);
|
||||
|
||||
// If there is nothing more to do, break out of the state machine by
|
||||
// returning to the caller.
|
||||
if (!WorkFn)
|
||||
return;
|
||||
|
||||
if (IsActive) {
|
||||
((void(*)(uint32_t,uint32_t))WorkFn)(0, TId);
|
||||
__kmpc_kernel_end_parallel();
|
||||
}
|
||||
|
||||
__kmpc_barrier_simple_spmd(Ident, TId);
|
||||
|
||||
} while (true);
|
||||
}
|
||||
|
||||
EXTERN
|
||||
int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
|
||||
bool UseGenericStateMachine,
|
||||
bool RequiresFullRuntime) {
|
||||
const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
|
||||
int TId = __kmpc_get_hardware_thread_id_in_block();
|
||||
if (IsSPMD)
|
||||
__kmpc_spmd_kernel_init(RequiresFullRuntime);
|
||||
else
|
||||
__kmpc_generic_kernel_init();
|
||||
|
||||
if (IsSPMD) {
|
||||
__kmpc_barrier_simple_spmd(Ident, TId);
|
||||
return -1;
|
||||
}
|
||||
|
||||
if (TId == GetMasterThreadID())
|
||||
return -1;
|
||||
|
||||
// Enter the generic state machine if enabled and if this thread can possibly
|
||||
// be an active worker thread.
|
||||
//
|
||||
// The latter check is important for NVIDIA Pascal (but not Volta) and AMD
|
||||
// GPU. In those cases, a single thread can apparently satisfy a barrier on
|
||||
// behalf of all threads in the same warp. Thus, it would not be safe for
|
||||
// other threads in the main thread's warp to reach the first
|
||||
// __kmpc_barrier_simple_spmd call in __kmpc_target_region_state_machine
|
||||
// before the main thread reaches its corresponding
|
||||
// __kmpc_barrier_simple_spmd call: that would permit all active worker
|
||||
// threads to proceed before the main thread has actually set
|
||||
// omptarget_nvptx_workFn, and then they would immediately quit without
|
||||
// doing any work. GetNumberOfWorkersInTeam() does not include any of the
|
||||
// main thread's warp, so none of its threads can ever be active worker
|
||||
// threads.
|
||||
if (UseGenericStateMachine && TId < GetNumberOfWorkersInTeam())
|
||||
__kmpc_target_region_state_machine(Ident);
|
||||
|
||||
return TId;
|
||||
}
|
||||
|
||||
EXTERN
|
||||
void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
|
||||
bool RequiresFullRuntime) {
|
||||
const bool IsSPMD = Mode & OMP_TGT_EXEC_MODE_SPMD;
|
||||
if (IsSPMD)
|
||||
__kmpc_spmd_kernel_deinit(RequiresFullRuntime);
|
||||
else
|
||||
__kmpc_generic_kernel_deinit();
|
||||
}
|
||||
|
||||
#pragma omp end declare target
|
|
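For context on these deleted entry points: the compiler brackets every target region body with a `__kmpc_target_init`/`__kmpc_target_deinit` pair, and only threads that get `-1` back execute the region. A minimal sketch of generated device code under this old ABI (the kernel name is illustrative, not from this patch):

EXTERN void __omp_offloading_example_kernel(ident_t *Ident /*, captured args */) {
  // Generic mode: worker threads park in the state machine inside
  // __kmpc_target_init and only return once the main thread signals exit.
  int32_t TId = __kmpc_target_init(Ident, /*Mode=*/OMP_TGT_EXEC_MODE_GENERIC,
                                   /*UseGenericStateMachine=*/true,
                                   /*RequiresFullRuntime=*/true);
  if (TId != -1)
    return; // a worker: its work loop has already finished
  // ... sequential target region code, executed by the main thread only ...
  __kmpc_target_deinit(Ident, OMP_TGT_EXEC_MODE_GENERIC,
                       /*RequiresFullRuntime=*/true);
}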
@@ -1,341 +0,0 @@
//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Parallel implementation in the GPU. Here is the pattern:
//
//    while (not finished) {
//      if (master) {
//        sequential code, decide which par loop to do, or if finished
//        __kmpc_kernel_prepare_parallel() // exec by master only
//      }
//      syncthreads // A
//      __kmpc_kernel_parallel() // exec by all
//      if (this thread is included in the parallel) {
//        switch () for all parallel loops
//        __kmpc_kernel_end_parallel() // exec only by threads in parallel
//      }
//    }
//
// The reason we don't exec end_parallel for the threads not included
// in the parallel loop is that for each barrier in the parallel
// region, these non-included threads will cycle through the
// syncthread A. Thus they must preserve their current threadId, which
// is larger than any thread id in the team.
//
// To make a long story short...
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target_impl.h"

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes parallel (1 static level only)
////////////////////////////////////////////////////////////////////////////////

INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause,
                                                uint16_t NThreadsICV,
                                                uint16_t ThreadLimit) {
  uint16_t ThreadsRequested = NThreadsICV;
  if (NumThreadsClause != 0) {
    ThreadsRequested = NumThreadsClause;
  }

  uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam();
  if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) {
    ThreadsAvailable = ThreadLimit;
  }

  uint16_t NumThreads = ThreadsAvailable;
  if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) {
    NumThreads = ThreadsRequested;
  }

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  // On Volta and newer architectures we require that all lanes in
  // a warp participate in the parallel region. Round down to a
  // multiple of WARPSIZE since it is legal to do so in OpenMP.
  if (NumThreads < WARPSIZE) {
    NumThreads = 1;
  } else {
    NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1));
  }
#endif

  return NumThreads;
}

// This routine is always called by the team master.
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                           kmp_int32 NumThreadsClause) {
  PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n");

  omptarget_nvptx_workFn = WorkFn;

  // This routine is only called by the team master. The team master is
  // the first thread of the last warp. It always has the logical thread
  // id of 0 (since it is a shadow for the first worker thread).
  const int threadId = 0;
  omptarget_nvptx_TaskDescr *currTaskDescr =
      omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
  ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
  ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
          "cannot be called in a parallel region.");
  if (currTaskDescr->InParallelRegion()) {
    PRINT0(LD_PAR, "already in parallel: go seq\n");
    return;
  }

  uint16_t NumThreads =
      determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);

  if (NumThreadsClause != 0) {
    // Reset request to avoid propagating to successive #parallel
    NumThreadsClause = 0;
  }

  ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads",
         (int)NumThreads);
  ASSERT0(LT_FUSSY,
          __kmpc_get_hardware_thread_id_in_block() == GetMasterThreadID(),
          "only team master can create parallel");

  // Set number of threads on work descriptor.
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
  threadsInTeam = NumThreads;
}

// All workers call this function. Deactivate those not needed.
// Fn - the outlined work function to execute.
// Returns True if this thread is active, else False.
//
// Only the worker threads call this routine.
EXTERN bool __kmpc_kernel_parallel(void **WorkFn) {
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n");

  // Work function and arguments for L1 parallel region.
  *WorkFn = omptarget_nvptx_workFn;

  // If this is the termination signal from the master, quit early.
  if (!*WorkFn) {
    PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n");
    return false;
  }

  // Only the worker threads call this routine and the master warp
  // never arrives here. Therefore, use the nvptx thread id.
  int threadId = __kmpc_get_hardware_thread_id_in_block();
  omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
  // Set to true for workers participating in the parallel region.
  bool isActive = false;
  // Initialize state for active threads.
  if (threadId < threadsInTeam) {
    // init task descriptor from the work descriptor
    omptarget_nvptx_TaskDescr *newTaskDescr =
        omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
    ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
    newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr());
    // install new top descriptor
    omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                               newTaskDescr);
    // init private from init value
    PRINT(LD_PAR,
          "thread will execute parallel region with id %d in a team of "
          "%d threads\n",
          (int)newTaskDescr->ThreadId(), (int)nThreads);

    isActive = true;
  }

  return isActive;
}

EXTERN void __kmpc_kernel_end_parallel() {
  // pop stack
  PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n");
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");

  // Only the worker threads call this routine and the master warp
  // never arrives here. Therefore, use the nvptx thread id.
  int threadId = __kmpc_get_hardware_thread_id_in_block();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
}

////////////////////////////////////////////////////////////////////////////////
// support for parallel that goes sequential
////////////////////////////////////////////////////////////////////////////////

static void serializedParallel(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to serializedParallel\n");

  IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

  if (isRuntimeUninitialized()) {
    ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // assume this is only called for nested parallel
  int threadId = GetLogicalThreadIdInBlock();

  // Unlike an actual parallel, threads in the same team do not share
  // the workTaskDescr in this case, and num threads is fixed to 1.

  // get current task
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->SaveLoopData();

  // Allocate a new task descriptor, copy values from the current one, and
  // set its prev to it.
  omptarget_nvptx_TaskDescr *newTaskDescr =
      (omptarget_nvptx_TaskDescr *)SafeMalloc(
          sizeof(omptarget_nvptx_TaskDescr), "new seq parallel task");
  newTaskDescr->CopyParent(currTaskDescr);

  // tweak values for serialized parallel case:
  // - each thread becomes ID 0 in its serialized parallel, and
  // - there is only one thread per team
  newTaskDescr->ThreadId() = 0;

  // set new task descriptor as top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
                                                             newTaskDescr);
}

static void endSerializedParallel(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to endSerializedParallel\n");

  DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask());

  if (isRuntimeUninitialized()) {
    ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
            "Expected SPMD mode with uninitialized runtime.");
    return;
  }

  // pop stack
  int threadId = GetLogicalThreadIdInBlock();
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  // set new top
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
      threadId, currTaskDescr->GetPrevTaskDescr());
  // free
  SafeFree(currTaskDescr, "new seq parallel task");
  currTaskDescr = getMyTopTaskDescriptor(threadId);
  currTaskDescr->RestoreLoopData();
}

NOINLINE EXTERN uint8_t __kmpc_parallel_level() {
  return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1);
}

// This kmpc call returns the thread id across all teams. Its value is
// cached by the compiler and used when calling the runtime. On nvptx
// it's cheap to recalculate this value so we never use the result
// of this call.
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) {
  return GetOmpThreadId();
}

////////////////////////////////////////////////////////////////////////////////
// push params
////////////////////////////////////////////////////////////////////////////////

// Do nothing. The host guarantees we started the requested number of
// teams and we only need inspection of gridDim.

EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid,
                                  int32_t num_teams, int32_t thread_limit) {
  PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams);
  ASSERT0(LT_FUSSY, 0, "should never have anything with new teams on device");
}

EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, int proc_bind) {
  PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind);
}

////////////////////////////////////////////////////////////////////////////////
// parallel interface
////////////////////////////////////////////////////////////////////////////////

NOINLINE EXTERN void __kmpc_parallel_51(kmp_Ident *ident, kmp_int32 global_tid,
                                        kmp_int32 if_expr,
                                        kmp_int32 num_threads, int proc_bind,
                                        void *fn, void *wrapper_fn, void **args,
                                        size_t nargs) {
  // Handle the serialized case first, same for SPMD/non-SPMD, except that in
  // SPMD mode we already incremented the parallel level counter; account for
  // that.
  bool InParallelRegion =
      (__kmpc_parallel_level() > __kmpc_is_spmd_exec_mode());
  if (!if_expr || InParallelRegion) {
    serializedParallel(ident, global_tid);
    __kmp_invoke_microtask(global_tid, 0, fn, args, nargs);
    endSerializedParallel(ident, global_tid);
    return;
  }

  if (__kmpc_is_spmd_exec_mode()) {
    __kmp_invoke_microtask(global_tid, 0, fn, args, nargs);
    return;
  }

  __kmpc_kernel_prepare_parallel((void *)wrapper_fn, num_threads);

  if (nargs) {
    void **GlobalArgs;
    __kmpc_begin_sharing_variables(&GlobalArgs, nargs);
    // TODO: faster memcpy?
#pragma unroll
    for (int I = 0; I < nargs; I++)
      GlobalArgs[I] = args[I];
  }

  // TODO: what if that's a parallel region with a single thread? this is
  // considered not active in the existing implementation.
  bool IsActiveParallelRegion = threadsInTeam != 1;
  int NumWarps =
      threadsInTeam / WARPSIZE + ((threadsInTeam % WARPSIZE) ? 1 : 0);
  // Increment parallel level for non-SPMD warps.
  for (int I = 0; I < NumWarps; ++I)
    parallelLevel[I] +=
        (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));

  // Master signals work to activate workers.
  __kmpc_barrier_simple_spmd(ident, 0);

  // OpenMP [2.5, Parallel Construct, p.49]
  // There is an implied barrier at the end of a parallel region. After the
  // end of a parallel region, only the master thread of the team resumes
  // execution of the enclosing task region.
  //
  // The master waits at this barrier until all workers are done.
  __kmpc_barrier_simple_spmd(ident, 0);

  // Decrement parallel level for non-SPMD warps.
  for (int I = 0; I < NumWarps; ++I)
    parallelLevel[I] -=
        (1 + (IsActiveParallelRegion ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
  // TODO: Is synchronization needed since out of parallel execution?

  if (nargs)
    __kmpc_end_sharing_variables();

  // TODO: proc_bind is a noop?
  // if (proc_bind != proc_bind_default)
  //   __kmpc_push_proc_bind(ident, global_tid, proc_bind);
}

#pragma omp end declare target
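The clamping in determineNumberOfThreads is easiest to see with concrete numbers. Expected results on a Volta-class device, assuming WARPSIZE == 32 and 992 available workers (inputs illustrative):

//   NumThreadsClause  NThreadsICV  ThreadLimit  ->  NumThreads
//   0                 0            0            ->  992  (all workers; 31 * 32)
//   50                0            0            ->  32   (50 rounded down to a warp)
//   20                0            0            ->  1    (below a full warp)
//   0                 100          64           ->  64   (limit wins; already a
//                                                         multiple of 32)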
@@ -1,309 +0,0 @@
//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target/shuffle.h"
#include "target_impl.h"

EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}

EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}

INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                           kmp_ShuffleReductFctPtr shflFct) {
  for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}

INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
                                             kmp_ShuffleReductFctPtr shflFct,
                                             uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ < 700
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = __kmpc_get_hardware_thread_id_in_block() % WARPSIZE;
  __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt();
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
  __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt();
  do {
    Liveness = __kmpc_impl_activemask();
    remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
    size = __kmpc_impl_popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}
#endif

INLINE
static int32_t nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t BlockThreadId = GetLogicalThreadIdInBlock();
  uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = BlockThreadId / WARPSIZE;

  // Volta execution model:
  // For the Generic execution mode a parallel region either has 1 thread or,
  // beyond that, always a multiple of 32. For the SPMD execution mode we may
  // have any number of threads.
  if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes through this case.
    gpu_irregular_warp_reduce(
        reduce_data, shflFct,
        /*LaneCount=*/NumThreads % WARPSIZE,
        /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask();
  if (Liveness == __kmpc_impl_all_lanes) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(
        reduce_data, shflFct,
        /*LaneCount=*/__kmpc_impl_popc(Liveness),
        /*LaneId=*/__kmpc_get_hardware_thread_id_in_block() % WARPSIZE);
  else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in an L2
                                    // parallel region may enter here; return
                                    // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  //
  // Only an L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  } else if (isRuntimeUninitialized /* Never an L2 parallel region without the
                                       OMP runtime */) {
    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      __kmpc_is_spmd_exec_mode(), isRuntimeUninitialized());
}

INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) {
  return !__kmpc_is_spmd_exec_mode() || IsTeamMaster(ThreadId);
}

INLINE static uint32_t roundToWarpsize(uint32_t s) {
  if (s < WARPSIZE)
    return 1;
  return (s & ~(unsigned)(WARPSIZE - 1));
}

INLINE static uint32_t kmpcMin(uint32_t x, uint32_t y) { return x < y ? x : y; }

static volatile uint32_t IterCnt = 0;
static volatile uint32_t Cnt = 0;
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, void *global_buffer,
    int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
    kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
    kmp_ListGlobalFctPtr glredFct) {

  // Terminate all threads in non-SPMD mode except for the master thread.
  if (!__kmpc_is_spmd_exec_mode() &&
      !__kmpc_is_generic_main_thread(__kmpc_get_hardware_thread_id_in_block()))
    return 0;

  uint32_t ThreadId = GetLogicalThreadIdInBlock();

  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads =
      __kmpc_is_spmd_exec_mode()
          ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
          : /*Master thread only*/ 1;
  uint32_t TeamId = GetBlockIdInKernel();
  uint32_t NumTeams = __kmpc_get_hardware_num_blocks();
  static unsigned SHARED(Bound);
  static unsigned SHARED(ChunkTeamCount);

  // Block progress for teams greater than the current upper
  // limit. We always only allow a number of teams less than or equal
  // to the number of slots in the buffer.
  bool IsMaster = isMaster(loc, ThreadId);
  while (IsMaster) {
    // Atomic read
    Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u);
    if (TeamId < Bound + num_of_records)
      break;
  }

  if (IsMaster) {
    int ModBockId = TeamId % num_of_records;
    if (TeamId < num_of_records)
      lgcpyFct(global_buffer, ModBockId, reduce_data);
    else
      lgredFct(global_buffer, ModBockId, reduce_data);
    __kmpc_impl_threadfence_system();

    // Increment team counter.
    // This counter is incremented by all teams in the current
    // BUFFER_SIZE chunk.
    ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u);
  }
  // Synchronize
  if (__kmpc_is_spmd_exec_mode())
    __kmpc_barrier(loc, global_tid);

  // reduce_data is global or shared so before being reduced within the
  // warp we need to bring it into local memory:
  //   local_reduce_data = reduce_data[i]
  //
  // Example for 3 reduction variables a, b, c (of potentially different
  // types):
  //
  // buffer layout (struct of arrays):
  //   a, a, ..., a, b, b, ... b, c, c, ... c
  //   |__________|
  //   num_of_records
  //
  // local_data_reduce layout (struct):
  //   a, b, c
  //
  // Each thread will have a local struct containing the values to be
  // reduced:
  //   1. do reduction within each warp.
  //   2. do reduction across warps.
  //   3. write the final result to the main reduction variable
  //      by returning 1 in the thread holding the reduction result.

  // Check if this is the very last team.
  unsigned NumRecs = kmpcMin(NumTeams, uint32_t(num_of_records));
  if (ChunkTeamCount == NumTeams - Bound - 1) {
    //
    // Last team processing.
    //
    if (ThreadId >= NumRecs)
      return 0;
    NumThreads = roundToWarpsize(kmpcMin(NumThreads, NumRecs));
    if (ThreadId >= NumThreads)
      return 0;

    // Load from buffer and reduce.
    glcpyFct(global_buffer, ThreadId, reduce_data);
    for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads)
      glredFct(global_buffer, i, reduce_data);

    // Reduce across warps to the warp master.
    if (NumThreads > 1) {
      gpu_regular_warp_reduce(reduce_data, shflFct);

      // When we have more than [warpsize] number of threads
      // a block reduction is performed here.
      uint32_t ActiveThreads = kmpcMin(NumRecs, NumThreads);
      if (ActiveThreads > WARPSIZE) {
        uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
        // Gather all the reduced values from each warp
        // to the first warp.
        cpyFct(reduce_data, WarpsNeeded);

        uint32_t WarpId = ThreadId / WARPSIZE;
        if (WarpId == 0)
          gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                    ThreadId);
      }
    }

    if (IsMaster) {
      Cnt = 0;
      IterCnt = 0;
      return 1;
    }
    return 0;
  }
  if (IsMaster && ChunkTeamCount == num_of_records - 1) {
    // Allow SIZE number of teams to proceed writing their
    // intermediate results to the global buffer.
    __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records));
  }

  return 0;
}

#pragma omp end declare target
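The `shflFct` these routines call is compiler-generated per reduction clause. A minimal hand-written sketch for a single `int` reduction variable, assuming the four-argument shuffle-reduce signature used above (the guards that the real generated code applies for AlgoVersion 1 and 2 are omitted):

static void exampleShuffleReduceFn(void *reduce_data, int16_t lane_id,
                                   int16_t lane_offset, int16_t algo_version) {
  int *data = (int *)reduce_data;
  // Fetch the value held lane_offset lanes below and accumulate it; with
  // AlgoVersion 0 (full warp) every lane does this unconditionally, which is
  // exactly how gpu_regular_warp_reduce drives it with halving offsets.
  int remote = __kmpc_shuffle_int32(*data, lane_offset, WARPSIZE);
  *data += remote;
}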
@@ -1,29 +0,0 @@
//===--- shuffle.cpp - Implementation of the external shuffle idiom API -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//===----------------------------------------------------------------------===//

#include "target/shuffle.h"

#pragma omp declare target

static constexpr uint64_t AllLanes = -1;

int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
  return __kmpc_impl_shfl_down_sync(AllLanes, val, delta, size);
}

int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
  uint32_t lo, hi;
  __kmpc_impl_unpack(val, lo, hi);
  hi = __kmpc_impl_shfl_down_sync(AllLanes, hi, delta, size);
  lo = __kmpc_impl_shfl_down_sync(AllLanes, lo, delta, size);
  return __kmpc_impl_pack(lo, hi);
}

#pragma omp end declare target
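The 64-bit variant works by shuffling the two 32-bit halves separately and re-packing them, since the underlying shuffle primitive moves 32 bits at a time. A usage sketch (an illustrative helper, not part of this file) that sums a value across a warp with the exported idiom:

static int64_t exampleWarpSum(int64_t val) {
  // Tree reduction with halving offsets. After the loop, lane 0 holds the
  // warp-wide total; higher lanes hold partial sums, because shfl_down
  // leaves lanes whose source is out of range unchanged.
  for (int16_t offset = WARPSIZE / 2; offset > 0; offset /= 2)
    val += __kmpc_shuffle_int64(val, offset, WARPSIZE);
  return val;
}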
@@ -1,240 +0,0 @@
//===--------- support.cu - GPU OpenMP support functions --------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Wrapper implementations of some functions natively supported by the GPU.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/debug.h"
#include "common/omptarget.h"
#include "common/support.h"

////////////////////////////////////////////////////////////////////////////////
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////

void setExecutionParameters(OMPTgtExecModeFlags EMode,
                            OMPTgtRuntimeModeFlags RMode) {
  execution_param = EMode;
  execution_param |= RMode;
}

bool isGenericMode() { return execution_param & OMP_TGT_EXEC_MODE_GENERIC; }

bool isRuntimeUninitialized() { return !isRuntimeInitialized(); }

bool isRuntimeInitialized() {
  return execution_param & OMP_TGT_RUNTIME_INITIALIZED;
}

////////////////////////////////////////////////////////////////////////////////
// support: get info from machine
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
//
// Calls to the Generic Scheme Implementation Layer (assuming 1D layout)
//
////////////////////////////////////////////////////////////////////////////////

// The master thread id is the first thread (lane) of the last warp.
// Thread id is 0 indexed.
// E.g: If NumThreads is 33, master id is 32.
//      If NumThreads is 64, master id is 32.
//      If NumThreads is 97, master id is 96.
//      If NumThreads is 1024, master id is 992.
//
// Called in Generic Execution Mode only.
int GetMasterThreadID() {
  return (__kmpc_get_hardware_num_threads_in_block() - 1) & ~(WARPSIZE - 1);
}

// The last warp is reserved for the master; other warps are workers.
// Called in Generic Execution Mode only.
int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); }

////////////////////////////////////////////////////////////////////////////////
// get thread id in team

// This function may be called in a parallel region by the workers
// or a serial region by the master. If the master (whose CUDA thread
// id is GetMasterThreadID()) calls this routine, we return 0 because
// it is a shadow for the first worker.
int GetLogicalThreadIdInBlock() {
  // Implemented using control flow (predication) instead of with a modulo
  // operation.
  int tid = __kmpc_get_hardware_thread_id_in_block();
  if (__kmpc_is_generic_main_thread(tid))
    return 0;
  else
    return tid;
}

////////////////////////////////////////////////////////////////////////////////
//
// OpenMP Thread Support Layer
//
////////////////////////////////////////////////////////////////////////////////

int GetOmpThreadId() {
  int tid = __kmpc_get_hardware_thread_id_in_block();
  if (__kmpc_is_generic_main_thread(tid))
    return 0;
  // omp_thread_num
  int rc;
  if (__kmpc_parallel_level() > 1) {
    rc = 0;
  } else if (__kmpc_is_spmd_exec_mode()) {
    rc = tid;
  } else {
    omptarget_nvptx_TaskDescr *currTaskDescr =
        omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
    ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
    rc = currTaskDescr->ThreadId();
  }
  return rc;
}

int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
  // omp_num_threads
  int rc;
  int Level = parallelLevel[GetWarpId()];
  if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) {
    rc = 1;
  } else if (isSPMDExecutionMode) {
    rc = __kmpc_get_hardware_num_threads_in_block();
  } else {
    rc = threadsInTeam;
  }

  return rc;
}

////////////////////////////////////////////////////////////////////////////////
// Team id linked to OpenMP

int GetOmpTeamId() {
  // omp_team_num
  return GetBlockIdInKernel(); // assume 1 block per team
}

int GetNumberOfOmpTeams() {
  // omp_num_teams
  return __kmpc_get_hardware_num_blocks(); // assume 1 block per team
}

////////////////////////////////////////////////////////////////////////////////
// Masters

int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }

////////////////////////////////////////////////////////////////////////////////
// Parallel level

void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
  __kmpc_impl_syncwarp(Mask);
  __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
  unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
  if (Rank == 0) {
    parallelLevel[GetWarpId()] +=
        (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
    __kmpc_impl_threadfence();
  }
  __kmpc_impl_syncwarp(Mask);
}

void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) {
  __kmpc_impl_syncwarp(Mask);
  __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt();
  unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt);
  if (Rank == 0) {
    parallelLevel[GetWarpId()] -=
        (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
    __kmpc_impl_threadfence();
  }
  __kmpc_impl_syncwarp(Mask);
}

////////////////////////////////////////////////////////////////////////////////
// get OpenMP number of procs

// Get the number of processors in the device.
int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) {
  if (!isSPMDExecutionMode)
    return GetNumberOfWorkersInTeam();
  return __kmpc_get_hardware_num_threads_in_block();
}

int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) {
  return GetNumberOfProcsInDevice(isSPMDExecutionMode);
}

////////////////////////////////////////////////////////////////////////////////
// Memory
////////////////////////////////////////////////////////////////////////////////

unsigned long PadBytes(unsigned long size,
                       unsigned long alignment) // must be a power of 2
{
  // compute the necessary padding to satisfy alignment constraint
  ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0,
         "alignment %lu is not a power of 2\n", alignment);
  return (~(unsigned long)size + 1) & (alignment - 1);
}

void *SafeMalloc(size_t size, const char *msg) // check if success
{
  void *ptr = __kmpc_impl_malloc(size);
  PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n",
        (unsigned long long)size, msg, (unsigned long long)ptr);
  return ptr;
}

void *SafeFree(void *ptr, const char *msg) {
  PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg);
  __kmpc_impl_free(ptr);
  return NULL;
}

////////////////////////////////////////////////////////////////////////////////
// Teams Reduction Scratchpad Helpers
////////////////////////////////////////////////////////////////////////////////

unsigned int *GetTeamsReductionTimestamp() {
  return static_cast<unsigned int *>(ReductionScratchpadPtr);
}

char *GetTeamsReductionScratchpad() {
  return static_cast<char *>(ReductionScratchpadPtr) + 256;
}

// Invoke an outlined parallel function unwrapping arguments (up to 32).
void __kmp_invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid, void *fn,
                            void **args, size_t nargs) {
  switch (nargs) {
#include "common/generated_microtask_cases.gen"
  default:
    printf("Too many arguments in kmp_invoke_microtask, aborting execution.\n");
    __builtin_trap();
  }
}

namespace _OMP {
/// Helper to keep code alive without introducing a performance penalty.
__attribute__((used, retain, weak, optnone, cold)) void keepAlive() {
  __kmpc_get_hardware_thread_id_in_block();
  __kmpc_get_hardware_num_threads_in_block();
  __kmpc_get_warp_size();
  __kmpc_barrier_simple_spmd(nullptr, 0);
  __kmpc_barrier_simple_generic(nullptr, 0);
}
} // namespace _OMP

#pragma omp end declare target
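PadBytes computes (-size) mod alignment, i.e. the distance from size up to the next multiple of alignment. A few concrete values with alignment 8 (a power of two, as the assert requires):

//   PadBytes(24, 8) == 0   // already a multiple of 8
//   PadBytes(27, 8) == 5   // 27 + 5 == 32
//   PadBytes(1, 8)  == 7   // 1 + 7 == 8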
@@ -1,143 +0,0 @@
//===------------ sync.cu - GPU OpenMP synchronizations ---------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Include all synchronization.
//
//===----------------------------------------------------------------------===//
#pragma omp declare target

#include "common/omptarget.h"
#include "target_impl.h"

////////////////////////////////////////////////////////////////////////////////
// KMP Ordered calls
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_ordered\n");
}

EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_end_ordered\n");
}

////////////////////////////////////////////////////////////////////////////////
// KMP Barriers
////////////////////////////////////////////////////////////////////////////////

// A team is a block: we can use the CUDA native synchronization mechanism.
// FIXME: what if not all threads (warps) participate in the barrier?
// We may need to implement it differently.

EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) {
  PRINT0(LD_IO, "call kmpc_cancel_barrier\n");
  __kmpc_barrier(loc_ref, tid);
  PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n");
  return 0;
}

EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
  if (isRuntimeUninitialized()) {
    ASSERT0(LT_FUSSY, __kmpc_is_spmd_exec_mode(),
            "Expected SPMD mode with uninitialized runtime.");
    __kmpc_barrier_simple_spmd(loc_ref, tid);
  } else {
    tid = GetLogicalThreadIdInBlock();
    int numberOfActiveOMPThreads =
        GetNumberOfOmpThreads(__kmpc_is_spmd_exec_mode());
    if (numberOfActiveOMPThreads > 1) {
      if (__kmpc_is_spmd_exec_mode()) {
        __kmpc_barrier_simple_spmd(loc_ref, tid);
      } else {
        // The #threads parameter must be rounded up to the WARPSIZE.
        int threads =
            WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE);

        PRINT(LD_SYNC,
              "call kmpc_barrier with %d omp threads, sync parameter %d\n",
              (int)numberOfActiveOMPThreads, (int)threads);
        __kmpc_impl_named_sync(threads);
      }
    } else {
      // Still need to flush the memory per the standard.
      __kmpc_flush(loc_ref);
    } // numberOfActiveOMPThreads > 1
    PRINT0(LD_SYNC, "completed kmpc_barrier\n");
  }
}

// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0
// parallel region and that all worker threads participate.
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) {
  PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n");
  __kmpc_impl_syncthreads();
  PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n");
}
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) {
  return __kmpc_barrier_simple_spmd(loc_ref, tid);
}

////////////////////////////////////////////////////////////////////////////////
// KMP MASTER
////////////////////////////////////////////////////////////////////////////////

EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) {
  PRINT0(LD_IO, "call kmpc_master\n");
  return IsTeamMaster(global_tid);
}

EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) {
  PRINT0(LD_IO, "call kmpc_end_master\n");
  ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
}

////////////////////////////////////////////////////////////////////////////////
// KMP SINGLE
////////////////////////////////////////////////////////////////////////////////

EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) {
  PRINT0(LD_IO, "call kmpc_single\n");
  // We decided to implement single with master; the master gets the single.
  return IsTeamMaster(global_tid);
}

EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) {
  PRINT0(LD_IO, "call kmpc_end_single\n");
  // We decided to implement single with master: the master gets the single.
  ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here");
  // The sync barrier is explicitly called... so that is not a problem.
}

////////////////////////////////////////////////////////////////////////////////
// Flush
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_flush(kmp_Ident *loc) {
  PRINT0(LD_IO, "call kmpc_flush\n");
  __kmpc_impl_threadfence();
}

////////////////////////////////////////////////////////////////////////////////
// Vote
////////////////////////////////////////////////////////////////////////////////

EXTERN uint64_t __kmpc_warp_active_thread_mask(void) {
  PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n");
  return __kmpc_impl_activemask();
}

////////////////////////////////////////////////////////////////////////////////
// Syncwarp
////////////////////////////////////////////////////////////////////////////////

EXTERN void __kmpc_syncwarp(uint64_t Mask) {
  PRINT0(LD_IO, "call __kmpc_syncwarp\n");
  __kmpc_impl_syncwarp(Mask);
}

#pragma omp end declare target
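The warp rounding in __kmpc_barrier matters because named barriers synchronize whole warps, not individual threads. A worked instance of the formula above, assuming WARPSIZE == 32:

//   numberOfActiveOMPThreads == 33:
//     threads = 32 * ((33 + 31) / 32) == 64
//   so __kmpc_impl_named_sync(64) is issued and both warps holding the
//   33 workers take part, even though the second warp is only partially
//   occupied.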
@ -1,219 +0,0 @@
|
|||
//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Task implementation support.
|
||||
//
|
||||
// explicit task structure uses
|
||||
// omptarget_nvptx task
|
||||
// kmp_task
|
||||
//
|
||||
// where kmp_task is
|
||||
// - klegacy_TaskDescr <- task pointer
|
||||
// shared -> X
|
||||
// routine
|
||||
// part_id
|
||||
// descr
|
||||
// - private (of size given by task_alloc call). Accessed by
|
||||
// task+sizeof(klegacy_TaskDescr)
|
||||
// * private data *
|
||||
// - shared: X. Accessed by shared ptr in klegacy_TaskDescr
|
||||
// * pointer table to shared variables *
|
||||
// - end
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#pragma omp declare target
|
||||
|
||||
#include "common/omptarget.h"
|
||||
|
||||
EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(
|
||||
kmp_Ident *loc, // unused
|
||||
uint32_t global_tid, // unused
|
||||
int32_t flag, // unused (because in our impl, all are immediately exec
|
||||
size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable,
|
||||
kmp_TaskFctPtr taskSub) {
|
||||
PRINT(LD_IO,
|
||||
"call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, "
|
||||
"fct 0x%llx)\n",
|
||||
(long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable,
|
||||
(unsigned long long)taskSub);
|
||||
// want task+priv to be a multiple of 8 bytes
|
||||
size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *));
|
||||
sizeOfTaskInclPrivate += padForTaskInclPriv;
|
||||
size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable;
|
||||
ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0,
|
||||
"need task descr of size %d to be a multiple of %d\n",
|
||||
(int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *));
|
||||
size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize;
|
||||
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
|
||||
(omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc(
|
||||
totSize, "explicit task descriptor");
|
||||
kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr;
|
||||
ASSERT0(LT_FUSSY,
|
||||
(uint64_t)newKmpTaskDescr ==
|
||||
(uint64_t)ADD_BYTES(newExplicitTaskDescr,
|
||||
sizeof(omptarget_nvptx_TaskDescr)),
|
||||
"bad size assumptions");
|
||||
// init kmp_TaskDescr
|
||||
newKmpTaskDescr->sharedPointerTable =
|
||||
(void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate);
|
||||
newKmpTaskDescr->sub = taskSub;
|
||||
newKmpTaskDescr->destructors = NULL;
|
||||
PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n",
|
||||
(unsigned long long)newKmpTaskDescr,
|
||||
(unsigned long long)newExplicitTaskDescr);
|
||||
|
||||
return newKmpTaskDescr;
|
||||
}
|
||||
|
||||
EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid,
|
||||
kmp_TaskDescr *newKmpTaskDescr) {
|
||||
return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0,
|
||||
0);
|
||||
}
|
||||
|
||||
EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
|
||||
kmp_TaskDescr *newKmpTaskDescr,
|
||||
int32_t depNum, void *depList,
|
||||
int32_t noAliasDepNum,
|
||||
void *noAliasDepList) {
|
||||
PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n",
|
||||
P64(newKmpTaskDescr));
|
||||
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
|
||||
"Runtime must be initialized.");
|
||||
// 1. get explicit task descr from kmp task descr
|
||||
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
|
||||
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
|
||||
newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
|
||||
ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
|
||||
"bad assumptions");
|
||||
omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
|
||||
ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
|
||||
"bad assumptions");
|
||||
|
||||
// 2. push new context: update new task descriptor
|
||||
int tid = GetLogicalThreadIdInBlock();
|
||||
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
|
||||
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
|
||||
// set new task descriptor as top
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
|
||||
|
||||
// 3. call sub
|
||||
PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n",
|
||||
(unsigned long long)newKmpTaskDescr->sub,
|
||||
(unsigned long long)newKmpTaskDescr);
|
||||
newKmpTaskDescr->sub(0, newKmpTaskDescr);
|
||||
PRINT(LD_TASK, "return from call task sub 0x%llx()\n",
|
||||
(unsigned long long)newKmpTaskDescr->sub);
|
||||
|
||||
// 4. pop context
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
|
||||
parentTaskDescr);
|
||||
// 5. free
|
||||
SafeFree(newExplicitTaskDescr, "explicit task descriptor");
|
||||
return 0;
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
|
||||
kmp_TaskDescr *newKmpTaskDescr) {
|
||||
PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n",
|
||||
(unsigned long long)newKmpTaskDescr);
|
||||
ASSERT0(LT_FUSSY, isRuntimeInitialized(),
|
||||
"Runtime must be initialized.");
|
||||
// 1. get explicit task descr from kmp task descr
|
||||
omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
|
||||
(omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
|
||||
newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
|
||||
ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
|
||||
"bad assumptions");
|
||||
omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
|
||||
ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
|
||||
"bad assumptions");
|
||||
|
||||
// 2. push new context: update new task descriptor
|
||||
int tid = GetLogicalThreadIdInBlock();
|
||||
omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid);
|
||||
newTaskDescr->CopyForExplicitTask(parentTaskDescr);
|
||||
// set new task descriptor as top
|
||||
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr);
|
||||
// 3... noting to call... is inline
|
||||
// 4 & 5 ... done in complete
|
||||
}

EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
                                         kmp_TaskDescr *newKmpTaskDescr) {
  PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n",
        (unsigned long long)newKmpTaskDescr);
  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
  // 1. get explicit task descr from kmp task descr
  omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr =
      (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES(
          newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr));
  ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr,
          "bad assumptions");
  omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr;
  ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr,
          "bad assumptions");
  // 2. get parent
  omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr();
  // 3... nothing to call... is inlined
  // 4. pop context
  int tid = GetLogicalThreadIdInBlock();
  omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid,
                                                             parentTaskDescr);
  // 5. free
  SafeFree(newExplicitTaskDescr, "explicit task descriptor");
}

EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid,
                                 int32_t depNum, void *depList,
                                 int32_t noAliasDepNum, void *noAliasDepList) {
  PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n");
  // nothing to do as all our tasks are executed as final
}

EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n");
  // nothing to do as all our tasks are executed as final
}

EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n");
  // nothing to do as all our tasks are executed as final
}

EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid,
                                    int end_part) {
  PRINT0(LD_IO, "call to __kmpc_taskyield()\n");
  // do nothing: tasks are executed immediately, no yielding allowed
  return 0;
}

EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid) {
  PRINT0(LD_IO, "call to __kmpc_taskwait()\n");
  // nothing to do as all our tasks are executed as final
  return 0;
}

EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid,
                            kmp_TaskDescr *newKmpTaskDescr, int if_val,
                            uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
                            int32_t sched, uint64_t grainsize, void *task_dup) {

  // skip task entirely if empty iteration space
  if (*lb > *ub)
    return;

  // the compiler has already stored lb and ub in the kmp_TaskDescr structure
  // as we are using a single task to execute the entire loop, we can leave
  // the initial task_t untouched

  __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0);
}

#pragma omp end declare target

@ -1,51 +0,0 @@
//===--------- state-queue.h - NVPTX OpenMP GPU State Queue ------ CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains a queue to hand out OpenMP state objects to teams of
// one or more kernels.
//
// Reference:
// Thomas R.W. Scogland and Wu-chun Feng. 2015.
// Design and Evaluation of Scalable Concurrent Queues for Many-Core
// Architectures. International Conference on Performance Engineering.
//
//===----------------------------------------------------------------------===//

#ifndef __STATE_QUEUE_H
#define __STATE_QUEUE_H

#include <stdint.h>

#include "target_impl.h"

template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
private:
  ElementType elements[SIZE];
  volatile ElementType *elementQueue[SIZE];
  volatile uint32_t head;
  volatile uint32_t ids[SIZE];
  volatile uint32_t tail;

  static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
  INLINE uint32_t ENQUEUE_TICKET();
  INLINE uint32_t DEQUEUE_TICKET();
  INLINE static uint32_t ID(uint32_t ticket);
  INLINE bool IsServing(uint32_t slot, uint32_t id);
  INLINE void PushElement(uint32_t slot, ElementType *element);
  INLINE ElementType *PopElement(uint32_t slot);
  INLINE void DoneServing(uint32_t slot, uint32_t id);

public:
  INLINE omptarget_nvptx_Queue() {}
  INLINE void Enqueue(ElementType *element);
  INLINE ElementType *Dequeue();
};

#include "state-queuei.h"

#endif

@ -1,88 +0,0 @@
//===------- state-queuei.h - OpenMP GPU State Queue ------------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of a queue to hand out OpenMP state
// objects to teams of one or more kernels.
//
// Reference:
// Thomas R.W. Scogland and Wu-chun Feng. 2015.
// Design and Evaluation of Scalable Concurrent Queues for Many-Core
// Architectures. International Conference on Performance Engineering.
//
//===----------------------------------------------------------------------===//

#include "state-queue.h"

template <typename ElementType, uint32_t SIZE>
INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
  return __kmpc_atomic_add((unsigned int *)&tail, 1u);
}

template <typename ElementType, uint32_t SIZE>
INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
  return __kmpc_atomic_add((unsigned int *)&head, 1u);
}

template <typename ElementType, uint32_t SIZE>
INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
  return (ticket / SIZE) * 2;
}

template <typename ElementType, uint32_t SIZE>
INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
                                                                uint32_t id) {
  return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id;
}

template <typename ElementType, uint32_t SIZE>
INLINE void
omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
                                                      ElementType *element) {
  __kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot],
                         (unsigned long long)element);
}

template <typename ElementType, uint32_t SIZE>
INLINE ElementType *
omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
  return (ElementType *)__kmpc_atomic_add(
      (unsigned long long *)&elementQueue[slot], (unsigned long long)0);
}

template <typename ElementType, uint32_t SIZE>
INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
                                                                  uint32_t id) {
  __kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
}

template <typename ElementType, uint32_t SIZE>
INLINE void
omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) {
  uint32_t ticket = ENQUEUE_TICKET();
  uint32_t slot = ticket % SIZE;
  uint32_t id = ID(ticket) + 1;
  while (!IsServing(slot, id))
    ;
  PushElement(slot, element);
  DoneServing(slot, id);
}

template <typename ElementType, uint32_t SIZE>
INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
  uint32_t ticket = DEQUEUE_TICKET();
  uint32_t slot = ticket % SIZE;
  uint32_t id = ID(ticket);
  while (!IsServing(slot, id))
    ;
  ElementType *element = PopElement(slot);
  // This is to populate the queue because of the lack of GPU constructors.
  if (element == 0)
    element = &elements[slot];
  DoneServing(slot, id);
  return element;
}
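
The ticket discipline above serializes producers and consumers per slot: Enqueue waits for an odd serving id and Dequeue for an even one, so each slot strictly alternates between a push and a pop. A hedged usage sketch; the pool instance and element type below are illustrative, not taken from this diff:

```
// Illustrative only: a device-wide pool of per-team state objects.
struct TeamStateTy { /* ... OpenMP state handed out to a team ... */ };

__device__ static omptarget_nvptx_Queue<TeamStateTy, 64> StatePool;

__device__ void exampleKernelLifecycle() {
  // A team leader claims a state object at kernel start. The first
  // Dequeue of a slot falls back to the slot's inline element because
  // GPU globals have no constructors to pre-populate elementQueue.
  TeamStateTy *State = StatePool.Dequeue();

  // ... run the kernel using *State ...

  // Return the object so a team of a later kernel can reuse it.
  StatePool.Enqueue(State);
}
```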

@ -1,91 +0,0 @@
//===--------- support.h - OpenMP GPU support functions ---------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Wrappers for some functions natively supported by the GPU.
//
//===----------------------------------------------------------------------===//

#ifndef OMPTARGET_SUPPORT_H
#define OMPTARGET_SUPPORT_H

#include "interface.h"
#include "target_impl.h"

////////////////////////////////////////////////////////////////////////////////
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////
enum OMPTgtExecModeFlags : int8_t {
  OMP_TGT_EXEC_MODE_GENERIC = 1 << 0,
  OMP_TGT_EXEC_MODE_SPMD = 1 << 1
};

enum OMPTgtRuntimeModeFlags : int8_t {
  OMP_TGT_RUNTIME_UNINITIALIZED = 0,
  OMP_TGT_RUNTIME_INITIALIZED = 1 << 2
};

void setExecutionParameters(OMPTgtExecModeFlags EMode,
                            OMPTgtRuntimeModeFlags RMode);
bool isGenericMode();
bool isRuntimeUninitialized();
bool isRuntimeInitialized();

////////////////////////////////////////////////////////////////////////////////
// get info from machine
////////////////////////////////////////////////////////////////////////////////

// get global ids to locate thread/team info (constant regardless of OMP)
int GetLogicalThreadIdInBlock();
int GetMasterThreadID();
int GetNumberOfWorkersInTeam();

// get OpenMP thread and team ids
int GetOmpThreadId(); // omp_thread_num
int GetOmpTeamId();   // omp_team_num

// get OpenMP number of threads and teams
int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
int GetNumberOfOmpTeams();                           // omp_num_teams

// get OpenMP number of procs
int GetNumberOfProcsInTeam(bool isSPMDExecutionMode);
int GetNumberOfProcsInDevice(bool isSPMDExecutionMode);

// masters
int IsTeamMaster(int ompThreadId);

// Parallel level
void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);
void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask);

////////////////////////////////////////////////////////////////////////////////
// Memory
////////////////////////////////////////////////////////////////////////////////

// safe alloc and free
void *SafeMalloc(size_t size, const char *msg); // check if success
void *SafeFree(void *ptr, const char *msg);
// pad to an alignment (power of 2 only)
unsigned long PadBytes(unsigned long size, unsigned long alignment);
#define ADD_BYTES(_addr, _bytes)                                               \
  ((void *)((char *)((void *)(_addr)) + (_bytes)))
#define SUB_BYTES(_addr, _bytes)                                               \
  ((void *)((char *)((void *)(_addr)) - (_bytes)))

////////////////////////////////////////////////////////////////////////////////
// Teams Reduction Scratchpad Helpers
////////////////////////////////////////////////////////////////////////////////
unsigned int *GetTeamsReductionTimestamp();
char *GetTeamsReductionScratchpad();

// Invoke an outlined parallel function unwrapping global, shared arguments (up
// to 128).
void __kmp_invoke_microtask(kmp_int32 global_tid, kmp_int32 bound_tid, void *fn,
                            void **args, size_t nargs);

#endif
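
PadBytes is only declared in this header. For a power-of-2 alignment, padding helpers of this kind usually reduce to a single round-up mask; a hedged sketch follows. The exact semantics of the deleted implementation are not shown in this diff (it may, for example, return only the number of padding bytes rather than the rounded-up size), so this is an assumption:

```
// Assumed semantics: round size up to the next multiple of alignment.
// Valid only for power-of-2 alignments, matching the comment above.
unsigned long PadBytes(unsigned long size, unsigned long alignment) {
  // Adding alignment - 1, then masking off the low bits, rounds up.
  return (size + alignment - 1) & ~(alignment - 1);
}
```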

@ -1,505 +0,0 @@
//===------- interface.h - OpenMP interface definitions ---------- CUDA -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains all the definitions that are relevant to
// the interface. The first section contains the interface as
// declared by OpenMP. The second section includes the compiler
// specific interfaces.
//
//===----------------------------------------------------------------------===//

#ifndef _INTERFACES_H_
#define _INTERFACES_H_

#include <stddef.h>
#include <stdint.h>

#ifdef __AMDGCN__
#include "amdgcn/src/amdgcn_interface.h"
#endif
#ifdef __CUDACC__
#include "nvptx/src/nvptx_interface.h"
#endif

////////////////////////////////////////////////////////////////////////////////
// OpenMP interface
////////////////////////////////////////////////////////////////////////////////

typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */

typedef enum omp_sched_t {
  omp_sched_static = 1,  /* chunkSize >0 */
  omp_sched_dynamic = 2, /* chunkSize >0 */
  omp_sched_guided = 3,  /* chunkSize >0 */
  omp_sched_auto = 4,    /* no chunkSize */
} omp_sched_t;

typedef enum omp_proc_bind_t {
  omp_proc_bind_false = 0,
  omp_proc_bind_true = 1,
  omp_proc_bind_master = 2,
  omp_proc_bind_close = 3,
  omp_proc_bind_spread = 4
} omp_proc_bind_t;

EXTERN double omp_get_wtick(void);
EXTERN double omp_get_wtime(void);

EXTERN void omp_set_num_threads(int num);
EXTERN int omp_get_num_threads(void);
EXTERN int omp_get_max_threads(void);
EXTERN int omp_get_thread_limit(void);
EXTERN int omp_get_thread_num(void);
EXTERN int omp_get_num_procs(void);
EXTERN int omp_in_parallel(void);
EXTERN int omp_in_final(void);
EXTERN void omp_set_dynamic(int flag);
EXTERN int omp_get_dynamic(void);
EXTERN void omp_set_nested(int flag);
EXTERN int omp_get_nested(void);
EXTERN void omp_set_max_active_levels(int level);
EXTERN int omp_get_max_active_levels(void);
EXTERN int omp_get_level(void);
EXTERN int omp_get_active_level(void);
EXTERN int omp_get_ancestor_thread_num(int level);
EXTERN int omp_get_team_size(int level);

EXTERN void omp_init_lock(omp_lock_t *lock);
EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock);
EXTERN void omp_destroy_lock(omp_lock_t *lock);
EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock);
EXTERN void omp_set_lock(omp_lock_t *lock);
EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock);
EXTERN void omp_unset_lock(omp_lock_t *lock);
EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock);
EXTERN int omp_test_lock(omp_lock_t *lock);
EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock);

EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier);
EXTERN void omp_set_schedule(omp_sched_t kind, int modifier);
EXTERN omp_proc_bind_t omp_get_proc_bind(void);
EXTERN int omp_get_cancellation(void);
EXTERN void omp_set_default_device(int deviceId);
EXTERN int omp_get_default_device(void);
EXTERN int omp_get_num_devices(void);
EXTERN int omp_get_num_teams(void);
EXTERN int omp_get_team_num(void);
EXTERN int omp_get_initial_device(void);
EXTERN int omp_get_max_task_priority(void);

EXTERN void *llvm_omp_get_dynamic_shared();

////////////////////////////////////////////////////////////////////////////////
// file below is swiped from kmpc host interface
////////////////////////////////////////////////////////////////////////////////

////////////////////////////////////////////////////////////////////////////////
// kmp specific types
////////////////////////////////////////////////////////////////////////////////

typedef enum kmp_sched_t {
  kmp_sched_static_chunk = 33,
  kmp_sched_static_nochunk = 34,
  kmp_sched_dynamic = 35,
  kmp_sched_guided = 36,
  kmp_sched_runtime = 37,
  kmp_sched_auto = 38,

  kmp_sched_static_balanced_chunk = 45,

  kmp_sched_static_ordered = 65,
  kmp_sched_static_nochunk_ordered = 66,
  kmp_sched_dynamic_ordered = 67,
  kmp_sched_guided_ordered = 68,
  kmp_sched_runtime_ordered = 69,
  kmp_sched_auto_ordered = 70,

  kmp_sched_distr_static_chunk = 91,
  kmp_sched_distr_static_nochunk = 92,
  kmp_sched_distr_static_chunk_sched_static_chunkone = 93,

  kmp_sched_default = kmp_sched_static_nochunk,
  kmp_sched_unordered_first = kmp_sched_static_chunk,
  kmp_sched_unordered_last = kmp_sched_auto,
  kmp_sched_ordered_first = kmp_sched_static_ordered,
  kmp_sched_ordered_last = kmp_sched_auto_ordered,
  kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
  kmp_sched_distribute_last =
      kmp_sched_distr_static_chunk_sched_static_chunkone,

  /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
   * Since we need to distinguish the three possible cases (no modifier,
   * monotonic modifier, nonmonotonic modifier), we need separate bits for
   * each modifier. The absence of monotonic does not imply nonmonotonic,
   * especially since 4.5 says that the behaviour of the "no modifier" case
   * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0.
   *
   * Since we're passing a full 32 bit value, we can use a couple of high
   * bits for these flags; out of paranoia we avoid the sign bit.
   *
   * These modifiers can be or-ed into non-static schedules by the compiler
   * to pass the additional information. They will be stripped early in the
   * processing in __kmp_dispatch_init when setting up schedules, so
   * most of the code won't ever see schedules with these bits set.
   */
  kmp_sched_modifier_monotonic = (1 << 29),
  /**< Set if the monotonic schedule modifier was present */
  kmp_sched_modifier_nonmonotonic = (1 << 30),
  /**< Set if the nonmonotonic schedule modifier was present */

#define SCHEDULE_WITHOUT_MODIFIERS(s)                                          \
  (enum kmp_sched_t)(                                                          \
      (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0)
#define SCHEDULE_HAS_NONMONOTONIC(s)                                           \
  (((s)&kmp_sched_modifier_nonmonotonic) != 0)
#define SCHEDULE_HAS_NO_MODIFIERS(s)                                           \
  (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
   0)

} kmp_sched_t;

/*!
 * Enum for accessing the reserved_2 field of the ident_t struct below.
 */
enum {
  /*! Bit set to 1 when in SPMD mode. */
  KMP_IDENT_SPMD_MODE = 0x01,
  /*! Bit set to 1 when a simplified runtime is used. */
  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
};

/*!
 * The ident structure that describes a source location.
 * The struct is identical to the one in the kmp.h file.
 * We maintain the same data structure for compatibility.
 */
typedef short kmp_int16;
typedef int kmp_int32;
typedef struct ident {
  kmp_int32 reserved_1; /**< might be used in Fortran; see above */
  kmp_int32 flags;      /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
                             identifies this union member */
  kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */
  kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */
  char const *psource;  /**< String describing the source location.
                             The string is composed of semi-colon separated
                             fields which describe the source file, the function
                             and a pair of line numbers that delimit the
                             construct. */
} ident_t;

// parallel defs
typedef ident_t kmp_Ident;
typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num);
typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id,
                                        int16_t lane_offset,
                                        int16_t shortCircuit);
typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data);

// task defs
typedef struct kmp_TaskDescr kmp_TaskDescr;
typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr);
typedef struct kmp_TaskDescr {
  void *sharedPointerTable;   // ptr to a table of shared var ptrs
  kmp_TaskFctPtr sub;         // task subroutine
  int32_t partId;             // unused
  kmp_TaskFctPtr destructors; // destructor of c++ first private
} kmp_TaskDescr;

// sync defs
typedef int32_t kmp_CriticalName[8];

////////////////////////////////////////////////////////////////////////////////
// external interface
////////////////////////////////////////////////////////////////////////////////

// parallel
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc);
NOINLINE EXTERN uint8_t __kmpc_parallel_level();

// proc bind
EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid,
                                  int proc_bind);
EXTERN int omp_get_num_places(void);
EXTERN int omp_get_place_num_procs(int place_num);
EXTERN void omp_get_place_proc_ids(int place_num, int *ids);
EXTERN int omp_get_place_num(void);
EXTERN int omp_get_partition_num_places(void);
EXTERN void omp_get_partition_place_nums(int *place_nums);

// for static (no chunk or chunk)
EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid,
                                     int32_t sched, int32_t *plastiter,
                                     int32_t *plower, int32_t *pupper,
                                     int32_t *pstride, int32_t incr,
                                     int32_t chunk);
EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid,
                                      int32_t sched, int32_t *plastiter,
                                      uint32_t *plower, uint32_t *pupper,
                                      int32_t *pstride, int32_t incr,
                                      int32_t chunk);
EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid,
                                     int32_t sched, int32_t *plastiter,
                                     int64_t *plower, int64_t *pupper,
                                     int64_t *pstride, int64_t incr,
                                     int64_t chunk);
EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid,
                                      int32_t sched, int32_t *plastiter1,
                                      uint64_t *plower, uint64_t *pupper,
                                      int64_t *pstride, int64_t incr,
                                      int64_t chunk);
// distribute static (no chunk or chunk)
EXTERN void __kmpc_distribute_static_init_4(kmp_Ident *loc, int32_t global_tid,
                                            int32_t sched, int32_t *plastiter,
                                            int32_t *plower, int32_t *pupper,
                                            int32_t *pstride, int32_t incr,
                                            int32_t chunk);
EXTERN void __kmpc_distribute_static_init_4u(kmp_Ident *loc, int32_t global_tid,
                                             int32_t sched, int32_t *plastiter,
                                             uint32_t *plower, uint32_t *pupper,
                                             int32_t *pstride, int32_t incr,
                                             int32_t chunk);
EXTERN void __kmpc_distribute_static_init_8(kmp_Ident *loc, int32_t global_tid,
                                            int32_t sched, int32_t *plastiter,
                                            int64_t *plower, int64_t *pupper,
                                            int64_t *pstride, int64_t incr,
                                            int64_t chunk);
EXTERN void __kmpc_distribute_static_init_8u(kmp_Ident *loc, int32_t global_tid,
                                             int32_t sched, int32_t *plastiter1,
                                             uint64_t *plower, uint64_t *pupper,
                                             int64_t *pstride, int64_t incr,
                                             int64_t chunk);
EXTERN
void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                          int32_t sched, int32_t *plastiter,
                                          int32_t *plower, int32_t *pupper,
                                          int32_t *pstride, int32_t incr,
                                          int32_t chunk);
EXTERN
void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                           int32_t sched, int32_t *plastiter,
                                           uint32_t *plower, uint32_t *pupper,
                                           int32_t *pstride, int32_t incr,
                                           int32_t chunk);
EXTERN
void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                          int32_t sched, int32_t *plastiter,
                                          int64_t *plower, int64_t *pupper,
                                          int64_t *pstride, int64_t incr,
                                          int64_t chunk);
EXTERN
void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid,
                                           int32_t sched, int32_t *plastiter1,
                                           uint64_t *plower, uint64_t *pupper,
                                           int64_t *pstride, int64_t incr,
                                           int64_t chunk);
EXTERN
void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, int32_t global_tid,
                                             int32_t sched, int32_t *plastiter,
                                             int32_t *plower, int32_t *pupper,
                                             int32_t *pstride, int32_t incr,
                                             int32_t chunk);
EXTERN
void __kmpc_for_static_init_4u_simple_generic(
    kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter,
    uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr,
    int32_t chunk);
EXTERN
void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, int32_t global_tid,
                                             int32_t sched, int32_t *plastiter,
                                             int64_t *plower, int64_t *pupper,
                                             int64_t *pstride, int64_t incr,
                                             int64_t chunk);
EXTERN
void __kmpc_for_static_init_8u_simple_generic(
    kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1,
    uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr,
    int64_t chunk);

EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid);

EXTERN void __kmpc_distribute_static_fini(kmp_Ident *loc, int32_t global_tid);

// for dynamic
EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid,
                                   int32_t sched, int32_t lower, int32_t upper,
                                   int32_t incr, int32_t chunk);
EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid,
                                    int32_t sched, uint32_t lower,
                                    uint32_t upper, int32_t incr,
                                    int32_t chunk);
EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid,
                                   int32_t sched, int64_t lower, int64_t upper,
                                   int64_t incr, int64_t chunk);
EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid,
                                    int32_t sched, uint64_t lower,
                                    uint64_t upper, int64_t incr,
                                    int64_t chunk);

EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid,
                                  int32_t *plastiter, int32_t *plower,
                                  int32_t *pupper, int32_t *pstride);
EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid,
                                   int32_t *plastiter, uint32_t *plower,
                                   uint32_t *pupper, int32_t *pstride);
EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid,
                                  int32_t *plastiter, int64_t *plower,
                                  int64_t *pupper, int64_t *pstride);
EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid,
                                   int32_t *plastiter, uint64_t *plower,
                                   uint64_t *pupper, int64_t *pstride);

EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid);

// reduction
EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid);
EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct);
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, void *global_buffer,
    int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct,
    kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct,
    kmp_ListGlobalFctPtr glredFct);
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size);
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size);

// sync barrier
EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid);
EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid);
EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid);

// single
EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid);

// sync
EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid);
EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid,
                            kmp_CriticalName *crit);
EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid,
                                kmp_CriticalName *crit);
EXTERN void __kmpc_flush(kmp_Ident *loc);

// vote
EXTERN uint64_t __kmpc_warp_active_thread_mask(void);
// syncwarp
EXTERN void __kmpc_syncwarp(uint64_t);

// tasks
EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, uint32_t global_tid,
                                            int32_t flag,
                                            size_t sizeOfTaskInclPrivate,
                                            size_t sizeOfSharedTable,
                                            kmp_TaskFctPtr sub);
EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid,
                               kmp_TaskDescr *newLegacyTaskDescr);
EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid,
                                         kmp_TaskDescr *newLegacyTaskDescr,
                                         int32_t depNum, void *depList,
                                         int32_t noAliasDepNum,
                                         void *noAliasDepList);
EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid,
                                      kmp_TaskDescr *newLegacyTaskDescr);
EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid,
                                         kmp_TaskDescr *newLegacyTaskDescr);
EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid,
                                 int32_t depNum, void *depList,
                                 int32_t noAliasDepNum, void *noAliasDepList);
EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid);
EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid);
EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid,
                                    int end_part);
EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid);
EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid,
                            kmp_TaskDescr *newKmpTaskDescr, int if_val,
                            uint64_t *lb, uint64_t *ub, int64_t st, int nogroup,
                            int32_t sched, uint64_t grainsize, void *task_dup);

// cancel
EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
                                        int32_t cancelVal);
EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
                             int32_t cancelVal);

// non standard
EXTERN int32_t __kmpc_target_init(ident_t *Ident, int8_t Mode,
                                  bool UseGenericStateMachine,
                                  bool RequiresFullRuntime);
EXTERN void __kmpc_target_deinit(ident_t *Ident, int8_t Mode,
                                 bool RequiresFullRuntime);
EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
                                           int32_t NumThreadsClause);
EXTERN bool __kmpc_kernel_parallel(void **WorkFn);
EXTERN void __kmpc_kernel_end_parallel();

EXTERN void __kmpc_data_sharing_init_stack();
EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs);
EXTERN void __kmpc_end_sharing_variables();
EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs);

/// Entry point to start a new parallel region.
///
/// \param ident       The source identifier.
/// \param global_tid  The global thread ID.
/// \param if_expr     The if(expr), or 1 if none given.
/// \param num_threads The num_threads(expr), or -1 if none given.
/// \param proc_bind   The proc_bind, or `proc_bind_default` if none given.
/// \param fn          The outlined parallel region function.
/// \param wrapper_fn  The worker wrapper function of fn.
/// \param args        The pointer array of arguments to fn.
/// \param nargs       The number of arguments to fn.
NOINLINE EXTERN void __kmpc_parallel_51(ident_t *ident, kmp_int32 global_tid,
                                        kmp_int32 if_expr,
                                        kmp_int32 num_threads, int proc_bind,
                                        void *fn, void *wrapper_fn, void **args,
                                        size_t nargs);

// SPMD execution mode interrogation function.
EXTERN int8_t __kmpc_is_spmd_exec_mode();

/// Return true if the hardware thread id \p Tid represents the OpenMP main
/// thread in generic mode outside of a parallel region.
EXTERN int8_t __kmpc_is_generic_main_thread(kmp_int32 Tid);

/// Return true if the hardware thread id \p Tid represents the OpenMP main
/// thread in generic mode.
EXTERN int8_t __kmpc_is_generic_main_thread_id(kmp_int32 Tid);

EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
                                          const void *buf, size_t size,
                                          int16_t is_shared, const void **res);

EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
                                              int16_t is_shared);

/// Allocate \p Bytes in "shareable" memory and return the address. Needs to be
/// called balanced with __kmpc_free_shared like a stack (push/pop). Can be
/// called by any thread, allocation happens per-thread.
EXTERN void *__kmpc_alloc_shared(uint64_t Bytes);

/// Deallocate \p Ptr. Needs to be called balanced with __kmpc_alloc_shared like
/// a stack (push/pop). Can be called by any thread. \p Ptr must be allocated by
/// __kmpc_alloc_shared by the same thread. \p Bytes contains the size of the
/// paired allocation to make memory management easier.
EXTERN void __kmpc_free_shared(void *Ptr, size_t Bytes);

/// Get a pointer to the dynamic shared memory buffer in the device.
EXTERN void *__kmpc_get_dynamic_shared();

#endif

@ -1,257 +0,0 @@
##===----------------------------------------------------------------------===##
#
# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
#
##===----------------------------------------------------------------------===##
#
# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available
#
##===----------------------------------------------------------------------===##

# By default we do not build the NVPTX deviceRTL on a CUDA-free system.
set(LIBOMPTARGET_BUILD_NVPTX_BCLIB FALSE CACHE BOOL
  "Whether to build the NVPTX deviceRTL on a CUDA-free system.")

if (NOT LIBOMPTARGET_BUILD_NVPTX_BCLIB)
  libomptarget_say("Not building NVPTX deviceRTL: Disabled by LIBOMPTARGET_BUILD_NVPTX_BCLIB")
  return()
endif()

if (NOT LIBOMPTARGET_LLVM_INCLUDE_DIRS)
  libomptarget_say("Not building NVPTX device RTL: Missing definition for LIBOMPTARGET_LLVM_INCLUDE_DIRS")
  return()
endif()

# Check if we can create an LLVM bitcode implementation of the runtime library
# that could be inlined in the user application. For that we need to find
# a Clang compiler capable of compiling our CUDA files to LLVM bitcode and
# an LLVM linker.
set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING
  "Location of a CUDA compiler capable of emitting LLVM bitcode.")
set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING
  "Location of a linker capable of linking LLVM bitcode objects.")

if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "")
  set(cuda_compiler ${LIBOMPTARGET_NVPTX_CUDA_COMPILER})
elseif (LLVM_TOOL_CLANG_BUILD AND NOT CMAKE_CROSSCOMPILING)
  # Compile the deviceRTL with the clang that is built in the project.
  set(cuda_compiler "$<TARGET_FILE:clang>")
elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang")
  # Compile the device runtime with the compiler that OpenMP is built with.
  # This is the case with LLVM_ENABLE_RUNTIMES=openmp.
  # FIXME: This is unreliable; the compiler can be an older version of clang
  # that does not support compiling CUDA, or only an older version of it. The
  # risk is especially high on systems where clang is the default compiler
  # (macOS, BSDs). LLVM_ENABLE_RUNTIMES=openmp should itself set
  # LIBOMPTARGET_NVPTX_CUDA_COMPILER instead.
  set(cuda_compiler ${CMAKE_C_COMPILER})
else()
  libomptarget_say("Not building NVPTX deviceRTL: clang not found")
  return()
endif()

# Get compiler directory to try to locate a suitable linker.
get_filename_component(compiler_dir ${cuda_compiler} DIRECTORY)
set(llvm_link "${compiler_dir}/llvm-link")

if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "")
  set(bc_linker ${LIBOMPTARGET_NVPTX_BC_LINKER})
elseif (EXISTS ${llvm_link})
  # Try to use the linker consistent with the CUDA compiler unless explicitly
  # set to a different linker.
  set(bc_linker ${llvm_link})
elseif (NOT OPENMP_STANDALONE_BUILD AND NOT CMAKE_CROSSCOMPILING)
  # Use the linker also built in the same project.
  set(bc_linker "$<TARGET_FILE:llvm-link>")
else()
  libomptarget_say("Not building NVPTX deviceRTL: llvm-link not found")
  return()
endif()

# TODO: This part needs to be refined when libomptarget is going to support
# Windows!
# TODO: This part can also be removed if we can change the clang driver to make
# it support device only compilation.
if(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64")
  set(aux_triple x86_64-unknown-linux-gnu)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "ppc64le")
  set(aux_triple powerpc64le-unknown-linux-gnu)
elseif(CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64")
  set(aux_triple aarch64-unknown-linux-gnu)
else()
  libomptarget_say("Not building CUDA offloading device RTL: unknown host arch: ${CMAKE_HOST_SYSTEM_PROCESSOR}")
  return()
endif()

get_filename_component(devicertl_base_directory
  ${CMAKE_CURRENT_SOURCE_DIR}
  DIRECTORY)
set(devicertl_common_directory
  ${devicertl_base_directory}/common)
set(devicertl_nvptx_directory
  ${devicertl_base_directory}/nvptx)

set(all_capabilities 35 37 50 52 53 60 61 62 70 72 75 80 86)

set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${all_capabilities} CACHE STRING
  "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.")
string(TOLOWER ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES} LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES)

if (LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "all")
  set(nvptx_sm_list ${all_capabilities})
elseif(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES STREQUAL "auto")
  if (NOT LIBOMPTARGET_DEP_CUDA_FOUND)
    libomptarget_error_say("[NVPTX] Cannot auto detect compute capability as CUDA not found.")
  endif()
  set(nvptx_sm_list ${LIBOMPTARGET_DEP_CUDA_ARCH})
else()
  string(REPLACE "," ";" nvptx_sm_list "${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}")
endif()

# If the user set LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES to empty, we disable
# the build.
if (NOT nvptx_sm_list)
  libomptarget_say("Not building CUDA offloading device RTL: empty compute capability list")
  return()
endif()

# Check all SM values
foreach(sm ${nvptx_sm_list})
  if (NOT ${sm} IN_LIST all_capabilities)
    libomptarget_warning_say("[NVPTX] Compute capability ${sm} is not supported. Make sure clang can work with it.")
  endif()
endforeach()

# Override default MAX_SM in src/target_impl.h if requested
if (DEFINED LIBOMPTARGET_NVPTX_MAX_SM)
  set(MAX_SM_DEFINITION "-DMAX_SM=${LIBOMPTARGET_NVPTX_MAX_SM}")
endif()

# Activate RTL message dumps if requested by the user.
set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL
  "Activate NVPTX device RTL debug messages.")

if ("${cuda_compiler}" STREQUAL "$<TARGET_FILE:clang>")
  libomptarget_say("Building CUDA LLVM bitcode offloading device RTL using in-tree clang.")
else ()
  libomptarget_say("Building CUDA LLVM bitcode offloading device RTL using ${cuda_compiler}")
endif ()

set(cuda_src_files
  ${devicertl_common_directory}/src/cancel.cu
  ${devicertl_common_directory}/src/critical.cu
  ${devicertl_common_directory}/src/data_sharing.cu
  ${devicertl_common_directory}/src/libcall.cu
  ${devicertl_common_directory}/src/loop.cu
  ${devicertl_common_directory}/src/omp_data.cu
  ${devicertl_common_directory}/src/omptarget.cu
  ${devicertl_common_directory}/src/parallel.cu
  ${devicertl_common_directory}/src/reduction.cu
  ${devicertl_common_directory}/src/support.cu
  ${devicertl_common_directory}/src/sync.cu
  ${devicertl_common_directory}/src/task.cu
  ${devicertl_common_directory}/src/shuffle.cpp
  src/target_impl.cu
)

# Prepend -I to each list element
set (LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX "${LIBOMPTARGET_LLVM_INCLUDE_DIRS}")
list(TRANSFORM LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX PREPEND "-I")

# Set flags for LLVM Bitcode compilation.
set(bc_flags -S -x c++ -O1 -std=c++14
             -mllvm -openmp-opt-disable
             -ffreestanding
             -target nvptx64
             -fvisibility=hidden
             -Xclang -emit-llvm-bc
             -Xclang -aux-triple -Xclang ${aux_triple}
             -fopenmp -fopenmp-cuda-mode -Xclang -fopenmp-is-device
             -Xclang -target-feature -Xclang +ptx61
             -D__CUDACC__
             -I${devicertl_base_directory}
             -I${devicertl_common_directory}/include
             -I${devicertl_nvptx_directory}/src
             -I${devicertl_base_directory}/../include
             ${LIBOMPTARGET_LLVM_INCLUDE_DIRS_NVPTX})

if(${LIBOMPTARGET_NVPTX_DEBUG})
  list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=-1 -g)
else()
  list(APPEND bc_flags -DOMPTARGET_NVPTX_DEBUG=0)
endif()

# Create target to build all Bitcode libraries.
add_custom_target(omptarget-nvptx-bc)

# Generate a Bitcode library for all the compute capabilities the user requested
foreach(sm ${nvptx_sm_list})
  set(cuda_flags -Xclang -target-cpu -Xclang sm_${sm} "-D__CUDA_ARCH__=${sm}0")
  set(bc_files "")
  foreach(src ${cuda_src_files})
    get_filename_component(infile ${src} ABSOLUTE)
    get_filename_component(outfile ${src} NAME)
    set(outfile "${outfile}-sm_${sm}.bc")

    add_custom_command(OUTPUT ${outfile}
      COMMAND ${cuda_compiler} ${bc_flags}
        ${cuda_flags} ${MAX_SM_DEFINITION} ${infile} -o ${outfile}
      DEPENDS ${infile}
      IMPLICIT_DEPENDS CXX ${infile}
      COMMENT "Building LLVM bitcode ${outfile}"
      VERBATIM
    )
    if("${cuda_compiler}" STREQUAL "$<TARGET_FILE:clang>")
      # Add a file-level dependency to ensure that clang is up-to-date.
      # By default, add_custom_command only builds clang if the
      # executable is missing.
      add_custom_command(OUTPUT ${outfile}
        DEPENDS clang
        APPEND
      )
    endif()
    set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile})

    list(APPEND bc_files ${outfile})
  endforeach()

  set(bclib_name "libomptarget-nvptx-sm_${sm}.bc")

  # Link to a bitcode library.
  add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
    COMMAND ${bc_linker}
      -o ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} ${bc_files}
    DEPENDS ${bc_files}
    COMMENT "Linking LLVM bitcode ${bclib_name}"
  )
  if("${bc_linker}" STREQUAL "$<TARGET_FILE:llvm-link>")
    # Add a file-level dependency to ensure that llvm-link is up-to-date.
    # By default, add_custom_command only builds llvm-link if the
    # executable is missing.
    add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
      DEPENDS llvm-link
      APPEND
    )
  endif()
  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${bclib_name})

  set(bclib_target_name "omptarget-nvptx-sm_${sm}-bc")

  add_custom_target(${bclib_target_name} ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name})
  add_dependencies(omptarget-nvptx-bc ${bclib_target_name})

  # Copy library to destination.
  add_custom_command(TARGET ${bclib_target_name} POST_BUILD
                     COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name}
                     ${LIBOMPTARGET_LIBRARY_DIR})

  # Install bitcode library under the lib destination folder.
  install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${bclib_name} DESTINATION "${OPENMP_INSTALL_LIBDIR}")
endforeach()

# Tests will be enabled if the build machine supports CUDA.
if (LIBOMPTARGET_DEP_CUDA_FOUND)
  add_subdirectory(test)
endif()
@ -1,523 +0,0 @@

**Design document for OpenMP reductions on the GPU**

//Abstract: //In this document we summarize the new design for an OpenMP
implementation of reductions on NVIDIA GPUs. This document comprises
* a succinct background review,
* an introduction to the decoupling of the reduction algorithm and the
  data-structure-specific processing routines,
* detailed illustrations of the reduction algorithms used, and
* a brief overview of the steps we have made beyond the last implementation.

**Problem Review**

Consider a typical OpenMP program with a reduction pragma.

```
double foo, bar;
#pragma omp parallel for reduction(+:foo, bar)
for (int i = 0; i < N; i++) {
  foo += A[i]; bar += B[i];
}
```
where 'foo' and 'bar' are reduced across all threads in the parallel region.
Our primary goal is to efficiently aggregate the values of foo and bar in
such a manner that
* makes the compiler logically concise, and
* efficiently reduces within warps, threads, blocks and the device.

**Introduction to Decoupling**
In this section we address the problem of making the compiler
//logically concise// by partitioning the task of reduction into two broad
categories: data-structure-specific routines and algorithmic routines.

The previous reduction implementation was highly coupled with
the specifics of the reduction element data structures (e.g., sizes, data
types) and the operators of the reduction (e.g., addition, multiplication). In
our implementation we strive to decouple them. In our final implementation,
we were able to remove all template functions from our runtime system.

The (simplified) pseudo code generated by LLVM is as follows:

```
1. Create private copies of variables: foo_p, bar_p
2. Each thread reduces the chunk of A and B assigned to it and writes
   to foo_p and bar_p respectively.
3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn,
         interWarpCpyFn)
   where:
   struct ReduceData {
     double *foo;
     double *bar;
   } reduceData
   reduceData.foo = &foo_p
   reduceData.bar = &bar_p

   shuffleReduceFn and interWarpCpyFn are two auxiliary functions
   generated to aid the runtime performing algorithmic steps
   while being data-structure agnostic about ReduceData.

   In particular, shuffleReduceFn is a function that takes the following
   inputs:
   a. local copy of ReduceData
   b. its lane_id
   c. the offset of the lane_id which hosts a remote ReduceData
      relative to the current one
   d. an algorithm version parameter determining which reduction
      algorithm to use.
   This shuffleReduceFn retrieves the remote ReduceData through shuffle
   intrinsics and reduces, using the algorithm specified by the 4th
   parameter, the local ReduceData element-wise with the remote
   ReduceData, and places the resultant values into the local ReduceData.

   Different reduction algorithms are implemented with different runtime
   functions, but they all make calls to this same shuffleReduceFn to
   perform the essential reduction step. Therefore, based on the 4th
   parameter, this shuffleReduceFn will behave slightly differently to
   cooperate with the runtime function to ensure correctness under
   different circumstances.

   InterWarpCpyFn, as the name suggests, is a function that copies data
   across warps. Its function is to tunnel all the thread private
   ReduceData that is already reduced within a warp to a lane in the first
   warp with minimal shared memory footprint. This is an essential step to
   prepare for the last step of a block reduction.

   (Warp, block, device level reduction routines that utilize these
   auxiliary functions will be discussed in the next section.)

4. if ret == 1:
   The master thread stores the reduced result in the globals.
   foo += reduceData.foo; bar += reduceData.bar
```
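
To make the shuffleReduceFn contract concrete, here is a hedged CUDA sketch of what a generated helper for the two-double ReduceData could look like for algorithm versions 0 and 1. The shuffle helper is a stand-in (real generated code routes through __kmpc_shuffle_int64), and only the two version-specific behaviors described below are shown:

```
// Illustrative sketch of a generated shuffleReduceFn for
// ReduceData { double *foo; double *bar; }.
struct ReduceData { double *foo; double *bar; };

__device__ double shuffleDown(double Val, int16_t Offset) {
  // Stand-in for the runtime's 64-bit shuffle helper.
  return __shfl_down_sync(0xffffffffu, Val, Offset);
}

__device__ void shuffleReduceFn(void *LHS, int16_t LaneId, int16_t Offset,
                                int16_t Algo) {
  ReduceData *Local = (ReduceData *)LHS;
  double RemoteFoo = shuffleDown(*Local->foo, Offset);
  double RemoteBar = shuffleDown(*Local->bar, Offset);
  if (Algo == 0 || (Algo == 1 && LaneId < Offset)) {
    // reduce_elem = reduce_elem @ remote_elem
    *Local->foo += RemoteFoo;
    *Local->bar += RemoteBar;
  } else if (Algo == 1) {
    // Copy step that keeps the active ReduceData contiguous.
    *Local->foo = RemoteFoo;
    *Local->bar = RemoteBar;
  }
}
```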

**Reduction Algorithms**

On the warp level, we have three versions of the algorithms:

1. Full Warp Reduction

```
gpu_regular_warp_reduce(void *reduce_data,
                        kmp_ShuffleReductFctPtr ShuffleReduceFn) {
  for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
    ShuffleReduceFn(reduce_data, 0, offset, 0);
}
```
ShuffleReduceFn is used here with lane_id set to 0 because it is not used;
we therefore save instructions by not retrieving lane_id from the
corresponding special registers. The 4th parameter, which represents the
version of the algorithm being used here, is set to 0 to signify full warp
reduction.

In this version specified (=0), the ShuffleReduceFn behaves, per element, as
follows:

```
//reduce_elem refers to an element in the local ReduceData
//remote_elem is retrieved from a remote lane
remote_elem = shuffle_down(reduce_elem, offset, 32);
reduce_elem = reduce_elem @ remote_elem;
```

An illustration of this algorithm operating on a hypothetical 8-lane full warp
would be:
{F74}
The coloring invariant follows that elements with the same color will be
combined and reduced in the next reduction step. As can be observed, no
overhead is present; exactly log(2, N) steps are needed.
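
For reference, the same halving pattern specialized to a single double and written directly against the CUDA warp intrinsics; a minimal sketch assuming a 32-lane warp with all lanes active:

```
// Every lane contributes v; after log2(32) = 5 steps lane 0 holds the sum.
__device__ double warpReduceSum(double v) {
  for (int offset = 32 / 2; offset > 0; offset /= 2)
    v += __shfl_down_sync(0xffffffffu, v, offset);
  return v;
}
```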

2. Contiguous Full Warp Reduction
```
gpu_irregular_warp_reduce(void *reduce_data,
                          kmp_ShuffleReductFctPtr ShuffleReduceFn, int size,
                          int lane_id) {
  int curr_size;
  int offset;
  curr_size = size;
  offset = curr_size/2;
  while (offset > 0) {
    ShuffleReduceFn(reduce_data, lane_id, offset, 1);
    curr_size = (curr_size+1)/2;
    offset = curr_size/2;
  }
}
```

In this version specified (=1), the ShuffleReduceFn behaves, per element, as
follows:
```
//reduce_elem refers to an element in the local ReduceData
//remote_elem is retrieved from a remote lane
remote_elem = shuffle_down(reduce_elem, offset, 32);
if (lane_id < offset) {
  reduce_elem = reduce_elem @ remote_elem
} else {
  reduce_elem = remote_elem
}
```

An important invariant (also a restriction on the starting state of the
reduction) is that this algorithm assumes that all unused ReduceData are
located in a contiguous subset of threads in a warp starting from lane 0.

In the presence of a trailing active lane with an odd-numbered lane
id, its value will not be aggregated with any other lane. Therefore,
in order to preserve the invariant, such ReduceData is copied to the first lane
whose thread-local ReduceData has already been used in a previous reduction
and would therefore be useless otherwise.

An illustration of this algorithm operating on a hypothetical 8-lane partial
warp would be:
{F75}

As illustrated, this version of the algorithm introduces overhead whenever
we have an odd number of participating lanes in any reduction step, in order
to copy data between lanes.

3. Dispersed Partial Warp Reduction
```
gpu_irregular_simt_reduce(void *reduce_data,
                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
  int size, remote_id;
  int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2;
  do {
    remote_id = find_the_next_active_lane_id_right_after_me();
    // the above function returns 0 if no active lane
    // is present right after the current thread.
    size = get_number_of_active_lanes_in_this_warp();
    logical_lane_id /= 2;
    ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2);
  } while (logical_lane_id % 2 == 0 && size > 1);
}
```
|
||||
|
||||
There is no assumption made about the initial state of the reduction.
|
||||
Any number of lanes (>=1) could be active at any position. The reduction
|
||||
result is kept in the first active lane.
|
||||
|
||||
In this version specified (=2), the ShuffleReduceFn behaves, per element, as
|
||||
follows:
|
||||
```
|
||||
//reduce_elem refers to an element in the local ReduceData
|
||||
//remote_elem is retrieved from a remote lane
|
||||
remote_elem = shuffle_down(reduce_elem, offset, 32);
|
||||
if (LaneId % 2 == 0 && Offset > 0) {
|
||||
reduce_elem = reduce_elem @ remote_elem
|
||||
} else {
|
||||
reduce_elem = remote_elem
|
||||
}
|
||||
```
|
||||
We will proceed with a brief explanation for some arguments passed in,
|
||||
it is important to notice that, in this section, we will introduce the
|
||||
concept of logical_lane_id, and it is important to distinguish it
|
||||
from physical lane_id as defined by nvidia.
|
||||
1. //logical_lane_id//: as the name suggests, it refers to the calculated
|
||||
lane_id (instead of the physical one defined by nvidia) that would make
|
||||
our algorithm logically concise. A thread with logical_lane_id k means
|
||||
there are (k-1) threads before it.
|
||||
2. //remote_id-1-threadIdx.x//: remote_id is indeed the nvidia-defined lane
|
||||
id of the remote lane from which we will retrieve the ReduceData. We
|
||||
subtract (threadIdx+1) from it because we would like to maintain only one
|
||||
underlying shuffle intrinsic (which is used to communicate among lanes in a
|
||||
warp). The particular shuffle intrinsic we use accepts only
|
||||
offsets, rather than an absolute lane_id. Therefore, the subtraction is performed
|
||||
on the absolute lane_id we calculated to obtain the offset.
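As a hedged illustration of point 2, the single offset-based shuffle wrapper
could look like the sketch below; `shuffle_down_elem` is an invented name and
not part of the runtime:
```
// Hypothetical wrapper over the one offset-based shuffle primitive: it
// reads the element held 'delta' lanes above the caller and cannot take
// an absolute lane id.
__device__ int shuffle_down_elem(int elem, int delta) {
  return __shfl_down_sync(__activemask(), elem, delta, 32);
}

// A caller at physical lane threadIdx.x that wants the value held by the
// remote lane therefore passes the offset computed in the pseudocode:
//   shuffle_down_elem(elem, remote_id - 1 - threadIdx.x);
```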
|
||||
|
||||
This algorithm is slightly different in 2 ways and it is not, conceptually, a
|
||||
generalization of the above algorithms.
|
||||
1. It reduces elements close to each other. For instance, values in the 0th lane
|
||||
are to be combined with those of the 1st lane; values in the 2nd lane are to be
|
||||
combined with those of the 3rd lane. We did not use the previous algorithm
|
||||
where the first half of the (partial) warp is reduced with the second half
|
||||
of the (partial) warp. This is because the mapping
|
||||
f(x): logical_lane_id -> physical_lane_id;
|
||||
can be easily calculated whereas its inverse
|
||||
f^-1(x): physical_lane_id -> logical_lane_id
|
||||
cannot, and performing such a reduction requires the inverse to be known (see the sketch after this list).
|
||||
2. Because this algorithm is agnostic about the positions of the lanes that are
|
||||
active, we do not need to perform the copying step as in the second
|
||||
algorithm.
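For illustration only, the forward mapping f mentioned in point 1 can be
computed from the warp's active-lane mask; this sketch assumes CUDA and is
not taken from the runtime:
```
// Hypothetical computation of f: the physical lane id of the thread with
// logical_lane_id k is the position of the k-th set bit in the mask of
// active lanes.
__device__ int logical_to_physical(unsigned active_mask, int k) {
  for (int phys = 0; phys < 32; ++phys)
    if ((active_mask >> phys) & 1)
      if (k-- == 0)
        return phys; // found the k-th active lane
  return -1; // fewer than k+1 lanes are active
}
```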
|
||||
An illustrative run would look like
|
||||
{F76}
|
||||
As observed, overhead is high because in each step of the reduction,
|
||||
the logical_lane_id is recalculated, as is the remote_id.
|
||||
|
||||
On a block level, we have implemented the following block reduce algorithm:
|
||||
|
||||
```
|
||||
gpu_irregular_block_reduce(void *reduce_data,
|
||||
kmp_ShuffleReductFctPtr shuflReduceFn,
|
||||
kmp_InterWarpCopyFctPtr interWarpCpyFn,
|
||||
int size) {
|
||||
|
||||
int wid = threadIdx.x/WARPSIZE;
|
||||
int lane_id = threadIdx.x%WARPSIZE;
|
||||
|
||||
int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division
|
||||
|
||||
unsigned tnum = __ballot(1);
|
||||
int thread_num = __popc(tnum);
|
||||
|
||||
//full warp reduction
|
||||
if (thread_num == WARPSIZE) {
|
||||
gpu_regular_warp_reduce(reduce_data, shuflReduceFn);
|
||||
}
|
||||
//partial warp reduction
|
||||
if (thread_num < WARPSIZE) {
|
||||
gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num,
|
||||
lane_id);
|
||||
}
|
||||
//Gather all the reduced values from each warp
|
||||
//to the first warp
|
||||
//named_barrier inside this function to ensure
|
||||
//correctness. It is effectively a sync_thread
|
||||
//that won't deadlock.
|
||||
interWarpCpyFn(reduce_data, warp_needed);
|
||||
|
||||
//This is to reduce data gathered from each "warp master".
|
||||
if (wid==0) {
|
||||
gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed,
|
||||
lane_id);
|
||||
}
|
||||
|
||||
return;
|
||||
}
|
||||
```
|
||||
In this function, no ShuffleReduceFn is directly called as it makes calls
|
||||
to various versions of the warp-reduction functions. It first reduces
|
||||
ReduceData warp by warp; at the end, the number of remaining
|
||||
ReduceData equals the number of warps present in this thread
|
||||
block. We then proceed to gather all such ReduceData to the first warp.
|
||||
|
||||
As observed, in this algorithm we make use of the function InterWarpCpyFn,
|
||||
which copies data from each "warp master" (the 0th lane of each warp, where
|
||||
a warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a
|
||||
mathematical sense) the problem of reduction across warp masters in a block to
|
||||
the problem of warp reduction which we already have solutions to.
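A minimal sketch of such an inter-warp copy is given below; it uses
__syncthreads as a stand-in for the named barrier and assumes int elements,
so it is an illustration rather than the runtime's implementation:
```
// Each warp master publishes its warp-reduced element through shared
// memory; the first warp then reads one value back per lane.
__device__ void inter_warp_copy(int *elem, int warp_needed) {
  __shared__ int scratch[32];            // one slot per possible warp
  int wid = threadIdx.x / 32;
  int lane_id = threadIdx.x % 32;
  if (lane_id == 0)                      // warp master writes its value
    scratch[wid] = *elem;
  __syncthreads();                       // stand-in for the named barrier
  if (wid == 0 && lane_id < warp_needed) // first warp gathers the copies
    *elem = scratch[lane_id];
  __syncthreads();
}
```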
|
||||
|
||||
We can thus completely avoid the use of atomics to reduce in a threadblock.
|
||||
|
||||
**Efficient Cross Block Reduce**
|
||||
|
||||
The next challenge is to reduce values across threadblocks. We aim to do this
|
||||
without atomics or critical sections.
|
||||
|
||||
Let a kernel be started with TB threadblocks.
|
||||
Let the GPU have S SMs.
|
||||
There can be at most N active threadblocks per SM at any time.
|
||||
|
||||
Consider a threadblock tb (tb < TB) running on SM s (s < S). 'tb' is one of
|
||||
at most 'N' active threadblocks on SM s. Let each threadblock active on an SM
|
||||
be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id)
|
||||
uniquely identifies an active threadblock on the GPU.
|
||||
|
||||
To efficiently implement cross block reduce, we first allocate an array for
|
||||
each value to be reduced of size S*N (which is the maximum number of active
|
||||
threadblocks at any time on the device).
|
||||
|
||||
Each threadblock reduces its value to slot [s][id]. This can be done without
|
||||
locking since no other threadblock can write to the same slot concurrently.
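A sketch of the slot computation follows, under the assumption that s comes
from a primitive like __kmpc_impl_smid() and that id is an instance
identifier claimed per SM (for example via an atomic counter):
```
// Flatten the (s, id) pair into an index of the S*N reduction array; at
// most one active threadblock holds a given pair at any time, so the
// slot needs no locking.
__device__ unsigned reduction_slot(unsigned s, unsigned id, unsigned N) {
  return s * N + id;
}
```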
|
||||
|
||||
As a final stage, we reduce the values in the array as follows:
|
||||
|
||||
```
|
||||
// Compiler generated wrapper function for each target region with a reduction
|
||||
// clause.
|
||||
target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1
|
||||
thread.
|
||||
// Use dynamic parallelism to launch M teams, N threads as requested by the
|
||||
// user to execute the target region.
|
||||
|
||||
target_function<<<M, N>>>(map_args)
|
||||
|
||||
Reduce values in reduction_array
|
||||
|
||||
```
|
||||
|
||||
**Comparison with Last Version**
|
||||
|
||||
|
||||
The (simplified) pseudo code generated by LLVM on the host is as follows:
|
||||
|
||||
|
||||
```
|
||||
1. Create private copies of variables: foo_p, bar_p
|
||||
2. Each thread reduces the chunk of A and B assigned to it and writes
|
||||
to foo_p and bar_p respectively.
|
||||
3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
|
||||
where:
|
||||
struct ReduceData {
|
||||
double *foo;
|
||||
double *bar;
|
||||
} reduceData
|
||||
reduceData.foo = &foo_p
|
||||
reduceData.bar = &bar_p
|
||||
|
||||
reduceFn is a pointer to a function that takes in two inputs
|
||||
of type ReduceData, "reduces" them element wise, and places the
|
||||
result in the first input:
|
||||
reduceFn(ReduceData *a, ReduceData *b)
|
||||
a = a @ b
|
||||
|
||||
Every thread in the parallel region calls kmpc_reduce_nowait with
|
||||
its private copy of reduceData. The runtime reduces across the
|
||||
threads (using tree reduction on the operator 'reduceFn') and stores
|
||||
the final result in the master thread if successful.
|
||||
4. if ret == 1:
|
||||
The master thread stores the reduced result in the globals.
|
||||
foo += reduceData.foo; bar += reduceData.bar
|
||||
5. else if ret == 2:
|
||||
In this case kmpc_reduce_nowait() could not use tree reduction,
|
||||
so use atomics instead:
|
||||
each thread atomically writes to foo
|
||||
each thread atomically writes to bar
|
||||
```
|
||||
|
||||
On a GPU, a similar reduction may need to be performed across SIMT threads,
|
||||
warps, and threadblocks. The challenge is to do so efficiently in a fashion
|
||||
that is compatible with the LLVM OpenMP implementation.
|
||||
|
||||
In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs,
|
||||
the salient steps of the code generated are as follows:
|
||||
|
||||
|
||||
```
|
||||
1. Create private copies of variables: foo_p, bar_p
|
||||
2. Each thread reduces the chunk of A and B assigned to it and writes
|
||||
to foo_p and bar_p respectively.
|
||||
3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock)
|
||||
status = can_block_reduce()
|
||||
if status == 1:
|
||||
reduce efficiently to thread 0 using shuffles and shared memory.
|
||||
return 1
|
||||
else
|
||||
cannot use efficient block reduction, fallback to atomics
|
||||
return 2
|
||||
4. if ret == 1:
|
||||
The master thread stores the reduced result in the globals.
|
||||
foo += reduceData.foo; bar += reduceData.bar
|
||||
5. else if ret == 2:
|
||||
In this case kmpc_reduce_nowait() could not use tree reduction,
|
||||
so use atomics instead:
|
||||
each thread atomically writes to foo
|
||||
each thread atomically writes to bar
|
||||
```
|
||||
|
||||
The function can_block_reduce() is defined as follows:
|
||||
|
||||
|
||||
```
|
||||
int32_t can_block_reduce() {
|
||||
int tid = GetThreadIdInTeam();
|
||||
int nt = GetNumberOfOmpThreads(tid);
|
||||
if (nt != blockDim.x)
|
||||
return 0;
|
||||
unsigned tnum = __ballot(1);
|
||||
if (tnum != (~0x0)) {
|
||||
return 0;
|
||||
}
|
||||
return 1;
|
||||
}
|
||||
```
|
||||
|
||||
This function permits the use of the efficient block reduction algorithm
|
||||
using shuffles and shared memory (return 1) only if (a) all SIMT threads in
|
||||
a warp are active (i.e., the number of threads in the parallel region is a
|
||||
multiple of 32) and (b) the number of threads in the parallel region
|
||||
(set by the num_threads clause) equals blockDim.x.
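Precondition (a) amounts to the warp's active-lane mask being full, as in
this minimal sketch (an assumption-level illustration, not the deleted
runtime's code):
```
// All 32 SIMT threads of a warp are active exactly when every bit of
// the active-lane mask is set.
__device__ bool warp_is_full() {
  return __activemask() == 0xffffffffu;
}
```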
|
||||
|
||||
If either of these preconditions is not true, each thread in the threadblock
|
||||
updates the global value using atomics.
|
||||
|
||||
Atomics and compare-and-swap operations are expensive on many-threaded
|
||||
architectures such as GPUs, and we must avoid them completely.
|
||||
|
||||
|
||||
**Appendix: Implementation Details**
|
||||
|
||||
|
||||
```
|
||||
// Compiler generated function.
|
||||
reduceFn(ReduceData *a, ReduceData *b)
|
||||
a->foo = a->foo + b->foo
|
||||
a->bar = a->bar + b->bar
|
||||
|
||||
// Compiler generated function.
|
||||
swapAndReduceFn(ReduceData *thread_private, int lane)
|
||||
ReduceData *remote = new ReduceData()
|
||||
remote->foo = shuffle_double(thread_private->foo, lane)
|
||||
remote->bar = shuffle_double(thread_private->bar, lane)
|
||||
reduceFn(thread_private, remote)
|
||||
|
||||
// OMP runtime function.
|
||||
warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn):
|
||||
offset = 16
|
||||
while (offset > 0)
|
||||
swapAndReduceFn(thread_private, offset)
|
||||
offset /= 2
|
||||
|
||||
// OMP runtime function.
|
||||
warpReduce_irregular():
|
||||
...
|
||||
|
||||
// OMP runtime function.
|
||||
kmpc_reduce_warp(reduceData, swapAndReduceFn)
|
||||
if all_lanes_active:
|
||||
warpReduce_regular(reduceData, swapAndReduceFn)
|
||||
else:
|
||||
warpReduce_irregular(reduceData, swapAndReduceFn)
|
||||
if in_simd_region:
|
||||
// all done, reduce to global in simd lane 0
|
||||
return 1
|
||||
else if in_parallel_region:
|
||||
// done reducing to one value per warp, now reduce across warps
|
||||
return 3
|
||||
|
||||
// OMP runtime function; one for each basic type.
|
||||
kmpc_reduce_block_double(double *a)
|
||||
if lane == 0:
|
||||
shared[wid] = *a
|
||||
named_barrier(1, num_threads)
|
||||
if wid == 0
|
||||
block_reduce(shared)
|
||||
if lane == 0
|
||||
*a = shared[0]
|
||||
named_barrier(1, num_threads)
|
||||
if wid == 0 and lane == 0
|
||||
return 1 // write back reduced result
|
||||
else
|
||||
return 0 // don't do anything
|
||||
|
||||
```
|
||||
|
||||
|
||||
|
||||
```
|
||||
// Compiler generated code.
|
||||
1. Create private copies of variables: foo_p, bar_p
|
||||
2. Each thread reduces the chunk of A and B assigned to it and writes
|
||||
to foo_p and bar_p respectively.
|
||||
3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn)
|
||||
4. if ret == 1:
|
||||
The master thread stores the reduced result in the globals.
|
||||
foo += reduceData.foo; bar += reduceData.bar
|
||||
5. else if ret == 3:
|
||||
ret = block_reduce_double(reduceData.foo)
|
||||
if ret == 1:
|
||||
foo += reduceData.foo
|
||||
ret = block_reduce_double(reduceData.bar)
|
||||
if ret == 1:
|
||||
bar += reduceData.bar
|
||||
```
|
||||
|
||||
**Notes**
|
||||
|
||||
1. This scheme requires that the CUDA OMP runtime can call LLVM-generated
|
||||
functions. This functionality now works.
|
||||
2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery
|
||||
(including calls through function pointers) is optimized away.
|
||||
3. If we are reducing multiple variables in a parallel region,
|
||||
the reduce operations are all performed in warpReduce_[ir]regular(). This
|
||||
results in more instructions in the loop and should result in fewer
|
||||
stalls due to data dependencies. Unfortunately we cannot do the same in
|
||||
kmpc_reduce_block_double() without increasing shared memory usage.
|
|
@ -1,17 +0,0 @@
|
|||
//===--- nvptx_interface.h - OpenMP interface definitions -------- CUDA -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
#ifndef _NVPTX_INTERFACE_H_
|
||||
#define _NVPTX_INTERFACE_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#define EXTERN extern "C"
|
||||
typedef uint32_t omp_lock_t; /* arbitrary type of the right length */
|
||||
|
||||
#endif
|
|
@ -1,198 +0,0 @@
|
|||
//===---------- target_impl.cu - NVPTX OpenMP GPU options ------- CUDA -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Definitions of target specific functions
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#pragma omp declare target
|
||||
|
||||
#include "common/debug.h"
|
||||
#include "target_impl.h"
|
||||
#include "target_interface.h"
|
||||
|
||||
EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
|
||||
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
|
||||
}
|
||||
|
||||
EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
|
||||
uint64_t val;
|
||||
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
|
||||
return val;
|
||||
}
|
||||
|
||||
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
|
||||
__kmpc_impl_lanemask_t res;
|
||||
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res));
|
||||
return res;
|
||||
}
|
||||
|
||||
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
|
||||
__kmpc_impl_lanemask_t res;
|
||||
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res));
|
||||
return res;
|
||||
}
|
||||
|
||||
EXTERN uint32_t __kmpc_impl_smid() {
|
||||
uint32_t id;
|
||||
asm("mov.u32 %0, %%smid;" : "=r"(id));
|
||||
return id;
|
||||
}
|
||||
|
||||
EXTERN double __kmpc_impl_get_wtick() {
|
||||
// Timer precision is 1ns
|
||||
return ((double)1E-9);
|
||||
}
|
||||
|
||||
EXTERN double __kmpc_impl_get_wtime() {
|
||||
unsigned long long nsecs;
|
||||
asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs));
|
||||
return (double)nsecs * __kmpc_impl_get_wtick();
|
||||
}
|
||||
|
||||
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
|
||||
unsigned int Mask;
|
||||
asm volatile("activemask.b32 %0;" : "=r"(Mask));
|
||||
return Mask;
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_impl_syncthreads() {
|
||||
int barrier = 2;
|
||||
asm volatile("barrier.sync %0;"
|
||||
:
|
||||
: "r"(barrier)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
|
||||
__nvvm_bar_warp_sync(Mask);
|
||||
}
|
||||
|
||||
// NVPTX specific kernel initialization
|
||||
EXTERN void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
|
||||
}
|
||||
|
||||
// Barrier until num_threads arrive.
|
||||
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads) {
|
||||
// The named barrier for active parallel threads of a team in an L1 parallel
|
||||
// region to synchronize with each other.
|
||||
int barrier = 1;
|
||||
asm volatile("barrier.sync %0, %1;"
|
||||
:
|
||||
: "r"(barrier), "r"(num_threads)
|
||||
: "memory");
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_impl_threadfence() { __nvvm_membar_gl(); }
|
||||
EXTERN void __kmpc_impl_threadfence_block() { __nvvm_membar_cta(); }
|
||||
EXTERN void __kmpc_impl_threadfence_system() { __nvvm_membar_sys(); }
|
||||
|
||||
// Calls to the NVPTX layer (assuming 1D layout)
|
||||
EXTERN int __kmpc_get_hardware_thread_id_in_block() {
|
||||
return __nvvm_read_ptx_sreg_tid_x();
|
||||
}
|
||||
EXTERN int GetBlockIdInKernel() { return __nvvm_read_ptx_sreg_ctaid_x(); }
|
||||
EXTERN int __kmpc_get_hardware_num_blocks() {
|
||||
return __nvvm_read_ptx_sreg_nctaid_x();
|
||||
}
|
||||
EXTERN int __kmpc_get_hardware_num_threads_in_block() {
|
||||
return __nvvm_read_ptx_sreg_ntid_x();
|
||||
}
|
||||
EXTERN unsigned __kmpc_get_warp_size() { return WARPSIZE; }
|
||||
EXTERN unsigned GetWarpId() {
|
||||
return __kmpc_get_hardware_thread_id_in_block() / WARPSIZE;
|
||||
}
|
||||
EXTERN unsigned GetLaneId() {
|
||||
return __kmpc_get_hardware_thread_id_in_block() & (WARPSIZE - 1);
|
||||
}
|
||||
|
||||
// Atomics
|
||||
uint32_t __kmpc_atomic_add(uint32_t *Address, uint32_t Val) {
|
||||
return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
|
||||
}
|
||||
uint32_t __kmpc_atomic_inc(uint32_t *Address, uint32_t Val) {
|
||||
return __nvvm_atom_inc_gen_ui(Address, Val);
|
||||
}
|
||||
|
||||
uint32_t __kmpc_atomic_max(uint32_t *Address, uint32_t Val) {
|
||||
return __atomic_fetch_max(Address, Val, __ATOMIC_SEQ_CST);
|
||||
}
|
||||
|
||||
uint32_t __kmpc_atomic_exchange(uint32_t *Address, uint32_t Val) {
|
||||
uint32_t R;
|
||||
__atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
|
||||
return R;
|
||||
}
|
||||
|
||||
uint32_t __kmpc_atomic_cas(uint32_t *Address, uint32_t Compare, uint32_t Val) {
|
||||
(void)__atomic_compare_exchange(Address, &Compare, &Val, false,
|
||||
__ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
|
||||
return Compare;
|
||||
}
|
||||
|
||||
unsigned long long __kmpc_atomic_exchange(unsigned long long *Address,
|
||||
unsigned long long Val) {
|
||||
unsigned long long R;
|
||||
__atomic_exchange(Address, &Val, &R, __ATOMIC_SEQ_CST);
|
||||
return R;
|
||||
}
|
||||
|
||||
unsigned long long __kmpc_atomic_add(unsigned long long *Address,
|
||||
unsigned long long Val) {
|
||||
return __atomic_fetch_add(Address, Val, __ATOMIC_SEQ_CST);
|
||||
}
|
||||
|
||||
#define __OMP_SPIN 1000
|
||||
#define UNSET 0u
|
||||
#define SET 1u
|
||||
|
||||
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) {
|
||||
__kmpc_impl_unset_lock(lock);
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) {
|
||||
__kmpc_impl_unset_lock(lock);
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) {
|
||||
// TODO: not sure spinning is a good idea here..
|
||||
while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) {
|
||||
int32_t start = __nvvm_read_ptx_sreg_clock();
|
||||
int32_t now;
|
||||
for (;;) {
|
||||
now = __nvvm_read_ptx_sreg_clock();
|
||||
int32_t cycles = now > start ? now - start : now + (0xffffffff - start);
|
||||
if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) {
|
||||
break;
|
||||
}
|
||||
}
|
||||
} // wait for 0 to be the read value
|
||||
}
|
||||
|
||||
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) {
|
||||
(void)__kmpc_atomic_exchange(lock, UNSET);
|
||||
}
|
||||
|
||||
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) {
|
||||
return __kmpc_atomic_add(lock, 0u);
|
||||
}
|
||||
|
||||
extern "C" {
|
||||
void *malloc(size_t);
|
||||
void free(void *);
|
||||
int32_t vprintf(const char *, void *);
|
||||
}
|
||||
|
||||
EXTERN void *__kmpc_impl_malloc(size_t x) { return malloc(x); }
|
||||
EXTERN void __kmpc_impl_free(void *x) { free(x); }
|
||||
|
||||
EXTERN int32_t __llvm_omp_vprintf(const char *Format, void *Arguments,
|
||||
uint32_t) {
|
||||
return vprintf(Format, Arguments);
|
||||
}
|
||||
|
||||
#pragma omp end declare target
|
|
@ -1,89 +0,0 @@
|
|||
//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===//
|
||||
//
|
||||
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
||||
// See https://llvm.org/LICENSE.txt for license information.
|
||||
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// Definitions of target specific functions
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
#ifndef _TARGET_IMPL_H_
|
||||
#define _TARGET_IMPL_H_
|
||||
|
||||
#include "nvptx_interface.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
|
||||
// subset of inttypes.h
|
||||
#define PRId64 "ld"
|
||||
#define PRIu64 "lu"
|
||||
|
||||
typedef uint32_t __kmpc_impl_lanemask_t;
|
||||
|
||||
#define INLINE inline __attribute__((always_inline))
|
||||
#define NOINLINE __attribute__((noinline))
|
||||
#define ALIGN(N) __attribute__((aligned(N)))
|
||||
#define PLUGIN_ACCESSIBLE /* no annotation needed for cuda plugin */
|
||||
|
||||
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
|
||||
|
||||
INLINE constexpr const llvm::omp::GV &getGridValue() {
|
||||
return llvm::omp::NVPTXGridValues;
|
||||
}
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// Kernel options
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
////////////////////////////////////////////////////////////////////////////////
|
||||
// The following def must match the absolute limit hardwired in the host RTL
|
||||
// max number of threads per team
|
||||
enum { MAX_THREADS_PER_TEAM = getGridValue().GV_Max_WG_Size };
|
||||
enum { WARPSIZE = getGridValue().GV_Warp_Size };
|
||||
|
||||
// Maximum number of omp state objects per SM allocated statically in global
|
||||
// memory.
|
||||
#if __CUDA_ARCH__ >= 600
|
||||
#define OMP_STATE_COUNT 32
|
||||
#else
|
||||
#define OMP_STATE_COUNT 16
|
||||
#endif
|
||||
|
||||
#if !defined(MAX_SM)
|
||||
#if __CUDA_ARCH__ >= 900
|
||||
#error unsupported compute capability, define MAX_SM via LIBOMPTARGET_NVPTX_MAX_SM cmake option
|
||||
#elif __CUDA_ARCH__ >= 800
|
||||
// GA100 design has a maximum of 128 SMs but A100 product only has 108 SMs
|
||||
// GA102 design has a maximum of 84 SMs
|
||||
#define MAX_SM 108
|
||||
#elif __CUDA_ARCH__ >= 700
|
||||
#define MAX_SM 84
|
||||
#elif __CUDA_ARCH__ >= 600
|
||||
#define MAX_SM 56
|
||||
#else
|
||||
#define MAX_SM 16
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define OMP_ACTIVE_PARALLEL_LEVEL 128
|
||||
|
||||
// Data sharing related quantities, need to match what is used in the compiler.
|
||||
enum DATA_SHARING_SIZES {
|
||||
// The size reserved for data in a shared memory slot.
|
||||
DS_Slot_Size = getGridValue().GV_Slot_Size,
|
||||
// The slot size that should be reserved for a working warp.
|
||||
DS_Worker_Warp_Slot_Size = getGridValue().warpSlotSize(),
|
||||
// The maximum number of warps in use
|
||||
DS_Max_Warp_Number = getGridValue().maxWarpNumber(),
|
||||
};
|
||||
|
||||
enum : __kmpc_impl_lanemask_t {
|
||||
__kmpc_impl_all_lanes = ~(__kmpc_impl_lanemask_t)0
|
||||
};
|
||||
|
||||
#define printf(...)
|
||||
|
||||
#endif
|
|
@ -1,25 +0,0 @@
|
|||
if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang")
|
||||
# Silently return, no need to annoy the user.
|
||||
return()
|
||||
endif()
|
||||
|
||||
set(deps omptarget omp)
|
||||
if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB)
|
||||
set(deps ${deps} omptarget-nvptx-bc)
|
||||
endif()
|
||||
|
||||
# Run with only one thread to only launch one application to the GPU at a time.
|
||||
add_openmp_testsuite(check-libomptarget-nvptx
|
||||
"Running libomptarget-nvptx tests" ${CMAKE_CURRENT_BINARY_DIR}
|
||||
EXCLUDE_FROM_CHECK_ALL
|
||||
DEPENDS ${deps} ARGS -j1)
|
||||
|
||||
set(LIBOMPTARGET_NVPTX_TEST_FLAGS "" CACHE STRING
|
||||
"Extra compiler flags to send to the test compiler.")
|
||||
set(LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS
|
||||
"-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" CACHE STRING
|
||||
"OpenMP compiler flags to use for testing libomptarget-nvptx.")
|
||||
|
||||
# Configure the lit.site.cfg.in file
|
||||
set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget-nvptx configuration.\n# Do not edit!")
|
||||
configure_file(lit.site.cfg.in lit.site.cfg @ONLY)
|
|
@ -1,22 +0,0 @@
|
|||
// RUN: %compile-run-and-check
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int main(){
|
||||
int max_threads = -1;
|
||||
int num_threads = -1;
|
||||
|
||||
#pragma omp target map(tofrom: max_threads)
|
||||
max_threads = omp_get_max_threads();
|
||||
|
||||
#pragma omp target parallel map(tofrom: num_threads)
|
||||
{
|
||||
#pragma omp master
|
||||
num_threads = omp_get_num_threads();
|
||||
}
|
||||
|
||||
// CHECK: Max Threads: 128, Num Threads: 128
|
||||
printf("Max Threads: %d, Num Threads: %d\n", max_threads, num_threads);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,38 +0,0 @@
|
|||
// RUN: %compile-run-and-check
|
||||
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
|
||||
const int MaxThreads = 1024;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int cancellation = -1, dynamic = -1, nested = -1, maxActiveLevels = -1;
|
||||
|
||||
#pragma omp target map(cancellation, dynamic, nested, maxActiveLevels)
|
||||
{
|
||||
// libomptarget-nvptx doesn't support cancellation.
|
||||
cancellation = omp_get_cancellation();
|
||||
|
||||
// No support for dynamic adjustment of the number of threads.
|
||||
omp_set_dynamic(1);
|
||||
dynamic = omp_get_dynamic();
|
||||
|
||||
// libomptarget-nvptx doesn't support nested parallelism.
|
||||
omp_set_nested(1);
|
||||
nested = omp_get_nested();
|
||||
|
||||
omp_set_max_active_levels(42);
|
||||
maxActiveLevels = omp_get_max_active_levels();
|
||||
}
|
||||
|
||||
// CHECK: cancellation = 0
|
||||
printf("cancellation = %d\n", cancellation);
|
||||
// CHECK: dynamic = 0
|
||||
printf("dynamic = %d\n", dynamic);
|
||||
// CHECK: nested = 0
|
||||
printf("nested = %d\n", nested);
|
||||
// CHECK: maxActiveLevels = 1
|
||||
printf("maxActiveLevels = %d\n", maxActiveLevels);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,53 +0,0 @@
|
|||
// RUN: %compile-run-and-check
|
||||
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int MaxThreadsL1 = -1, MaxThreadsL2 = -1;
|
||||
|
||||
#pragma omp declare reduction(unique:int \
|
||||
: omp_out = (omp_in == 1 ? omp_in : omp_out)) \
|
||||
initializer(omp_priv = -1)
|
||||
|
||||
// Non-SPMD mode.
|
||||
#pragma omp target teams map(MaxThreadsL1, MaxThreadsL2) thread_limit(32) \
|
||||
num_teams(1)
|
||||
{
|
||||
MaxThreadsL1 = omp_get_max_threads();
|
||||
#pragma omp parallel reduction(unique : MaxThreadsL2)
|
||||
{ MaxThreadsL2 = omp_get_max_threads(); }
|
||||
}
|
||||
|
||||
//FIXME: This Non-SPMD kernel will have 32 active threads due to
|
||||
// thread_limit. However, Non-SPMD MaxThreadsL1 is the total number of
|
||||
// threads in block (64 in this case), which translates to worker
|
||||
// threads + WARP_SIZE for Non-SPMD kernels and worker threads for SPMD
|
||||
// kernels. According to the spec, omp_get_max_threads must return the
|
||||
// max active threads possible between the two kernel types.
|
||||
|
||||
// CHECK: Non-SPMD MaxThreadsL1 = 64
|
||||
printf("Non-SPMD MaxThreadsL1 = %d\n", MaxThreadsL1);
|
||||
// CHECK: Non-SPMD MaxThreadsL2 = 1
|
||||
printf("Non-SPMD MaxThreadsL2 = %d\n", MaxThreadsL2);
|
||||
|
||||
// SPMD mode with full runtime
|
||||
MaxThreadsL2 = -1;
|
||||
#pragma omp target parallel reduction(unique : MaxThreadsL2)
|
||||
{ MaxThreadsL2 = omp_get_max_threads(); }
|
||||
|
||||
// CHECK: SPMD with full runtime MaxThreadsL2 = 1
|
||||
printf("SPMD with full runtime MaxThreadsL2 = %d\n", MaxThreadsL2);
|
||||
|
||||
// SPMD mode without runtime
|
||||
MaxThreadsL2 = -1;
|
||||
#pragma omp target parallel for reduction(unique : MaxThreadsL2)
|
||||
for (int I = 0; I < 2; ++I) {
|
||||
MaxThreadsL2 = omp_get_max_threads();
|
||||
}
|
||||
|
||||
// CHECK: SPMD without runtime MaxThreadsL2 = 1
|
||||
printf("SPMD without runtime MaxThreadsL2 = %d\n", MaxThreadsL2);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,72 +0,0 @@
|
|||
// RUN: %compile-run-and-check
|
||||
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int ThreadLimitL0 = -1, ThreadLimitL1 = -1, ThreadLimitL2 = -1;
|
||||
|
||||
#pragma omp declare reduction(unique64:int \
|
||||
: omp_out = (omp_in == 64 ? omp_in : omp_out)) \
|
||||
initializer(omp_priv = -1)
|
||||
#pragma omp declare reduction(unique32:int \
|
||||
: omp_out = (omp_in == 32 ? omp_in : omp_out)) \
|
||||
initializer(omp_priv = -1)
|
||||
|
||||
// Non-SPMD mode.
|
||||
#pragma omp target teams map(ThreadLimitL0, ThreadLimitL1, ThreadLimitL2) \
|
||||
thread_limit(64) num_teams(1)
|
||||
{
|
||||
ThreadLimitL0 = omp_get_thread_limit();
|
||||
#pragma omp parallel reduction(unique64 \
|
||||
: ThreadLimitL1, ThreadLimitL2) num_threads(32)
|
||||
{
|
||||
ThreadLimitL1 = omp_get_thread_limit();
|
||||
#pragma omp parallel reduction(unique64 : ThreadLimitL2)
|
||||
{ ThreadLimitL2 = omp_get_thread_limit(); }
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK: Non-SPMD ThreadLimitL0 = 64
|
||||
printf("Non-SPMD ThreadLimitL0 = %d\n", ThreadLimitL0);
|
||||
// CHECK: Non-SPMD ThreadLimitL1 = 64
|
||||
printf("Non-SPMD ThreadLimitL1 = %d\n", ThreadLimitL1);
|
||||
// CHECK: Non-SPMD ThreadLimitL2 = 64
|
||||
printf("Non-SPMD ThreadLimitL2 = %d\n", ThreadLimitL2);
|
||||
|
||||
// SPMD mode with full runtime
|
||||
ThreadLimitL1 = -1;
|
||||
ThreadLimitL2 = -1;
|
||||
#pragma omp target parallel reduction(unique32 \
|
||||
: ThreadLimitL1, ThreadLimitL2) \
|
||||
num_threads(32)
|
||||
{
|
||||
ThreadLimitL1 = omp_get_thread_limit();
|
||||
#pragma omp parallel reduction(unique32 : ThreadLimitL2)
|
||||
{ ThreadLimitL2 = omp_get_thread_limit(); }
|
||||
}
|
||||
|
||||
// CHECK: SPMD with full runtime ThreadLimitL1 = 32
|
||||
printf("SPMD with full runtime ThreadLimitL1 = %d\n", ThreadLimitL1);
|
||||
// CHECK: SPMD with full runtime ThreadLimitL2 = 32
|
||||
printf("SPMD with full runtime ThreadLimitL2 = %d\n", ThreadLimitL2);
|
||||
|
||||
// SPMD mode without runtime
|
||||
ThreadLimitL1 = -1;
|
||||
ThreadLimitL2 = -1;
|
||||
#pragma omp target parallel for reduction(unique32 \
|
||||
: ThreadLimitL1, ThreadLimitL2) \
|
||||
num_threads(32)
|
||||
for (int I = 0; I < 2; ++I) {
|
||||
ThreadLimitL1 = omp_get_thread_limit();
|
||||
#pragma omp parallel reduction(unique32 : ThreadLimitL2)
|
||||
{ ThreadLimitL2 = omp_get_thread_limit(); }
|
||||
}
|
||||
|
||||
// CHECK: SPMD without runtime ThreadLimitL1 = 32
|
||||
printf("SPMD without runtime ThreadLimitL1 = %d\n", ThreadLimitL1);
|
||||
// CHECK: SPMD without runtime ThreadLimitL2 = 32
|
||||
printf("SPMD without runtime ThreadLimitL2 = %d\n", ThreadLimitL2);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,55 +0,0 @@
|
|||
// RUN: %compile-run-and-check
|
||||
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#pragma omp declare target
|
||||
static void putValueInParallel(int *ptr, int value) {
|
||||
#pragma omp parallel
|
||||
{
|
||||
*ptr = value;
|
||||
}
|
||||
}
|
||||
|
||||
static int getId() {
|
||||
int id;
|
||||
putValueInParallel(&id, omp_get_thread_num());
|
||||
return id;
|
||||
}
|
||||
#pragma omp end declare target
|
||||
|
||||
const int MaxThreads = 1024;
|
||||
const int Threads = 64;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int master;
|
||||
int check[MaxThreads];
|
||||
for (int i = 0; i < MaxThreads; i++) {
|
||||
check[i] = 0;
|
||||
}
|
||||
|
||||
#pragma omp target map(master, check[:])
|
||||
{
|
||||
master = getId();
|
||||
|
||||
#pragma omp parallel num_threads(Threads)
|
||||
{
|
||||
check[omp_get_thread_num()] = getId();
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK: master = 0.
|
||||
printf("master = %d.\n", master);
|
||||
// CHECK-NOT: invalid
|
||||
for (int i = 0; i < MaxThreads; i++) {
|
||||
if (i < Threads) {
|
||||
if (check[i] != i) {
|
||||
printf("invalid: check[%d] should be %d, is %d\n", i, i, check[i]);
|
||||
}
|
||||
} else if (check[i] != 0) {
|
||||
printf("invalid: check[%d] should be 0, is %d\n", i, check[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,76 +0,0 @@
|
|||
# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79:
|
||||
# Configuration file for the 'lit' test runner.
|
||||
|
||||
import os
|
||||
import lit.formats
|
||||
|
||||
# Tell pylint that we know config and lit_config exist somewhere.
|
||||
if 'PYLINT_IMPORT' in os.environ:
|
||||
config = object()
|
||||
lit_config = object()
|
||||
|
||||
def prepend_library_path(name, value, sep):
|
||||
if name in config.environment:
|
||||
config.environment[name] = value + sep + config.environment[name]
|
||||
else:
|
||||
config.environment[name] = value
|
||||
|
||||
# name: The name of this test suite.
|
||||
config.name = 'libomptarget-nvptx'
|
||||
|
||||
# suffixes: A list of file extensions to treat as test files.
|
||||
config.suffixes = ['.c', '.cpp', '.cc']
|
||||
|
||||
# test_source_root: The root path where tests are located.
|
||||
config.test_source_root = os.path.dirname(__file__)
|
||||
|
||||
# test_exec_root: The root object directory where output is placed
|
||||
config.test_exec_root = config.binary_dir
|
||||
|
||||
# test format
|
||||
config.test_format = lit.formats.ShTest()
|
||||
|
||||
# compiler flags
|
||||
config.test_flags = " -I " + config.omp_header_directory + \
|
||||
" -L " + config.library_dir
|
||||
|
||||
if config.omp_host_rtl_directory:
|
||||
config.test_flags = config.test_flags + \
|
||||
" -L " + config.omp_host_rtl_directory
|
||||
|
||||
config.test_flags = config.test_flags + " " + config.test_extra_flags
|
||||
|
||||
# Setup environment to find dynamic library at runtime.
|
||||
prepend_library_path('LIBRARY_PATH', config.library_dir, ":")
|
||||
prepend_library_path('LD_LIBRARY_PATH', config.library_dir, ":")
|
||||
prepend_library_path('LD_LIBRARY_PATH', config.omp_host_rtl_directory, ":")
|
||||
if config.cuda_libdir:
|
||||
prepend_library_path('LD_LIBRARY_PATH', config.cuda_libdir, ":")
|
||||
|
||||
# Forbid fallback to host.
|
||||
config.environment["OMP_TARGET_OFFLOAD"] = "MANDATORY"
|
||||
|
||||
# substitutions
|
||||
config.substitutions.append(("%compilexx-run-and-check",
|
||||
"%compilexx-and-run | " + config.libomptarget_filecheck + " %s"))
|
||||
config.substitutions.append(("%compile-run-and-check",
|
||||
"%compile-and-run | " + config.libomptarget_filecheck + " %s"))
|
||||
config.substitutions.append(("%compilexx-and-run", "%compilexx && %run"))
|
||||
config.substitutions.append(("%compile-and-run", "%compile && %run"))
|
||||
|
||||
config.substitutions.append(("%compilexx",
|
||||
"%clangxx %openmp_flags %cuda_flags %flags %s -o %t"))
|
||||
config.substitutions.append(("%compile",
|
||||
"%clang %openmp_flags %cuda_flags %flags %s -o %t"))
|
||||
|
||||
config.substitutions.append(("%clangxx", config.test_cxx_compiler))
|
||||
config.substitutions.append(("%clang", config.test_c_compiler))
|
||||
config.substitutions.append(("%openmp_flags", config.test_openmp_flags))
|
||||
if config.cuda_path:
|
||||
config.substitutions.append(("%cuda_flags", "--cuda-path=" + config.cuda_path))
|
||||
else:
|
||||
config.substitutions.append(("%cuda_flags", ""))
|
||||
config.substitutions.append(("%flags", config.test_flags))
|
||||
|
||||
config.substitutions.append(("%run", "%t"))
|
||||
config.substitutions.append(("%not", config.libomptarget_not))
|
|
@ -1,17 +0,0 @@
|
|||
@AUTO_GEN_COMMENT@
|
||||
|
||||
config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@"
|
||||
config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@"
|
||||
config.test_openmp_flags = "@LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS@"
|
||||
config.test_extra_flags = "@LIBOMPTARGET_NVPTX_TEST_FLAGS@"
|
||||
config.cuda_path = "@CUDA_TOOLKIT_ROOT_DIR@"
|
||||
config.cuda_libdir = "@CUDA_LIBDIR@"
|
||||
config.binary_dir = "@CMAKE_CURRENT_BINARY_DIR@"
|
||||
config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@"
|
||||
config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@"
|
||||
config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@"
|
||||
config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@"
|
||||
config.libomptarget_not = "@OPENMP_NOT_EXECUTABLE@"
|
||||
|
||||
# Let the main config do the real work.
|
||||
lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")
|
|
@ -1,37 +0,0 @@
|
|||
// RUN: %compile-run-and-check
|
||||
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int data, out, flag = 0;
|
||||
#pragma omp target teams num_teams(2) map(tofrom \
|
||||
: out) map(to \
|
||||
: data, flag) \
|
||||
thread_limit(1)
|
||||
#pragma omp parallel num_threads(1)
|
||||
{
|
||||
if (omp_get_team_num() == 0) {
|
||||
/* Write to the data buffer that will be read by thread in team 1 */
|
||||
data = 42;
|
||||
/* Flush data to thread in team 1 */
|
||||
#pragma omp barrier
|
||||
/* Set flag to release thread in team 1 */
|
||||
#pragma omp atomic write
|
||||
flag = 1;
|
||||
} else if (omp_get_team_num() == 1) {
|
||||
/* Loop until we see the update to the flag */
|
||||
int val;
|
||||
do {
|
||||
#pragma omp atomic read
|
||||
val = flag;
|
||||
} while (val < 1);
|
||||
out = data;
|
||||
#pragma omp barrier
|
||||
}
|
||||
}
|
||||
// CHECK: out=42.
|
||||
/* Value of out will be 42 */
|
||||
printf("out=%d.\n", out);
|
||||
return !(out == 42);
|
||||
}
|
|
@ -1,35 +0,0 @@
|
|||
// RUN: %compile-run-and-check
|
||||
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int data, out, flag = 0;
|
||||
#pragma omp target parallel num_threads(64) map(tofrom \
|
||||
: out, flag) map(to \
|
||||
: data)
|
||||
{
|
||||
if (omp_get_thread_num() == 0) {
|
||||
/* Write to the data buffer that will be read by thread */
|
||||
data = 42;
|
||||
/* Flush data to thread 32 */
|
||||
#pragma omp flush(data)
|
||||
/* Set flag to release thread 32 */
|
||||
#pragma omp atomic write
|
||||
flag = 1;
|
||||
} else if (omp_get_thread_num() == 32) {
|
||||
/* Loop until we see the update to the flag */
|
||||
int val;
|
||||
do {
|
||||
#pragma omp atomic read
|
||||
val = flag;
|
||||
} while (val < 1);
|
||||
out = data;
|
||||
#pragma omp flush(out)
|
||||
}
|
||||
}
|
||||
// CHECK: out=42.
|
||||
/* Value of out will be 42 */
|
||||
printf("out=%d.\n", out);
|
||||
return !(out == 42);
|
||||
}
|
|
@ -1,151 +0,0 @@
|
|||
// RUN: %compile-run-and-check
|
||||
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
|
||||
const int MaxThreads = 1024;
|
||||
const int NumThreads = 64;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int level = -1, activeLevel = -1;
|
||||
// The expected value is -1, initialize to a different value.
|
||||
int ancestorTNumNeg = 1, teamSizeNeg = 1;
|
||||
int ancestorTNum0 = -1, teamSize0 = -1;
|
||||
// The expected value is -1, initialize to a different value.
|
||||
int ancestorTNum1 = 1, teamSize1 = 1;
|
||||
int check1[MaxThreads];
|
||||
int check2[MaxThreads];
|
||||
int check3[MaxThreads];
|
||||
int check4[MaxThreads];
|
||||
for (int i = 0; i < MaxThreads; i++) {
|
||||
check1[i] = check2[i] = check3[i] = check4[i] = 0;
|
||||
}
|
||||
|
||||
#pragma omp target map(level, activeLevel, ancestorTNumNeg, teamSizeNeg) \
|
||||
map(ancestorTNum0, teamSize0, ancestorTNum1, teamSize1) \
|
||||
map(check1[:], check2[:], check3[:], check4[:])
|
||||
{
|
||||
level = omp_get_level();
|
||||
activeLevel = omp_get_active_level();
|
||||
|
||||
// Expected to return -1.
|
||||
ancestorTNumNeg = omp_get_ancestor_thread_num(-1);
|
||||
teamSizeNeg = omp_get_team_size(-1);
|
||||
|
||||
// Expected to return 0 and 1.
|
||||
ancestorTNum0 = omp_get_ancestor_thread_num(0);
|
||||
teamSize0 = omp_get_team_size(0);
|
||||
|
||||
// Expected to return -1 because the requested level is larger than
|
||||
// the nest level.
|
||||
ancestorTNum1 = omp_get_ancestor_thread_num(1);
|
||||
teamSize1 = omp_get_team_size(1);
|
||||
|
||||
// Expecting active parallel region.
|
||||
#pragma omp parallel num_threads(NumThreads)
|
||||
{
|
||||
int id = omp_get_thread_num();
|
||||
// Multiply the return value of omp_get_level by 5 so that this test
|
||||
// does not pass if both API calls return wrong values.
|
||||
check1[id] += omp_get_level() * 5 + omp_get_active_level();
|
||||
|
||||
// Expected to return 0 and 1.
|
||||
check2[id] += omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0);
|
||||
// Expected to return the current thread num.
|
||||
check2[id] += (omp_get_ancestor_thread_num(1) - id);
|
||||
// Expected to return the current number of threads.
|
||||
check2[id] += 3 * omp_get_team_size(1);
|
||||
// Expected to return -1, see above.
|
||||
check2[id] += omp_get_ancestor_thread_num(2) + omp_get_team_size(2);
|
||||
|
||||
// Expecting serialized parallel region.
|
||||
#pragma omp parallel
|
||||
{
|
||||
#pragma omp atomic
|
||||
check3[id] += omp_get_level() * 5 + omp_get_active_level();
|
||||
|
||||
// Expected to return 0 and 1.
|
||||
int check4Inc = omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0);
|
||||
// Expected to return the parent thread num.
|
||||
check4Inc += (omp_get_ancestor_thread_num(1) - id);
|
||||
// Expected to return the number of threads in the active parallel region.
|
||||
check4Inc += 3 * omp_get_team_size(1);
|
||||
// Expected to return 0 and 1.
|
||||
check4Inc += omp_get_ancestor_thread_num(2) + 3 * omp_get_team_size(2);
|
||||
// Expected to return -1, see above.
|
||||
check4Inc += omp_get_ancestor_thread_num(3) + omp_get_team_size(3);
|
||||
|
||||
#pragma omp atomic
|
||||
check4[id] += check4Inc;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK: target: level = 0, activeLevel = 0
|
||||
printf("target: level = %d, activeLevel = %d\n", level, activeLevel);
|
||||
// CHECK: level = -1: ancestorTNum = -1, teamSize = -1
|
||||
printf("level = -1: ancestorTNum = %d, teamSize = %d\n", ancestorTNumNeg, teamSizeNeg);
|
||||
// CHECK: level = 0: ancestorTNum = 0, teamSize = 1
|
||||
printf("level = 0: ancestorTNum = %d, teamSize = %d\n", ancestorTNum0, teamSize0);
|
||||
// CHECK: level = 1: ancestorTNum = -1, teamSize = -1
|
||||
printf("level = 1: ancestorTNum = %d, teamSize = %d\n", ancestorTNum1, teamSize1);
|
||||
|
||||
// CHECK-NOT: invalid
|
||||
for (int i = 0; i < MaxThreads; i++) {
|
||||
// Check active parallel region:
|
||||
// omp_get_level() = 1, omp_get_active_level() = 1
|
||||
const int Expected1 = 6;
|
||||
if (i < NumThreads) {
|
||||
if (check1[i] != Expected1) {
|
||||
printf("invalid: check1[%d] should be %d, is %d\n", i, Expected1, check1[i]);
|
||||
}
|
||||
} else if (check1[i] != 0) {
|
||||
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
|
||||
}
|
||||
|
||||
// 5 * 1 + 3 * 64 - 1 - 1 (see above)
|
||||
const int Expected2 = 195;
|
||||
if (i < NumThreads) {
|
||||
if (check2[i] != Expected2) {
|
||||
printf("invalid: check2[%d] should be %d, is %d\n", i, Expected2, check2[i]);
|
||||
}
|
||||
} else if (check2[i] != 0) {
|
||||
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
|
||||
}
|
||||
|
||||
// Check serialized parallel region:
|
||||
// omp_get_level() = 2, omp_get_active_level() = 1
|
||||
const int Expected3 = 11;
|
||||
if (i < NumThreads) {
|
||||
if (check3[i] != Expected3) {
|
||||
printf("invalid: check3[%d] should be %d, is %d\n", i, Expected3, check3[i]);
|
||||
}
|
||||
} else if (check3[i] != 0) {
|
||||
printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
|
||||
}
|
||||
|
||||
// 5 * 1 + 3 * 64 + 3 * 1 - 1 - 1 (see above)
|
||||
const int Expected4 = 198;
|
||||
if (i < NumThreads) {
|
||||
if (check4[i] != Expected4) {
|
||||
printf("invalid: check4[%d] should be %d, is %d\n", i, Expected4, check4[i]);
|
||||
}
|
||||
} else if (check4[i] != 0) {
|
||||
printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Check for parallel level in non-SPMD kernels.
|
||||
level = 0;
|
||||
#pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level)
|
||||
for (int i=0; i<5032; i+=32) {
|
||||
int ub = (i+32 > 5032) ? 5032 : i+32;
|
||||
#pragma omp parallel for schedule(dynamic)
|
||||
for (int j=i ; j < ub; j++) ;
|
||||
level += omp_get_level();
|
||||
}
|
||||
// CHECK: Integral level = 0.
|
||||
printf("Integral level = %d.\n", level);
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,136 +0,0 @@
|
|||
// RUN: %compile-run-and-check
|
||||
|
||||
#include <omp.h>
|
||||
#include <stdio.h>
|
||||
|
||||
const int MaxThreads = 1024;
|
||||
const int NumThreads = 64;
|
||||
const int NumThreads1 = 1;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int inParallel = -1, numThreads = -1, threadNum = -1;
|
||||
int check1[MaxThreads];
|
||||
int check2[MaxThreads];
|
||||
for (int i = 0; i < MaxThreads; i++) {
|
||||
check1[i] = check2[i] = 0;
|
||||
}
|
||||
|
||||
#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
|
||||
{
|
||||
inParallel = omp_in_parallel();
|
||||
numThreads = omp_get_num_threads();
|
||||
threadNum = omp_get_thread_num();
|
||||
|
||||
// Expecting active parallel region.
|
||||
#pragma omp parallel num_threads(NumThreads)
|
||||
{
|
||||
int id = omp_get_thread_num();
|
||||
check1[id] += omp_get_num_threads() + omp_in_parallel();
|
||||
|
||||
// Expecting serialized parallel region.
|
||||
#pragma omp parallel
|
||||
{
|
||||
// Expected to be 1.
|
||||
int nestedInParallel = omp_in_parallel();
|
||||
// Expected to be 1.
|
||||
int nestedNumThreads = omp_get_num_threads();
|
||||
// Expected to be 0.
|
||||
int nestedThreadNum = omp_get_thread_num();
|
||||
#pragma omp atomic
|
||||
check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0
|
||||
printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n",
|
||||
inParallel, numThreads, threadNum);
|
||||
|
||||
// CHECK-NOT: invalid
|
||||
for (int i = 0; i < MaxThreads; i++) {
|
||||
// Check that all threads reported
|
||||
// omp_get_num_threads() = 64, omp_in_parallel() = 1.
|
||||
int Expected = NumThreads + 1;
|
||||
if (i < NumThreads) {
|
||||
if (check1[i] != Expected) {
|
||||
printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
|
||||
check1[i]);
|
||||
}
|
||||
} else if (check1[i] != 0) {
|
||||
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
|
||||
}
|
||||
|
||||
// Check serialized parallel region.
|
||||
if (i < NumThreads) {
|
||||
if (check2[i] != 2) {
|
||||
printf("invalid: check2[%d] should be 2, is %d\n", i, check2[i]);
|
||||
}
|
||||
} else if (check2[i] != 0) {
|
||||
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
|
||||
}
|
||||
}
|
||||
|
||||
inParallel = -1;
|
||||
numThreads = -1;
|
||||
threadNum = -1;
|
||||
for (int i = 0; i < MaxThreads; i++) {
|
||||
check1[i] = check2[i] = 0;
|
||||
}
|
||||
|
||||
#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:])
|
||||
{
|
||||
inParallel = omp_in_parallel();
|
||||
numThreads = omp_get_num_threads();
|
||||
threadNum = omp_get_thread_num();
|
||||
|
||||
// Expecting active parallel region.
|
||||
#pragma omp parallel num_threads(NumThreads1)
|
||||
{
|
||||
int id = omp_get_thread_num();
|
||||
check1[id] += omp_get_num_threads() + omp_in_parallel();
|
||||
|
||||
// Expecting serialized parallel region.
|
||||
#pragma omp parallel
|
||||
{
|
||||
// Expected to be 0.
|
||||
int nestedInParallel = omp_in_parallel();
|
||||
// Expected to be 1.
|
||||
int nestedNumThreads = omp_get_num_threads();
|
||||
// Expected to be 0.
|
||||
int nestedThreadNum = omp_get_thread_num();
|
||||
#pragma omp atomic
|
||||
check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0
|
||||
printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n",
|
||||
inParallel, numThreads, threadNum);
|
||||
|
||||
// CHECK-NOT: invalid
|
||||
for (int i = 0; i < MaxThreads; i++) {
|
||||
// Check that all threads reported
|
||||
// omp_get_num_threads() = 1, omp_in_parallel() = 0.
|
||||
int Expected = 1;
|
||||
if (i < NumThreads1) {
|
||||
if (check1[i] != Expected) {
|
||||
printf("invalid: check1[%d] should be %d, is %d\n", i, Expected,
|
||||
check1[i]);
|
||||
}
|
||||
} else if (check1[i] != 0) {
|
||||
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
|
||||
}
|
||||
|
||||
// Check serialized parallel region.
|
||||
if (i < NumThreads1) {
|
||||
if (check2[i] != 1) {
|
||||
printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]);
|
||||
}
|
||||
} else if (check2[i] != 0) {
|
||||
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,102 +0,0 @@
|
|||
// RUN: %compile-run-and-check
|
||||
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
const int WarpSize = 32;
|
||||
const int NumThreads1 = 1 * WarpSize;
|
||||
const int NumThreads2 = 2 * WarpSize;
|
||||
const int NumThreads3 = 3 * WarpSize;
|
||||
const int MaxThreads = 1024;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int check1[MaxThreads];
|
||||
int check2[MaxThreads];
|
||||
int check3[MaxThreads];
|
||||
int check4[MaxThreads];
|
||||
for (int i = 0; i < MaxThreads; i++) {
|
||||
check1[i] = check2[i] = check3[i] = check4[i] = 0;
|
||||
}
|
||||
|
||||
int maxThreads1 = -1;
|
||||
int maxThreads2 = -1;
|
||||
int maxThreads3 = -1;
|
||||
|
||||
#pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \
|
||||
map(maxThreads1, maxThreads2, maxThreads3)
|
||||
{
|
||||
#pragma omp parallel num_threads(NumThreads1)
|
||||
{
|
||||
check1[omp_get_thread_num()] += omp_get_num_threads();
|
||||
}
|
||||
|
||||
// API method to set number of threads in parallel regions without
|
||||
// num_threads() clause.
|
||||
omp_set_num_threads(NumThreads2);
|
||||
maxThreads1 = omp_get_max_threads();
|
||||
#pragma omp parallel
|
||||
{
|
||||
check2[omp_get_thread_num()] += omp_get_num_threads();
|
||||
}
|
||||
|
||||
maxThreads2 = omp_get_max_threads();
|
||||
|
||||
// num_threads() clause should override nthreads-var ICV.
|
||||
#pragma omp parallel num_threads(NumThreads3)
|
||||
{
|
||||
check3[omp_get_thread_num()] += omp_get_num_threads();
|
||||
}
|
||||
|
||||
maxThreads3 = omp_get_max_threads();
|
||||
|
||||
// Effect from omp_set_num_threads() should still be visible.
|
||||
#pragma omp parallel
|
||||
{
|
||||
check4[omp_get_thread_num()] += omp_get_num_threads();
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK: maxThreads1 = 64
|
||||
printf("maxThreads1 = %d\n", maxThreads1);
|
||||
// CHECK: maxThreads2 = 64
|
||||
printf("maxThreads2 = %d\n", maxThreads2);
|
||||
// CHECK: maxThreads3 = 64
|
||||
printf("maxThreads3 = %d\n", maxThreads3);
|
||||
|
||||
// CHECK-NOT: invalid
|
||||
for (int i = 0; i < MaxThreads; i++) {
|
||||
if (i < NumThreads1) {
|
||||
if (check1[i] != NumThreads1) {
|
||||
printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]);
|
||||
}
|
||||
} else if (check1[i] != 0) {
|
||||
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
|
||||
}
|
||||
|
||||
if (i < NumThreads2) {
|
||||
if (check2[i] != NumThreads2) {
|
||||
printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]);
|
||||
}
|
||||
} else if (check2[i] != 0) {
|
||||
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
|
||||
}
|
||||
|
||||
if (i < NumThreads3) {
|
||||
if (check3[i] != NumThreads3) {
|
||||
printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]);
|
||||
}
|
||||
} else if (check3[i] != 0) {
|
||||
printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
|
||||
}
|
||||
|
||||
if (i < NumThreads2) {
|
||||
if (check4[i] != NumThreads2) {
|
||||
printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]);
|
||||
}
|
||||
} else if (check4[i] != 0) {
|
||||
printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,51 +0,0 @@
|
|||
// RUN: %compilexx-run-and-check
|
||||
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
int main(void) {
|
||||
int isHost = -1;
|
||||
int ParallelLevel1 = -1, ParallelLevel2 = -1;
|
||||
int Count = 0;
|
||||
|
||||
#pragma omp target parallel for map(tofrom \
|
||||
: isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1)
|
||||
for (int J = 0; J < 10; ++J) {
|
||||
#pragma omp critical
|
||||
{
|
||||
isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost;
|
||||
ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1)
|
||||
? omp_get_level()
|
||||
: ParallelLevel1;
|
||||
}
|
||||
if (omp_get_thread_num() > 5) {
|
||||
int L2;
|
||||
#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count)
|
||||
for (int I = 0; I < 10; ++I) {
|
||||
L2 = omp_get_level();
|
||||
Count += omp_get_level(); // (10-6)*10*2 = 80
|
||||
}
|
||||
#pragma omp critical
|
||||
ParallelLevel2 =
|
||||
(ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2;
|
||||
} else {
|
||||
Count += omp_get_level(); // 6 * 1 = 6
|
||||
}
|
||||
}
|
||||
|
||||
if (isHost < 0) {
|
||||
printf("Runtime error, isHost=%d\n", isHost);
|
||||
}
|
||||
|
||||
// CHECK: Target region executed on the device
|
||||
printf("Target region executed on the %s\n", isHost ? "host" : "device");
|
||||
// CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2
|
||||
printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1,
|
||||
ParallelLevel2);
|
||||
// Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par
|
||||
// level) + 6(num of iterations) * 1(par level)
|
||||
// CHECK: Expected count = 86
|
||||
printf("Expected count = %d\n", Count);
|
||||
|
||||
return isHost;
|
||||
}
|
|
@ -1,77 +0,0 @@
|
|||
// RUN: %compile-run-and-check
|
||||
|
||||
#include <stdio.h>
|
||||
#include <omp.h>
|
||||
|
||||
const int WarpSize = 32;
|
||||
const int ThreadLimit = 1 * WarpSize;
|
||||
const int NumThreads2 = 2 * WarpSize;
|
||||
const int NumThreads3 = 3 * WarpSize;
|
||||
const int MaxThreads = 1024;
|
||||
|
||||
int main(int argc, char *argv[]) {
|
||||
int check1[MaxThreads];
|
||||
int check2[MaxThreads];
|
||||
int check3[MaxThreads];
|
||||
for (int i = 0; i < MaxThreads; i++) {
|
||||
check1[i] = check2[i] = check3[i] = 0;
|
||||
}
|
||||
|
||||
int threadLimit = -1;
|
||||
|
||||
#pragma omp target teams num_teams(1) thread_limit(ThreadLimit) \
|
||||
map(check1[:], check2[:], check3[:], threadLimit)
|
||||
{
|
||||
threadLimit = omp_get_thread_limit();
|
||||
|
||||
// All parallel regions should get as many threads as specified by the
|
||||
// thread_limit() clause.
|
||||
#pragma omp parallel
|
||||
{
|
||||
check1[omp_get_thread_num()] += omp_get_num_threads();
|
||||
}
|
||||
|
||||
omp_set_num_threads(NumThreads2);
|
||||
#pragma omp parallel
|
||||
{
|
||||
check2[omp_get_thread_num()] += omp_get_num_threads();
|
||||
}
|
||||
|
||||
#pragma omp parallel num_threads(NumThreads3)
|
||||
{
|
||||
check3[omp_get_thread_num()] += omp_get_num_threads();
|
||||
}
|
||||
}
|
||||
|
||||
// CHECK: threadLimit = 32
|
||||
printf("threadLimit = %d\n", threadLimit);
|
||||
|
||||
// CHECK-NOT: invalid
|
||||
for (int i = 0; i < MaxThreads; i++) {
|
||||
if (i < ThreadLimit) {
|
||||
if (check1[i] != ThreadLimit) {
|
||||
printf("invalid: check1[%d] should be %d, is %d\n", i, ThreadLimit, check1[i]);
|
||||
}
|
||||
} else if (check1[i] != 0) {
|
||||
printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]);
|
||||
}
|
||||
|
||||
if (i < ThreadLimit) {
|
||||
if (check2[i] != ThreadLimit) {
|
||||
printf("invalid: check2[%d] should be %d, is %d\n", i, ThreadLimit, check2[i]);
|
||||
}
|
||||
} else if (check2[i] != 0) {
|
||||
printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]);
|
||||
}
|
||||
|
||||
if (i < ThreadLimit) {
|
||||
if (check3[i] != ThreadLimit) {
|
||||
printf("invalid: check3[%d] should be %d, is %d\n", i, ThreadLimit, check3[i]);
|
||||
}
|
||||
} else if (check3[i] != 0) {
|
||||
printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,22 +0,0 @@
// RUN: %compile-run-and-check

#include <omp.h>
#include <stdio.h>

int main() {
  int res = 0;

#pragma omp parallel num_threads(2) reduction(+:res)
  {
    int tid = omp_get_thread_num();
#pragma omp target teams distribute reduction(+:res)
    for (int i = tid; i < 2; i++)
      ++res;
  }
  // The first thread makes 2 iterations, the second makes 1. The expected
  // result of the reduction res is 3.

  // CHECK: res = 3.
  printf("res = %d.\n", res);
  return 0;
}
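A host-only sketch of the same counting, assuming the offload constructs are dropped and only the loop bounds matter:

// Thread 0 covers i = 0 and 1; thread 1 covers only i = 1: 2 + 1 = 3.
#include <stdio.h>

int main(void) {
  int res = 0;
  for (int tid = 0; tid < 2; tid++)
    for (int i = tid; i < 2; i++)
      ++res;
  printf("res = %d.\n", res); // prints: res = 3.
  return 0;
}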
@ -1,78 +0,0 @@
//===------------- target_interface.h - Target interfaces --------- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file contains interfaces that must be implemented by each target.
//
//===----------------------------------------------------------------------===//

#ifndef _OMPTARGET_TARGET_INTERFACE_H_
#define _OMPTARGET_TARGET_INTERFACE_H_

#include <stdint.h>

#include "DeviceEnvironment.h"
#include "target_impl.h"

// Calls to the NVPTX layer (assuming 1D layout)
EXTERN int __kmpc_get_hardware_thread_id_in_block();
EXTERN int GetBlockIdInKernel();
EXTERN NOINLINE int __kmpc_get_hardware_num_blocks();
EXTERN NOINLINE int __kmpc_get_hardware_num_threads_in_block();
EXTERN unsigned __kmpc_get_warp_size();
EXTERN unsigned GetWarpId();
EXTERN unsigned GetLaneId();

// Atomics
uint32_t __kmpc_atomic_add(uint32_t *, uint32_t);
uint32_t __kmpc_atomic_inc(uint32_t *, uint32_t);
uint32_t __kmpc_atomic_max(uint32_t *, uint32_t);
uint32_t __kmpc_atomic_exchange(uint32_t *, uint32_t);
uint32_t __kmpc_atomic_cas(uint32_t *, uint32_t, uint32_t);
static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "");
unsigned long long __kmpc_atomic_exchange(unsigned long long *,
                                          unsigned long long);
unsigned long long __kmpc_atomic_add(unsigned long long *, unsigned long long);

// Locks
EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock);
EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock);
EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock);

EXTERN void __kmpc_impl_threadfence();
EXTERN void __kmpc_impl_threadfence_block();
EXTERN void __kmpc_impl_threadfence_system();

EXTERN double __kmpc_impl_get_wtick();
EXTERN double __kmpc_impl_get_wtime();

EXTERN void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi);
EXTERN uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi);
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt();
EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt();
EXTERN uint32_t __kmpc_impl_smid();

EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask();

EXTERN void __kmpc_impl_syncthreads();
EXTERN void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask);

// Kernel initialization
EXTERN void __kmpc_impl_target_init();

// Memory
EXTERN void *__kmpc_impl_malloc(size_t);
EXTERN void __kmpc_impl_free(void *);

// Barrier until num_threads arrive.
EXTERN void __kmpc_impl_named_sync(uint32_t num_threads);

extern DeviceEnvironmentTy omptarget_device_environment;

#endif // _OMPTARGET_TARGET_INTERFACE_H_
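For context on the per-target contract this deleted header imposed, here is a plausible definition of its pack/unpack pair. This is a sketch only; in the old runtime each target supplied its own definitions via target_impl.

// One possible implementation of the 64-bit <-> 2x32-bit interface above.
#include <stdint.h>

void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
  lo = (uint32_t)(val & 0xFFFFFFFFu); // low 32 bits
  hi = (uint32_t)(val >> 32);         // high 32 bits
}

uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
  return ((uint64_t)hi << 32) | (uint64_t)lo;
}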
@ -118,6 +118,6 @@ if (${amdgpu_arch_result})
  libomptarget_say("Not generating amdgcn test targets as amdgpu-arch exited with ${amdgpu_arch_result}")
else()
  # Report to the parent scope that we are building a plugin for amdgpu
  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa-newRTL " PARENT_SCOPE)
  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} amdgcn-amd-amdhsa " PARENT_SCOPE)
endif()
@ -72,7 +72,7 @@ target_link_libraries(omptarget.rtl.cuda
# Otherwise this plugin is being built speculatively and there may be no cuda available
if (LIBOMPTARGET_CAN_LINK_LIBCUDA OR LIBOMPTARGET_FORCE_DLOPEN_LIBCUDA)
  libomptarget_say("Enable tests using CUDA plugin")
  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda-newRTL nvptx64-nvidia-cuda-newDriver" PARENT_SCOPE)
  set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda nvptx64-nvidia-cuda-newDriver" PARENT_SCOPE)
else()
  libomptarget_say("Disabling tests using CUDA plugin as cuda may not be available")
endif()
@ -1,4 +1,4 @@
// RUN: %libomptarget-compile-nvptx64-nvidia-cuda -fopenmp-target-new-runtime
// RUN: %libomptarget-compile-nvptx64-nvidia-cuda
// RUN: env LIBOMPTARGET_SHARED_MEMORY_SIZE=256 \
// RUN: %libomptarget-run-nvptx64-nvidia-cuda | %fcheck-nvptx64-nvidia-cuda
// REQUIRES: nvptx64-nvidia-cuda
@ -104,17 +104,11 @@ else: # Unices
    config.test_flags += " --libomptarget-amdgcn-bc-path=" + config.library_dir
  if config.libomptarget_current_target.startswith('nvptx'):
    config.test_flags += " --libomptarget-nvptx-bc-path=" + config.library_dir
  if config.libomptarget_current_target.endswith('-newRTL'):
    config.test_flags += " -fopenmp-target-new-runtime"
  elif not config.libomptarget_current_target.endswith('-newDriver'):
    config.test_flags += " -fno-openmp-target-new-runtime"
  if config.libomptarget_current_target.endswith('-newDriver'):
    config.test_flags += " -fopenmp-new-driver"

def remove_newRTL_suffix_if_present(name):
  if name.endswith('-newRTL'):
    return name[:-7]
  elif name.endswith('-newDriver'):
def remove_suffix_if_present(name):
  if name.endswith('-newDriver'):
    return name[:-10]
  else:
    return name
@ -183,10 +177,10 @@ for libomptarget_target in config.libomptarget_all_targets:
      "%not --crash %t"))
    config.substitutions.append(("%clangxx-" + libomptarget_target, \
      "%clangxx %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\
      remove_newRTL_suffix_if_present(libomptarget_target)))
      remove_suffix_if_present(libomptarget_target)))
    config.substitutions.append(("%clang-" + libomptarget_target, \
      "%clang %openmp_flags %cuda_flags %flags -fopenmp-targets=" +\
      remove_newRTL_suffix_if_present(libomptarget_target)))
      remove_suffix_if_present(libomptarget_target)))
    config.substitutions.append(("%fcheck-" + libomptarget_target, \
      config.libomptarget_filecheck + " %s"))
  else:
@ -2,7 +2,6 @@

// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

#include <stdio.h>
@ -2,7 +2,6 @@

// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

#include <cstdio>
#include <cstdlib>
@ -2,7 +2,6 @@

// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

#include <cstdio>
#include <cstdlib>
@ -2,7 +2,6 @@

// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

#include <stdio.h>
#include <stdint.h>
@ -1,7 +1,7 @@
// RUN: %libomptarget-compilexx-run-and-check-generic

// Error on the gpu that crashes the host
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: amdgcn-amd-amdhsa

#include <iostream>
@ -3,7 +3,6 @@

// Wrong results on amdgpu
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

#include <omp.h>
#include <stdio.h>
@ -2,7 +2,6 @@

// Hangs
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: amdgcn-amd-amdhsa-newDriver

#include <iostream>
@ -2,7 +2,6 @@

// Currently hangs on amdgpu
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: x86_64-pc-linux-gnu

#include <cassert>
@ -34,7 +34,6 @@

// Hangs
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL
// UNSUPPORTED: amdgcn-amd-amdhsa-newDriver

#if ADD_REDUCTION
@ -2,7 +2,6 @@

// Fails in DAGToDAG on an address space problem
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL

#include <cmath>
#include <cstdio>
@ -9,7 +9,6 @@

// amdgpu does not have a working printf definition
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

#include <stdio.h>
#include <omp.h>
@ -1,11 +1,10 @@
// RUN: %libomptarget-compile-run-and-check-generic
// XFAIL: nvptx64-nvidia-cuda
// XFAIL: nvptx64-nvidia-cuda-newRTL
// XFAIL: nvptx64-nvidia-cuda
// XFAIL: nvptx64-nvidia-cuda-newDriver

// Fails on amdgpu with error: GPU Memory Error
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL
// XFAIL: amdgcn-amd-amdhsa-newDriver

#include <stdio.h>
@ -5,7 +5,6 @@

// Fails on amdgpu with error: GPU Memory Error
// XFAIL: amdgcn-amd-amdhsa
// XFAIL: amdgcn-amd-amdhsa-newRTL

#include <omp.h>
#include <stdio.h>
@ -5,7 +5,6 @@

// amdgpu runtime crash
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL


#include <omp.h>
@ -4,7 +4,6 @@

// amdgpu runtime crash
// UNSUPPORTED: amdgcn-amd-amdhsa
// UNSUPPORTED: amdgcn-amd-amdhsa-newRTL

#include <stdio.h>
#include <omp.h>