forked from OSchip/llvm-project
[cuda] Driver changes to compile and stitch together host and device-side CUDA code.
NOTE: reverts r242077 to reinstate r242058, r242065, r242067 and includes fix for OS X test failures. - Changed driver pipeline to compile host and device side of CUDA files and incorporate results of device-side compilation into host object file. - Added a test for cuda pipeline creation in clang driver. New clang options: --cuda-host-only - Do host-side compilation only. --cuda-device-only - Do device-side compilation only. --cuda-gpu-arch=<ARCH> - specify GPU architecture for device-side compilation. E.g. sm_35, sm_30. Default is sm_20. May be used more than once in which case one device-compilation will be done per unique specified GPU architecture. Differential Revision: http://reviews.llvm.org/D9509 llvm-svn: 242085
This commit is contained in:
parent
2eacca86ef
commit
0ff05cd165
|
@ -41,6 +41,8 @@ public:
|
|||
enum ActionClass {
|
||||
InputClass = 0,
|
||||
BindArchClass,
|
||||
CudaDeviceClass,
|
||||
CudaHostClass,
|
||||
PreprocessJobClass,
|
||||
PrecompileJobClass,
|
||||
AnalyzeJobClass,
|
||||
|
@ -133,6 +135,41 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
class CudaDeviceAction : public Action {
|
||||
virtual void anchor();
|
||||
/// GPU architecture to bind -- e.g 'sm_35'.
|
||||
const char *GpuArchName;
|
||||
/// True when action results are not consumed by the host action (e.g when
|
||||
/// -fsyntax-only or --cuda-device-only options are used).
|
||||
bool AtTopLevel;
|
||||
|
||||
public:
|
||||
CudaDeviceAction(std::unique_ptr<Action> Input, const char *ArchName,
|
||||
bool AtTopLevel);
|
||||
|
||||
const char *getGpuArchName() const { return GpuArchName; }
|
||||
bool isAtTopLevel() const { return AtTopLevel; }
|
||||
|
||||
static bool classof(const Action *A) {
|
||||
return A->getKind() == CudaDeviceClass;
|
||||
}
|
||||
};
|
||||
|
||||
class CudaHostAction : public Action {
|
||||
virtual void anchor();
|
||||
ActionList DeviceActions;
|
||||
|
||||
public:
|
||||
CudaHostAction(std::unique_ptr<Action> Input,
|
||||
const ActionList &DeviceActions);
|
||||
~CudaHostAction() override;
|
||||
|
||||
ActionList &getDeviceActions() { return DeviceActions; }
|
||||
const ActionList &getDeviceActions() const { return DeviceActions; }
|
||||
|
||||
static bool classof(const Action *A) { return A->getKind() == CudaHostClass; }
|
||||
};
|
||||
|
||||
class JobAction : public Action {
|
||||
virtual void anchor();
|
||||
protected:
|
||||
|
|
|
@ -351,6 +351,12 @@ def cxx_isystem : JoinedOrSeparate<["-"], "cxx-isystem">, Group<clang_i_Group>,
|
|||
MetaVarName<"<directory>">;
|
||||
def c : Flag<["-"], "c">, Flags<[DriverOption]>,
|
||||
HelpText<"Only run preprocess, compile, and assemble steps">;
|
||||
def cuda_device_only : Flag<["--"], "cuda-device-only">,
|
||||
HelpText<"Do device-side CUDA compilation only">;
|
||||
def cuda_gpu_arch_EQ : Joined<["--"], "cuda-gpu-arch=">,
|
||||
Flags<[DriverOption, HelpHidden]>, HelpText<"CUDA GPU architecture">;
|
||||
def cuda_host_only : Flag<["--"], "cuda-host-only">,
|
||||
HelpText<"Do host-side CUDA compilation only">;
|
||||
def dA : Flag<["-"], "dA">, Group<d_Group>;
|
||||
def dD : Flag<["-"], "dD">, Group<d_Group>, Flags<[CC1Option]>,
|
||||
HelpText<"Print macro definitions in -E mode in addition to normal output">;
|
||||
|
|
|
@ -44,6 +44,7 @@ TYPE("c", C, PP_C, "c", "u")
|
|||
TYPE("cl", CL, PP_C, "cl", "u")
|
||||
TYPE("cuda-cpp-output", PP_CUDA, INVALID, "cui", "u")
|
||||
TYPE("cuda", CUDA, PP_CUDA, "cu", "u")
|
||||
TYPE("cuda", CUDA_DEVICE, PP_CUDA, "cu", "")
|
||||
TYPE("objective-c-cpp-output", PP_ObjC, INVALID, "mi", "u")
|
||||
TYPE("objc-cpp-output", PP_ObjC_Alias, INVALID, "mi", "u")
|
||||
TYPE("objective-c", ObjC, PP_ObjC, "m", "u")
|
||||
|
|
|
@ -63,6 +63,9 @@ namespace types {
|
|||
/// isCXX - Is this a "C++" input (C++ and Obj-C++ sources and headers).
|
||||
bool isCXX(ID Id);
|
||||
|
||||
/// isCuda - Is this a CUDA input.
|
||||
bool isCuda(ID Id);
|
||||
|
||||
/// isObjC - Is this an "ObjC" input (Obj-C and Obj-C++ sources and headers).
|
||||
bool isObjC(ID Id);
|
||||
|
||||
|
|
|
@ -24,6 +24,8 @@ const char *Action::getClassName(ActionClass AC) {
|
|||
switch (AC) {
|
||||
case InputClass: return "input";
|
||||
case BindArchClass: return "bind-arch";
|
||||
case CudaDeviceClass: return "cuda-device";
|
||||
case CudaHostClass: return "cuda-host";
|
||||
case PreprocessJobClass: return "preprocessor";
|
||||
case PrecompileJobClass: return "precompiler";
|
||||
case AnalyzeJobClass: return "analyzer";
|
||||
|
@ -53,6 +55,25 @@ BindArchAction::BindArchAction(std::unique_ptr<Action> Input,
|
|||
const char *_ArchName)
|
||||
: Action(BindArchClass, std::move(Input)), ArchName(_ArchName) {}
|
||||
|
||||
void CudaDeviceAction::anchor() {}
|
||||
|
||||
CudaDeviceAction::CudaDeviceAction(std::unique_ptr<Action> Input,
|
||||
const char *ArchName, bool AtTopLevel)
|
||||
: Action(CudaDeviceClass, std::move(Input)), GpuArchName(ArchName),
|
||||
AtTopLevel(AtTopLevel) {}
|
||||
|
||||
void CudaHostAction::anchor() {}
|
||||
|
||||
CudaHostAction::CudaHostAction(std::unique_ptr<Action> Input,
|
||||
const ActionList &_DeviceActions)
|
||||
: Action(CudaHostClass, std::move(Input)), DeviceActions(_DeviceActions) {}
|
||||
|
||||
CudaHostAction::~CudaHostAction() {
|
||||
for (iterator it = DeviceActions.begin(), ie = DeviceActions.end(); it != ie;
|
||||
++it)
|
||||
delete *it;
|
||||
}
|
||||
|
||||
void JobAction::anchor() {}
|
||||
|
||||
JobAction::JobAction(ActionClass Kind, std::unique_ptr<Action> Input,
|
||||
|
|
|
@ -174,8 +174,10 @@ phases::ID Driver::getFinalPhase(const DerivedArgList &DAL,
|
|||
} else if ((PhaseArg = DAL.getLastArg(options::OPT_S))) {
|
||||
FinalPhase = phases::Backend;
|
||||
|
||||
// -c only runs up to the assembler.
|
||||
} else if ((PhaseArg = DAL.getLastArg(options::OPT_c))) {
|
||||
// -c and partial CUDA compilations only run up to the assembler.
|
||||
} else if ((PhaseArg = DAL.getLastArg(options::OPT_c)) ||
|
||||
(PhaseArg = DAL.getLastArg(options::OPT_cuda_device_only)) ||
|
||||
(PhaseArg = DAL.getLastArg(options::OPT_cuda_host_only))) {
|
||||
FinalPhase = phases::Assemble;
|
||||
|
||||
// Otherwise do everything.
|
||||
|
@ -900,9 +902,20 @@ static unsigned PrintActions1(const Compilation &C, Action *A,
|
|||
} else if (BindArchAction *BIA = dyn_cast<BindArchAction>(A)) {
|
||||
os << '"' << BIA->getArchName() << '"' << ", {"
|
||||
<< PrintActions1(C, *BIA->begin(), Ids) << "}";
|
||||
} else if (CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
|
||||
os << '"' << CDA->getGpuArchName() << '"' << ", {"
|
||||
<< PrintActions1(C, *CDA->begin(), Ids) << "}";
|
||||
} else {
|
||||
ActionList *AL;
|
||||
if (CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
|
||||
os << "{" << PrintActions1(C, *CHA->begin(), Ids) << "}"
|
||||
<< ", gpu binaries ";
|
||||
AL = &CHA->getDeviceActions();
|
||||
} else
|
||||
AL = &A->getInputs();
|
||||
|
||||
const char *Prefix = "{";
|
||||
for (Action *PreRequisite : *A) {
|
||||
for (Action *PreRequisite : *AL) {
|
||||
os << Prefix << PrintActions1(C, PreRequisite, Ids);
|
||||
Prefix = ", ";
|
||||
}
|
||||
|
@ -1215,6 +1228,93 @@ void Driver::BuildInputs(const ToolChain &TC, DerivedArgList &Args,
|
|||
}
|
||||
}
|
||||
|
||||
// For each unique --cuda-gpu-arch= argument creates a TY_CUDA_DEVICE input
|
||||
// action and then wraps each in CudaDeviceAction paired with appropriate GPU
|
||||
// arch name. If we're only building device-side code, each action remains
|
||||
// independent. Otherwise we pass device-side actions as inputs to a new
|
||||
// CudaHostAction which combines both host and device side actions.
|
||||
static std::unique_ptr<Action>
|
||||
buildCudaActions(const Driver &D, const ToolChain &TC, DerivedArgList &Args,
|
||||
const Arg *InputArg, const types::ID InputType,
|
||||
std::unique_ptr<Action> Current, ActionList &Actions) {
|
||||
|
||||
assert(InputType == types::TY_CUDA &&
|
||||
"CUDA Actions only apply to CUDA inputs.");
|
||||
|
||||
// Collect all cuda_gpu_arch parameters, removing duplicates.
|
||||
SmallVector<const char *, 4> GpuArchList;
|
||||
llvm::StringSet<> GpuArchNames;
|
||||
for (Arg *A : Args) {
|
||||
if (A->getOption().matches(options::OPT_cuda_gpu_arch_EQ)) {
|
||||
A->claim();
|
||||
if (GpuArchNames.insert(A->getValue()).second)
|
||||
GpuArchList.push_back(A->getValue());
|
||||
}
|
||||
}
|
||||
|
||||
// Default to sm_20 which is the lowest common denominator for supported GPUs.
|
||||
// sm_20 code should work correctly, if suboptimally, on all newer GPUs.
|
||||
if (GpuArchList.empty())
|
||||
GpuArchList.push_back("sm_20");
|
||||
|
||||
// Replicate inputs for each GPU architecture.
|
||||
Driver::InputList CudaDeviceInputs;
|
||||
for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
|
||||
CudaDeviceInputs.push_back(std::make_pair(types::TY_CUDA_DEVICE, InputArg));
|
||||
|
||||
// Build actions for all device inputs.
|
||||
ActionList CudaDeviceActions;
|
||||
D.BuildActions(TC, Args, CudaDeviceInputs, CudaDeviceActions);
|
||||
assert(GpuArchList.size() == CudaDeviceActions.size() &&
|
||||
"Failed to create actions for all devices");
|
||||
|
||||
// Check whether any of device actions stopped before they could generate PTX.
|
||||
bool PartialCompilation = false;
|
||||
bool DeviceOnlyCompilation = Args.hasArg(options::OPT_cuda_device_only);
|
||||
for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i) {
|
||||
if (CudaDeviceActions[i]->getKind() != Action::BackendJobClass) {
|
||||
PartialCompilation = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Figure out what to do with device actions -- pass them as inputs to the
|
||||
// host action or run each of them independently.
|
||||
if (PartialCompilation || DeviceOnlyCompilation) {
|
||||
// In case of partial or device-only compilation results of device actions
|
||||
// are not consumed by the host action device actions have to be added to
|
||||
// top-level actions list with AtTopLevel=true and run independently.
|
||||
|
||||
// -o is ambiguous if we have more than one top-level action.
|
||||
if (Args.hasArg(options::OPT_o) &&
|
||||
(!DeviceOnlyCompilation || GpuArchList.size() > 1)) {
|
||||
D.Diag(clang::diag::err_drv_output_argument_with_multiple_files);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
|
||||
Actions.push_back(
|
||||
new CudaDeviceAction(std::unique_ptr<Action>(CudaDeviceActions[i]),
|
||||
GpuArchList[i], /* AtTopLevel */ true));
|
||||
// Kill host action in case of device-only compilation.
|
||||
if (DeviceOnlyCompilation)
|
||||
Current.reset(nullptr);
|
||||
return Current;
|
||||
} else {
|
||||
// Outputs of device actions during complete CUDA compilation get created
|
||||
// with AtTopLevel=false and become inputs for the host action.
|
||||
ActionList DeviceActions;
|
||||
for (unsigned i = 0, e = GpuArchList.size(); i != e; ++i)
|
||||
DeviceActions.push_back(
|
||||
new CudaDeviceAction(std::unique_ptr<Action>(CudaDeviceActions[i]),
|
||||
GpuArchList[i], /* AtTopLevel */ false));
|
||||
// Return a new host action that incorporates original host action and all
|
||||
// device actions.
|
||||
return std::unique_ptr<Action>(
|
||||
new CudaHostAction(std::move(Current), DeviceActions));
|
||||
}
|
||||
}
|
||||
|
||||
void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
|
||||
const InputList &Inputs, ActionList &Actions) const {
|
||||
llvm::PrettyStackTraceString CrashInfo("Building compilation actions");
|
||||
|
@ -1312,6 +1412,25 @@ void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
|
|||
continue;
|
||||
}
|
||||
|
||||
phases::ID CudaInjectionPhase;
|
||||
if (isSaveTempsEnabled()) {
|
||||
// All phases are done independently, inject GPU blobs during compilation
|
||||
// phase as that's where we generate glue code to init them.
|
||||
CudaInjectionPhase = phases::Compile;
|
||||
} else {
|
||||
// Assumes that clang does everything up until linking phase, so we inject
|
||||
// cuda device actions at the last step before linking. Otherwise CUDA
|
||||
// host action forces preprocessor into a separate invocation.
|
||||
if (FinalPhase == phases::Link) {
|
||||
for (auto i = PL.begin(), e = PL.end(); i != e; ++i) {
|
||||
auto next = i + 1;
|
||||
if (next != e && *next == phases::Link)
|
||||
CudaInjectionPhase = *i;
|
||||
}
|
||||
} else
|
||||
CudaInjectionPhase = FinalPhase;
|
||||
}
|
||||
|
||||
// Build the pipeline for this file.
|
||||
std::unique_ptr<Action> Current(new InputAction(*InputArg, InputType));
|
||||
for (SmallVectorImpl<phases::ID>::iterator i = PL.begin(), e = PL.end();
|
||||
|
@ -1337,6 +1456,15 @@ void Driver::BuildActions(const ToolChain &TC, DerivedArgList &Args,
|
|||
|
||||
// Otherwise construct the appropriate action.
|
||||
Current = ConstructPhaseAction(TC, Args, Phase, std::move(Current));
|
||||
|
||||
if (InputType == types::TY_CUDA && Phase == CudaInjectionPhase &&
|
||||
!Args.hasArg(options::OPT_cuda_host_only)) {
|
||||
Current = buildCudaActions(*this, TC, Args, InputArg, InputType,
|
||||
std::move(Current), Actions);
|
||||
if (!Current)
|
||||
break;
|
||||
}
|
||||
|
||||
if (Current->getType() == types::TY_Nothing)
|
||||
break;
|
||||
}
|
||||
|
@ -1576,7 +1704,13 @@ static const Tool *SelectToolForJob(Compilation &C, bool SaveTemps,
|
|||
if (isa<BackendJobAction>(JA)) {
|
||||
// Check if the compiler supports emitting LLVM IR.
|
||||
assert(Inputs->size() == 1);
|
||||
JobAction *CompileJA = cast<CompileJobAction>(*Inputs->begin());
|
||||
JobAction *CompileJA;
|
||||
// Extract real host action, if it's a CudaHostAction.
|
||||
if (CudaHostAction *CudaHA = dyn_cast<CudaHostAction>(*Inputs->begin()))
|
||||
CompileJA = cast<CompileJobAction>(*CudaHA->begin());
|
||||
else
|
||||
CompileJA = cast<CompileJobAction>(*Inputs->begin());
|
||||
|
||||
const Tool *Compiler = TC->SelectTool(*CompileJA);
|
||||
if (!Compiler)
|
||||
return nullptr;
|
||||
|
@ -1610,6 +1744,20 @@ void Driver::BuildJobsForAction(Compilation &C, const Action *A,
|
|||
InputInfo &Result) const {
|
||||
llvm::PrettyStackTraceString CrashInfo("Building compilation jobs");
|
||||
|
||||
InputInfoList CudaDeviceInputInfos;
|
||||
if (const CudaHostAction *CHA = dyn_cast<CudaHostAction>(A)) {
|
||||
InputInfo II;
|
||||
// Append outputs of device jobs to the input list.
|
||||
for (const Action *DA : CHA->getDeviceActions()) {
|
||||
BuildJobsForAction(C, DA, TC, "", AtTopLevel,
|
||||
/*MultipleArchs*/ false, LinkingOutput, II);
|
||||
CudaDeviceInputInfos.push_back(II);
|
||||
}
|
||||
// Override current action with a real host compile action and continue
|
||||
// processing it.
|
||||
A = *CHA->begin();
|
||||
}
|
||||
|
||||
if (const InputAction *IA = dyn_cast<InputAction>(A)) {
|
||||
// FIXME: It would be nice to not claim this here; maybe the old scheme of
|
||||
// just using Args was better?
|
||||
|
@ -1635,11 +1783,24 @@ void Driver::BuildJobsForAction(Compilation &C, const Action *A,
|
|||
else
|
||||
TC = &C.getDefaultToolChain();
|
||||
|
||||
BuildJobsForAction(C, *BAA->begin(), TC, BAA->getArchName(), AtTopLevel,
|
||||
BuildJobsForAction(C, *BAA->begin(), TC, ArchName, AtTopLevel,
|
||||
MultipleArchs, LinkingOutput, Result);
|
||||
return;
|
||||
}
|
||||
|
||||
if (const CudaDeviceAction *CDA = dyn_cast<CudaDeviceAction>(A)) {
|
||||
// Figure out which NVPTX triple to use for device-side compilation based on
|
||||
// whether host is 64-bit.
|
||||
llvm::Triple DeviceTriple(C.getDefaultToolChain().getTriple().isArch64Bit()
|
||||
? "nvptx64-nvidia-cuda"
|
||||
: "nvptx-nvidia-cuda");
|
||||
BuildJobsForAction(C, *CDA->begin(),
|
||||
&getToolChain(C.getArgs(), DeviceTriple),
|
||||
CDA->getGpuArchName(), CDA->isAtTopLevel(),
|
||||
/*MultipleArchs*/ true, LinkingOutput, Result);
|
||||
return;
|
||||
}
|
||||
|
||||
const ActionList *Inputs = &A->getInputs();
|
||||
|
||||
const JobAction *JA = cast<JobAction>(A);
|
||||
|
@ -1671,6 +1832,10 @@ void Driver::BuildJobsForAction(Compilation &C, const Action *A,
|
|||
if (JA->getType() == types::TY_dSYM)
|
||||
BaseInput = InputInfos[0].getFilename();
|
||||
|
||||
// Append outputs of cuda device jobs to the input list
|
||||
if (CudaDeviceInputInfos.size())
|
||||
InputInfos.append(CudaDeviceInputInfos.begin(), CudaDeviceInputInfos.end());
|
||||
|
||||
// Determine the place to write output to, if any.
|
||||
if (JA->getType() == types::TY_Nothing)
|
||||
Result = InputInfo(A->getType(), BaseInput);
|
||||
|
@ -2052,6 +2217,9 @@ const ToolChain &Driver::getToolChain(const ArgList &Args,
|
|||
break;
|
||||
}
|
||||
break;
|
||||
case llvm::Triple::CUDA:
|
||||
TC = new toolchains::CudaToolChain(*this, Target, Args);
|
||||
break;
|
||||
default:
|
||||
// Of these targets, Hexagon is the only one that might have
|
||||
// an OS of Linux, in which case it got handled above already.
|
||||
|
|
|
@ -151,6 +151,8 @@ Tool *ToolChain::getTool(Action::ActionClass AC) const {
|
|||
|
||||
case Action::InputClass:
|
||||
case Action::BindArchClass:
|
||||
case Action::CudaDeviceClass:
|
||||
case Action::CudaHostClass:
|
||||
case Action::LipoJobClass:
|
||||
case Action::DsymutilJobClass:
|
||||
case Action::VerifyDebugInfoJobClass:
|
||||
|
|
|
@ -3652,6 +3652,65 @@ Tool *DragonFly::buildLinker() const {
|
|||
return new tools::dragonfly::Linker(*this);
|
||||
}
|
||||
|
||||
/// Stub for CUDA toolchain. At the moment we don't have assembler or
|
||||
/// linker and need toolchain mainly to propagate device-side options
|
||||
/// to CC1.
|
||||
|
||||
CudaToolChain::CudaToolChain(const Driver &D, const llvm::Triple &Triple,
|
||||
const ArgList &Args)
|
||||
: Linux(D, Triple, Args) {}
|
||||
|
||||
void
|
||||
CudaToolChain::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
|
||||
llvm::opt::ArgStringList &CC1Args) const {
|
||||
Linux::addClangTargetOptions(DriverArgs, CC1Args);
|
||||
CC1Args.push_back("-fcuda-is-device");
|
||||
}
|
||||
|
||||
/// Builds a new derived argument list from \p Args for the given bound
/// architecture.
///
/// Every argument is copied over, except that -Xarch_<arch> arguments are
/// either dropped (when <arch> does not match \p BoundArch) or replaced by the
/// option they carry. When \p BoundArch is non-null, a synthesized
/// -march=<BoundArch> is appended at the end.
///
/// \param BoundArch  architecture name to bind, or nullptr when no
///                   architecture is bound; in that case no -Xarch_ argument
///                   matches and no -march= is added.
/// \returns a newly allocated DerivedArgList (allocated with new here).
llvm::opt::DerivedArgList *
CudaToolChain::TranslateArgs(const llvm::opt::DerivedArgList &Args,
                             const char *BoundArch) const {
  DerivedArgList *DAL = new DerivedArgList(Args.getBaseArgs());
  const OptTable &Opts = getDriver().getOpts();

  for (Arg *A : Args) {
    if (A->getOption().matches(options::OPT_Xarch__)) {
      // Skip this argument unless the architecture matches BoundArch. The
      // null check comes first so that a StringRef is never built from a
      // null pointer.
      if (!BoundArch || A->getValue(0) != StringRef(BoundArch))
        continue;

      unsigned Index = Args.getBaseArgs().MakeIndex(A->getValue(1));
      unsigned Prev = Index;
      std::unique_ptr<Arg> XarchArg(Opts.ParseOneArg(Args, Index));

      // If the argument parsing failed or more than one argument was
      // consumed, the -Xarch_ argument's parameter tried to consume
      // extra arguments. Emit an error and ignore.
      //
      // We also want to disallow any options which would alter the
      // driver behavior; that isn't going to work in our model. We
      // use isDriverOption() as an approximation, although things
      // like -O4 are going to slip through.
      if (!XarchArg || Index > Prev + 1) {
        getDriver().Diag(diag::err_drv_invalid_Xarch_argument_with_args)
            << A->getAsString(Args);
        continue;
      } else if (XarchArg->getOption().hasFlag(options::DriverOption)) {
        getDriver().Diag(diag::err_drv_invalid_Xarch_argument_isdriver)
            << A->getAsString(Args);
        continue;
      }
      XarchArg->setBaseArg(A);
      A = XarchArg.release();
      DAL->AddSynthesizedArg(A);
    }
    DAL->append(A);
  }

  // Only synthesize -march= when there actually is an architecture to bind.
  if (BoundArch)
    DAL->AddJoinedArg(nullptr, Opts.getOption(options::OPT_march_EQ),
                      BoundArch);
  return DAL;
}
|
||||
|
||||
/// XCore tool chain
|
||||
XCore::XCore(const Driver &D, const llvm::Triple &Triple, const ArgList &Args)
|
||||
: ToolChain(D, Triple, Args) {
|
||||
|
|
|
@ -699,6 +699,18 @@ private:
|
|||
std::string computeSysRoot() const;
|
||||
};
|
||||
|
||||
/// Toolchain for CUDA targets, derived from the Linux toolchain. It overrides
/// argument translation and the target options handed to CC1.
class LLVM_LIBRARY_VISIBILITY CudaToolChain : public Linux {
public:
  CudaToolChain(const Driver &D, const llvm::Triple &Triple,
                const llvm::opt::ArgList &Args);

  /// Adds the target options for this toolchain to \p CC1Args.
  void addClangTargetOptions(const llvm::opt::ArgList &DriverArgs,
                             llvm::opt::ArgStringList &CC1Args) const override;

  /// Produces the derived argument list for \p BoundArch.
  llvm::opt::DerivedArgList *
  TranslateArgs(const llvm::opt::DerivedArgList &Args,
                const char *BoundArch) const override;
};
|
||||
|
||||
class LLVM_LIBRARY_VISIBILITY Hexagon_TC : public Linux {
|
||||
protected:
|
||||
GCCVersion GCCLibAndIncVersion;
|
||||
|
|
|
@ -1488,6 +1488,12 @@ static std::string getCPUName(const ArgList &Args, const llvm::Triple &T) {
|
|||
return CPUName;
|
||||
}
|
||||
|
||||
case llvm::Triple::nvptx:
|
||||
case llvm::Triple::nvptx64:
|
||||
if (const Arg *A = Args.getLastArg(options::OPT_march_EQ))
|
||||
return A->getValue();
|
||||
return "";
|
||||
|
||||
case llvm::Triple::ppc:
|
||||
case llvm::Triple::ppc64:
|
||||
case llvm::Triple::ppc64le: {
|
||||
|
@ -2826,8 +2832,14 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
|
|||
getToolChain().getTriple().isWindowsCygwinEnvironment();
|
||||
bool IsWindowsMSVC = getToolChain().getTriple().isWindowsMSVCEnvironment();
|
||||
|
||||
assert(Inputs.size() == 1 && "Unable to handle multiple inputs.");
|
||||
// Check number of inputs for sanity. We need at least one input.
|
||||
assert(Inputs.size() >= 1 && "Must have at least one input.");
|
||||
const InputInfo &Input = Inputs[0];
|
||||
// CUDA compilation may have multiple inputs (source file + results of
|
||||
// device-side compilations). All other jobs are expected to have exactly one
|
||||
// input.
|
||||
bool IsCuda = types::isCuda(Input.getType());
|
||||
assert((IsCuda || Inputs.size() == 1) && "Unable to handle multiple inputs.");
|
||||
|
||||
// Invoke ourselves in -cc1 mode.
|
||||
//
|
||||
|
@ -4812,14 +4824,12 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
|
|||
assert(Output.isNothing() && "Invalid output.");
|
||||
}
|
||||
|
||||
for (const auto &II : Inputs) {
|
||||
addDashXForInput(Args, II, CmdArgs);
|
||||
addDashXForInput(Args, Input, CmdArgs);
|
||||
|
||||
if (II.isFilename())
|
||||
CmdArgs.push_back(II.getFilename());
|
||||
else
|
||||
II.getInputArg().renderAsInput(Args, CmdArgs);
|
||||
}
|
||||
if (Input.isFilename())
|
||||
CmdArgs.push_back(Input.getFilename());
|
||||
else
|
||||
Input.getInputArg().renderAsInput(Args, CmdArgs);
|
||||
|
||||
Args.AddAllArgs(CmdArgs, options::OPT_undef);
|
||||
|
||||
|
@ -4857,6 +4867,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
|
|||
CmdArgs.push_back(SplitDwarfOut);
|
||||
}
|
||||
|
||||
// Host-side cuda compilation receives device-side outputs as Inputs[1...].
|
||||
// Include them with -fcuda-include-gpubinary.
|
||||
if (IsCuda && Inputs.size() > 1)
|
||||
for (InputInfoList::const_iterator it = std::next(Inputs.begin()),
|
||||
ie = Inputs.end();
|
||||
it != ie; ++it) {
|
||||
CmdArgs.push_back("-fcuda-include-gpubinary");
|
||||
CmdArgs.push_back(it->getFilename());
|
||||
}
|
||||
|
||||
// Finally add the compile command to the compilation.
|
||||
if (Args.hasArg(options::OPT__SLASH_fallback) &&
|
||||
Output.getType() == types::TY_Object &&
|
||||
|
|
|
@ -86,6 +86,7 @@ bool types::isAcceptedByClang(ID Id) {
|
|||
case TY_C: case TY_PP_C:
|
||||
case TY_CL:
|
||||
case TY_CUDA: case TY_PP_CUDA:
|
||||
case TY_CUDA_DEVICE:
|
||||
case TY_ObjC: case TY_PP_ObjC: case TY_PP_ObjC_Alias:
|
||||
case TY_CXX: case TY_PP_CXX:
|
||||
case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
|
||||
|
@ -122,7 +123,19 @@ bool types::isCXX(ID Id) {
|
|||
case TY_ObjCXX: case TY_PP_ObjCXX: case TY_PP_ObjCXX_Alias:
|
||||
case TY_CXXHeader: case TY_PP_CXXHeader:
|
||||
case TY_ObjCXXHeader: case TY_PP_ObjCXXHeader:
|
||||
case TY_CUDA: case TY_PP_CUDA:
|
||||
case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true for the three CUDA input types (TY_CUDA, TY_PP_CUDA and
/// TY_CUDA_DEVICE) and false for every other type.
bool types::isCuda(ID Id) {
  return Id == TY_CUDA || Id == TY_PP_CUDA || Id == TY_CUDA_DEVICE;
}
|
||||
|
@ -206,10 +219,12 @@ void types::getCompilationPhases(ID Id, llvm::SmallVectorImpl<phases::ID> &P) {
|
|||
P.push_back(phases::Compile);
|
||||
P.push_back(phases::Backend);
|
||||
}
|
||||
P.push_back(phases::Assemble);
|
||||
if (Id != TY_CUDA_DEVICE)
|
||||
P.push_back(phases::Assemble);
|
||||
}
|
||||
}
|
||||
if (!onlyPrecompileType(Id)) {
|
||||
|
||||
if (!onlyPrecompileType(Id) && Id != TY_CUDA_DEVICE) {
|
||||
P.push_back(phases::Link);
|
||||
}
|
||||
assert(0 < P.size() && "Not enough phases in list");
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include "clang/Basic/DiagnosticOptions.h"
|
||||
#include "clang/Driver/Compilation.h"
|
||||
#include "clang/Driver/Driver.h"
|
||||
#include "clang/Driver/Action.h"
|
||||
#include "clang/Driver/Options.h"
|
||||
#include "clang/Driver/Tool.h"
|
||||
#include "clang/Frontend/CompilerInstance.h"
|
||||
|
@ -61,9 +62,25 @@ clang::createInvocationFromCommandLine(ArrayRef<const char *> ArgList,
|
|||
}
|
||||
|
||||
// We expect to get back exactly one command job, if we didn't something
|
||||
// failed.
|
||||
// failed. CUDA compilation is an exception as it creates multiple jobs. If
|
||||
// that's the case, we proceed with the first job. If caller needs particular
|
||||
// CUDA job, it should be controlled via --cuda-{host|device}-only option
|
||||
// passed to the driver.
|
||||
const driver::JobList &Jobs = C->getJobs();
|
||||
if (Jobs.size() != 1 || !isa<driver::Command>(*Jobs.begin())) {
|
||||
bool CudaCompilation = false;
|
||||
if (Jobs.size() > 1) {
|
||||
for (auto &A : C->getActions()){
|
||||
// On MacOSX real actions may end up being wrapped in BindArchAction
|
||||
if (isa<driver::BindArchAction>(A))
|
||||
A = *A->begin();
|
||||
if (isa<driver::CudaDeviceAction>(A)) {
|
||||
CudaCompilation = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (Jobs.size() == 0 || !isa<driver::Command>(*Jobs.begin()) ||
|
||||
(Jobs.size() > 1 && !CudaCompilation)) {
|
||||
SmallString<256> Msg;
|
||||
llvm::raw_svector_ostream OS(Msg);
|
||||
Jobs.Print(OS, "; ", true);
|
||||
|
|
|
@ -0,0 +1,109 @@
|
|||
// Tests CUDA compilation pipeline construction in Driver.
|
||||
// REQUIRES: clang-driver
|
||||
|
||||
// Simple compilation case:
|
||||
// RUN: %clang -### -c %s 2>&1 \
|
||||
// Compile device-side to PTX assembly and make sure we use it on the host side.
|
||||
// RUN: | FileCheck -check-prefix CUDA-D1 \
|
||||
// Then compile host side and incorporate device code.
|
||||
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
|
||||
// Make sure we don't link anything.
|
||||
// RUN: -check-prefix CUDA-NL %s
|
||||
|
||||
// Typical compilation + link case:
|
||||
// RUN: %clang -### %s 2>&1 \
|
||||
// Compile device-side to PTX assembly and make sure we use it on the host side
|
||||
// RUN: | FileCheck -check-prefix CUDA-D1 \
|
||||
// Then compile host side and incorporate device code.
|
||||
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
|
||||
// Then link things.
|
||||
// RUN: -check-prefix CUDA-L %s
|
||||
|
||||
// Verify that --cuda-host-only disables device-side compilation and linking
|
||||
// RUN: %clang -### --cuda-host-only %s 2>&1 \
|
||||
// Make sure we didn't run device-side compilation.
|
||||
// RUN: | FileCheck -check-prefix CUDA-ND \
|
||||
// Then compile host side and make sure we don't attempt to incorporate GPU code.
|
||||
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-NI \
|
||||
// Make sure we don't link anything.
|
||||
// RUN: -check-prefix CUDA-NL %s
|
||||
|
||||
// Verify that --cuda-device-only disables host-side compilation and linking
|
||||
// RUN: %clang -### --cuda-device-only %s 2>&1 \
|
||||
// Compile device-side to PTX assembly
|
||||
// RUN: | FileCheck -check-prefix CUDA-D1 \
|
||||
// Make sure there is no host compilation or linking.
|
||||
// RUN: -check-prefix CUDA-NH -check-prefix CUDA-NL %s
|
||||
|
||||
// Verify that with -S we compile host and device sides to assembly
|
||||
// and incorporate device code on the host side.
|
||||
// RUN: %clang -### -S -c %s 2>&1 \
|
||||
// Compile device-side to PTX assembly
|
||||
// RUN: | FileCheck -check-prefix CUDA-D1 \
|
||||
// Then compile host side and incorporate GPU code.
|
||||
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
|
||||
// Make sure we don't link anything.
|
||||
// RUN: -check-prefix CUDA-NL %s
|
||||
|
||||
// Verify that --cuda-gpu-arch option passes correct GPU
|
||||
// architecture info to device compilation.
|
||||
// RUN: %clang -### --cuda-gpu-arch=sm_35 -c %s 2>&1 \
|
||||
// Compile device-side to PTX assembly.
|
||||
// RUN: | FileCheck -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
|
||||
// Then compile host side and incorporate GPU code.
|
||||
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 \
|
||||
// Make sure we don't link anything.
|
||||
// RUN: -check-prefix CUDA-NL %s
|
||||
|
||||
// Verify that there is device-side compilation per --cuda-gpu-arch args
|
||||
// and that all results are included on the host side.
|
||||
// RUN: %clang -### --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \
|
||||
// Compile both device-sides to PTX assembly
|
||||
// RUN: | FileCheck \
|
||||
// RUN: -check-prefix CUDA-D1 -check-prefix CUDA-D1-SM35 \
|
||||
// RUN: -check-prefix CUDA-D2 -check-prefix CUDA-D2-SM30 \
|
||||
// Then compile host side and incorporate both device-side outputs
|
||||
// RUN: -check-prefix CUDA-H -check-prefix CUDA-H-I1 -check-prefix CUDA-H-I2 \
|
||||
// Make sure we don't link anything.
|
||||
// RUN: -check-prefix CUDA-NL %s
|
||||
|
||||
// Match device-side compilation
|
||||
// CUDA-D1: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
|
||||
// CUDA-D1-SAME: "-fcuda-is-device"
|
||||
// CUDA-D1-SM35-SAME: "-target-cpu" "sm_35"
|
||||
// CUDA-D1-SAME: "-o" "[[GPUBINARY1:[^"]*]]"
|
||||
// CUDA-D1-SAME: "-x" "cuda"
|
||||
|
||||
// Match another device-side compilation
|
||||
// CUDA-D2: "-cc1" "-triple" "nvptx{{(64)?}}-nvidia-cuda"
|
||||
// CUDA-D2-SAME: "-fcuda-is-device"
|
||||
// CUDA-D2-SM30-SAME: "-target-cpu" "sm_30"
|
||||
// CUDA-D2-SAME: "-o" "[[GPUBINARY2:[^"]*]]"
|
||||
// CUDA-D2-SAME: "-x" "cuda"
|
||||
|
||||
// Match no device-side compilation
|
||||
// CUDA-ND-NOT: "-cc1" "-triple" "nvptx{{64?}}-nvidia-cuda"
|
||||
// CUDA-ND-SAME-NOT: "-fcuda-is-device"
|
||||
|
||||
// Match host-side compilation
|
||||
// CUDA-H: "-cc1" "-triple"
|
||||
// CUDA-H-SAME-NOT: "nvptx{{64?}}-nvidia-cuda"
|
||||
// CUDA-H-SAME-NOT: "-fcuda-is-device"
|
||||
// CUDA-H-SAME: "-o" "[[HOSTOBJ:[^"]*]]"
|
||||
// CUDA-H-SAME: "-x" "cuda"
|
||||
// CUDA-H-I1-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY1]]"
|
||||
// CUDA-H-I2-SAME: "-fcuda-include-gpubinary" "[[GPUBINARY2]]"
|
||||
|
||||
// Match no GPU code inclusion.
|
||||
// CUDA-H-NI-NOT: "-fcuda-include-gpubinary"
|
||||
|
||||
// Match no CUDA compilation
|
||||
// CUDA-NH-NOT: "-cc1" "-triple"
|
||||
// CUDA-NH-SAME-NOT: "-x" "cuda"
|
||||
|
||||
// Match linker
|
||||
// CUDA-L: "{{.*}}ld{{(.exe)?}}"
|
||||
// CUDA-L-SAME: "[[HOSTOBJ]]"
|
||||
|
||||
// Match no linker
|
||||
// CUDA-NL-NOT: "{{.*}}ld{{(.exe)?}}"
|
|
@ -1,4 +1,6 @@
|
|||
// RUN: c-index-test -test-load-source all -x cuda %s | FileCheck %s
|
||||
// RUN: c-index-test -test-load-source all -x cuda --cuda-host-only %s | FileCheck %s
|
||||
// RUN: c-index-test -test-load-source all -x cuda --cuda-device-only %s | FileCheck %s
|
||||
|
||||
__attribute__((device)) void f_device();
|
||||
__attribute__((global)) void f_global();
|
||||
|
@ -6,13 +8,13 @@ __attribute__((constant)) int* g_constant;
|
|||
__attribute__((shared)) float *g_shared;
|
||||
__attribute__((host)) void f_host();
|
||||
|
||||
// CHECK: attributes-cuda.cu:3:30: FunctionDecl=f_device:3:30
|
||||
// CHECK-NEXT: attributes-cuda.cu:3:16: attribute(device)
|
||||
// CHECK: attributes-cuda.cu:4:30: FunctionDecl=f_global:4:30
|
||||
// CHECK-NEXT: attributes-cuda.cu:4:16: attribute(global)
|
||||
// CHECK: attributes-cuda.cu:5:32: VarDecl=g_constant:5:32 (Definition)
|
||||
// CHECK-NEXT: attributes-cuda.cu:5:16: attribute(constant)
|
||||
// CHECK: attributes-cuda.cu:6:32: VarDecl=g_shared:6:32 (Definition)
|
||||
// CHECK-NEXT: attributes-cuda.cu:6:16: attribute(shared)
|
||||
// CHECK: attributes-cuda.cu:7:28: FunctionDecl=f_host:7:28
|
||||
// CHECK-NEXT: attributes-cuda.cu:7:16: attribute(host)
|
||||
// CHECK: attributes-cuda.cu:5:30: FunctionDecl=f_device:5:30
|
||||
// CHECK-NEXT: attributes-cuda.cu:5:16: attribute(device)
|
||||
// CHECK: attributes-cuda.cu:6:30: FunctionDecl=f_global:6:30
|
||||
// CHECK-NEXT: attributes-cuda.cu:6:16: attribute(global)
|
||||
// CHECK: attributes-cuda.cu:7:32: VarDecl=g_constant:7:32 (Definition)
|
||||
// CHECK-NEXT: attributes-cuda.cu:7:16: attribute(constant)
|
||||
// CHECK: attributes-cuda.cu:8:32: VarDecl=g_shared:8:32 (Definition)
|
||||
// CHECK-NEXT: attributes-cuda.cu:8:16: attribute(shared)
|
||||
// CHECK: attributes-cuda.cu:9:28: FunctionDecl=f_host:9:28
|
||||
// CHECK-NEXT: attributes-cuda.cu:9:16: attribute(host)
|
||||
|
|
|
@ -0,0 +1,9 @@
|
|||
// Make sure we can process CUDA file even if driver creates multiple jobs
|
||||
// RUN: c-index-test -test-load-source all %s | FileCheck %s -check-prefix=CHECK-ANY
|
||||
// Make sure we process correct side of cuda compilation
|
||||
// RUN: c-index-test -test-load-source all --cuda-host-only %s | FileCheck %s -check-prefix=CHECK-HOST
|
||||
// RUN: c-index-test -test-load-source all --cuda-device-only %s | FileCheck %s -check-prefix=CHECK-DEVICE
|
||||
|
||||
// CHECK-ANY: macro definition=__cplusplus
|
||||
// CHECK-HOST-NOT: macro definition=__CUDA_ARCH__
|
||||
// CHECK-DEVICE: macro definition=__CUDA_ARCH__
|
|
@ -3102,6 +3102,12 @@ static void clang_parseTranslationUnit_Impl(void *UserData) {
|
|||
/*AllowPCHWithCompilerErrors=*/true, SkipFunctionBodies,
|
||||
/*UserFilesAreVolatile=*/true, ForSerialization, &ErrUnit));
|
||||
|
||||
// Early failures in LoadFromCommandLine may return with ErrUnit unset.
|
||||
if (!Unit && !ErrUnit) {
|
||||
PTUI->result = CXError_ASTReadError;
|
||||
return;
|
||||
}
|
||||
|
||||
if (NumErrors != Diags->getClient()->getNumErrors()) {
|
||||
// Make sure to check that 'Unit' is non-NULL.
|
||||
if (CXXIdx->getDisplayDiagnostics())
|
||||
|
|
|
@ -164,6 +164,7 @@ testing::AssertionResult matchesConditionallyWithCuda(
|
|||
std::vector<std::string> Args;
|
||||
Args.push_back("-xcuda");
|
||||
Args.push_back("-fno-ms-extensions");
|
||||
Args.push_back("--cuda-host-only");
|
||||
Args.push_back(CompileArg);
|
||||
if (!runToolOnCodeWithArgs(Factory->create(),
|
||||
CudaHeader + Code, Args)) {
|
||||
|
|
Loading…
Reference in New Issue