diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index b29ec84128ba..75816e9a2671 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -170,7 +170,6 @@ LANGOPT(CUDAIsDevice      , 1, 0, "Compiling for CUDA device")
 LANGOPT(CUDAAllowHostCallsFromHostDevice, 1, 0, "Allow host device functions to call host functions")
 LANGOPT(CUDADisableTargetCallChecks, 1, 0, "Disable checks for call targets (host, device, etc.)")
 LANGOPT(CUDATargetOverloads, 1, 0, "Enable function overloads based on CUDA target attributes")
-LANGOPT(CUDAUsesLibDevice , 1, 0, "Selectively link and internalize bitcode.")
 
 LANGOPT(AssumeSaneOperatorNew , 1, 1, "implicit __attribute__((malloc)) for C++'s new operators")
 LANGOPT(SizedDeallocation , 1, 0, "enable sized deallocation functions")
diff --git a/clang/include/clang/CodeGen/CodeGenAction.h b/clang/include/clang/CodeGen/CodeGenAction.h
index 264780d01ca9..cc38e243420b 100644
--- a/clang/include/clang/CodeGen/CodeGenAction.h
+++ b/clang/include/clang/CodeGen/CodeGenAction.h
@@ -25,7 +25,9 @@ class CodeGenAction : public ASTFrontendAction {
 private:
   unsigned Act;
   std::unique_ptr<llvm::Module> TheModule;
-  llvm::Module *LinkModule;
+  // Vector of {Linker::Flags, Module*} pairs to specify bitcode
+  // modules to link in using corresponding linker flags.
+  SmallVector<std::pair<unsigned, llvm::Module *>, 4> LinkModules;
   llvm::LLVMContext *VMContext;
   bool OwnsVMContext;
 
@@ -50,7 +52,9 @@ public:
   /// setLinkModule - Set the link module to be used by this action.  If a link
   /// module is not provided, and CodeGenOptions::LinkBitcodeFile is non-empty,
   /// the action will load it from the specified file.
-  void setLinkModule(llvm::Module *Mod) { LinkModule = Mod; }
+  void addLinkModule(llvm::Module *Mod, unsigned LinkFlags) {
+    LinkModules.push_back(std::make_pair(LinkFlags, Mod));
+  }
 
   /// Take the generated LLVM module, for use after the action has been run.
   /// The result may be null on failure.
diff --git a/clang/include/clang/Driver/CC1Options.td b/clang/include/clang/Driver/CC1Options.td
index e643db648dd8..7a1a9edf2894 100644
--- a/clang/include/clang/Driver/CC1Options.td
+++ b/clang/include/clang/Driver/CC1Options.td
@@ -240,6 +240,9 @@ def mconstructor_aliases : Flag<["-"], "mconstructor-aliases">,
   HelpText<"Emit complete constructors and destructors as aliases when possible">;
 def mlink_bitcode_file : Separate<["-"], "mlink-bitcode-file">,
   HelpText<"Link the given bitcode file before performing optimizations.">;
+def mlink_cuda_bitcode : Separate<["-"], "mlink-cuda-bitcode">,
+  HelpText<"Link and internalize needed symbols from the given bitcode file "
+           "before performing optimizations.">;
 def vectorize_loops : Flag<["-"], "vectorize-loops">,
   HelpText<"Run the Loop vectorization passes">;
 def vectorize_slp : Flag<["-"], "vectorize-slp">,
@@ -671,8 +674,6 @@ def fcuda_include_gpubinary : Separate<["-"], "fcuda-include-gpubinary">,
   HelpText<"Incorporate CUDA device-side binary into host object file.">;
 def fcuda_target_overloads : Flag<["-"], "fcuda-target-overloads">,
   HelpText<"Enable function overloads based on CUDA target attributes.">;
-def fcuda_uses_libdevice : Flag<["-"], "fcuda-uses-libdevice">,
-  HelpText<"Selectively link and internalize bitcode.">;
 
 } // let Flags = [CC1Option]
 
diff --git a/clang/include/clang/Frontend/CodeGenOptions.h b/clang/include/clang/Frontend/CodeGenOptions.h
index c359ed6ccbca..8e8e65f3f7ba 100644
--- a/clang/include/clang/Frontend/CodeGenOptions.h
+++ b/clang/include/clang/Frontend/CodeGenOptions.h
@@ -130,7 +130,7 @@ public:
   std::string LimitFloatPrecision;
 
   /// The name of the bitcode file to link before optzns.
-  std::string LinkBitcodeFile;
+  std::vector<std::pair<unsigned, std::string>> LinkBitcodeFiles;
 
   /// The user provided name for the "main file", if non-empty. This is useful
   /// in situations where the input file name does not match the original input
diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp
index b70a0806c108..10e5cbb0e6f5 100644
--- a/clang/lib/CodeGen/CodeGenAction.cpp
+++ b/clang/lib/CodeGen/CodeGenAction.cpp
@@ -53,29 +53,35 @@ namespace clang {
 
     std::unique_ptr<CodeGenerator> Gen;
 
-    std::unique_ptr<llvm::Module> TheModule, LinkModule;
+    std::unique_ptr<llvm::Module> TheModule;
+    SmallVector<std::pair<unsigned, std::unique_ptr<llvm::Module>>, 4>
+        LinkModules;
 
   public:
-    BackendConsumer(BackendAction Action, DiagnosticsEngine &Diags,
-                    const HeaderSearchOptions &HeaderSearchOpts,
-                    const PreprocessorOptions &PPOpts,
-                    const CodeGenOptions &CodeGenOpts,
-                    const TargetOptions &TargetOpts,
-                    const LangOptions &LangOpts, bool TimePasses,
-                    const std::string &InFile, llvm::Module *LinkModule,
-                    raw_pwrite_stream *OS, LLVMContext &C,
-                    CoverageSourceInfo *CoverageInfo = nullptr)
+    BackendConsumer(
+        BackendAction Action, DiagnosticsEngine &Diags,
+        const HeaderSearchOptions &HeaderSearchOpts,
+        const PreprocessorOptions &PPOpts, const CodeGenOptions &CodeGenOpts,
+        const TargetOptions &TargetOpts, const LangOptions &LangOpts,
+        bool TimePasses, const std::string &InFile,
+        const SmallVectorImpl<std::pair<unsigned, llvm::Module *>> &LinkModules,
+        raw_pwrite_stream *OS, LLVMContext &C,
+        CoverageSourceInfo *CoverageInfo = nullptr)
         : Diags(Diags), Action(Action), CodeGenOpts(CodeGenOpts),
           TargetOpts(TargetOpts), LangOpts(LangOpts), AsmOutStream(OS),
           Context(nullptr), LLVMIRGeneration("LLVM IR Generation Time"),
           Gen(CreateLLVMCodeGen(Diags, InFile, HeaderSearchOpts, PPOpts,
-                                CodeGenOpts, C, CoverageInfo)),
-          LinkModule(LinkModule) {
+                                CodeGenOpts, C, CoverageInfo)) {
       llvm::TimePassesIsEnabled = TimePasses;
+      for (auto &I : LinkModules)
+        this->LinkModules.push_back(
+            std::make_pair(I.first, std::unique_ptr<llvm::Module>(I.second)));
     }
-
     std::unique_ptr<llvm::Module> takeModule() { return std::move(TheModule); }
-    llvm::Module *takeLinkModule() { return LinkModule.release(); }
+    void releaseLinkModules() {
+      for (auto &I : LinkModules)
+        I.second.release();
+    }
 
     void HandleCXXStaticMemberVarInstantiation(VarDecl *VD) override {
       Gen->HandleCXXStaticMemberVarInstantiation(VD);
@@ -156,15 +162,14 @@ namespace clang {
              "Unexpected module change during IR generation");
 
       // Link LinkModule into this module if present, preserving its validity.
-      if (LinkModule) {
-        if (Linker::LinkModules(
-                M, LinkModule.get(),
-                [=](const DiagnosticInfo &DI) { linkerDiagnosticHandler(DI); },
-                (LangOpts.CUDA && LangOpts.CUDAIsDevice &&
-                 LangOpts.CUDAUsesLibDevice)
-                    ? (Linker::Flags::LinkOnlyNeeded |
-                       Linker::Flags::InternalizeLinkedSymbols)
-                    : Linker::Flags::None))
+      for (auto &I : LinkModules) {
+        unsigned LinkFlags = I.first;
+        llvm::Module *LinkModule = I.second.get();
+        if (Linker::LinkModules(M, LinkModule,
+                                [=](const DiagnosticInfo &DI) {
+                                  linkerDiagnosticHandler(DI, LinkModule);
+                                },
+                                LinkFlags))
           return;
       }
 
@@ -228,7 +233,8 @@ namespace clang {
       ((BackendConsumer*)Context)->InlineAsmDiagHandler2(SM, Loc);
     }
 
-    void linkerDiagnosticHandler(const llvm::DiagnosticInfo &DI);
+    void linkerDiagnosticHandler(const llvm::DiagnosticInfo &DI,
+                                 const llvm::Module *LinkModule);
 
     static void DiagnosticHandler(const llvm::DiagnosticInfo &DI,
                                   void *Context) {
@@ -539,7 +545,8 @@ void BackendConsumer::OptimizationFailureHandler(
   EmitOptimizationMessage(D, diag::warn_fe_backend_optimization_failure);
 }
 
-void BackendConsumer::linkerDiagnosticHandler(const DiagnosticInfo &DI) {
+void BackendConsumer::linkerDiagnosticHandler(const DiagnosticInfo &DI,
+                                              const llvm::Module *LinkModule) {
   if (DI.getSeverity() != DS_Error)
     return;
 
@@ -623,9 +630,8 @@ void BackendConsumer::DiagnosticHandlerImpl(const DiagnosticInfo &DI) {
 #undef ComputeDiagID
 
 CodeGenAction::CodeGenAction(unsigned _Act, LLVMContext *_VMContext)
-  : Act(_Act), LinkModule(nullptr),
-    VMContext(_VMContext ? _VMContext : new LLVMContext),
-    OwnsVMContext(!_VMContext) {}
+    : Act(_Act), VMContext(_VMContext ? _VMContext : new LLVMContext),
+      OwnsVMContext(!_VMContext) {}
 
 CodeGenAction::~CodeGenAction() {
   TheModule.reset();
@@ -640,9 +646,9 @@ void CodeGenAction::EndSourceFileAction() {
   if (!getCompilerInstance().hasASTConsumer())
     return;
 
-  // If we were given a link module, release consumer's ownership of it.
-  if (LinkModule)
-    BEConsumer->takeLinkModule();
+  // Take back ownership of link modules we passed to consumer.
+  if (!LinkModules.empty())
+    BEConsumer->releaseLinkModules();
 
   // Steal the module from the consumer.
   TheModule = BEConsumer->takeModule();
@@ -684,28 +690,29 @@ CodeGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) {
   if (BA != Backend_EmitNothing && !OS)
     return nullptr;
 
-  llvm::Module *LinkModuleToUse = LinkModule;
+  // Load bitcode modules to link with, if we need to.
+  if (LinkModules.empty())
+    for (auto &I : CI.getCodeGenOpts().LinkBitcodeFiles) {
+      const std::string &LinkBCFile = I.second;
 
-  // If we were not given a link module, and the user requested that one be
-  // loaded from bitcode, do so now.
-  const std::string &LinkBCFile = CI.getCodeGenOpts().LinkBitcodeFile;
-  if (!LinkModuleToUse && !LinkBCFile.empty()) {
-    auto BCBuf = CI.getFileManager().getBufferForFile(LinkBCFile);
-    if (!BCBuf) {
-      CI.getDiagnostics().Report(diag::err_cannot_open_file)
-          << LinkBCFile << BCBuf.getError().message();
-      return nullptr;
-    }
+      auto BCBuf = CI.getFileManager().getBufferForFile(LinkBCFile);
+      if (!BCBuf) {
+        CI.getDiagnostics().Report(diag::err_cannot_open_file)
+            << LinkBCFile << BCBuf.getError().message();
+        LinkModules.clear();
+        return nullptr;
+      }
 
-    ErrorOr<std::unique_ptr<llvm::Module>> ModuleOrErr =
-        getLazyBitcodeModule(std::move(*BCBuf), *VMContext);
-    if (std::error_code EC = ModuleOrErr.getError()) {
-      CI.getDiagnostics().Report(diag::err_cannot_open_file)
-        << LinkBCFile << EC.message();
-      return nullptr;
+      ErrorOr<std::unique_ptr<llvm::Module>> ModuleOrErr =
+          getLazyBitcodeModule(std::move(*BCBuf), *VMContext);
+      if (std::error_code EC = ModuleOrErr.getError()) {
+        CI.getDiagnostics().Report(diag::err_cannot_open_file) << LinkBCFile
+                                                               << EC.message();
+        LinkModules.clear();
+        return nullptr;
+      }
+      addLinkModule(ModuleOrErr.get().release(), I.first);
     }
-    LinkModuleToUse = ModuleOrErr.get().release();
-  }
 
   CoverageSourceInfo *CoverageInfo = nullptr;
   // Add the preprocessor callback only when the coverage mapping is generated.
@@ -714,11 +721,12 @@ CodeGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) {
     CI.getPreprocessor().addPPCallbacks(
                                     std::unique_ptr<PPCallbacks>(CoverageInfo));
   }
+
   std::unique_ptr<BackendConsumer> Result(new BackendConsumer(
       BA, CI.getDiagnostics(), CI.getHeaderSearchOpts(),
       CI.getPreprocessorOpts(), CI.getCodeGenOpts(), CI.getTargetOpts(),
-      CI.getLangOpts(), CI.getFrontendOpts().ShowTimers, InFile,
-      LinkModuleToUse, OS, *VMContext, CoverageInfo));
+      CI.getLangOpts(), CI.getFrontendOpts().ShowTimers, InFile, LinkModules,
+      OS, *VMContext, CoverageInfo));
   BEConsumer = Result.get();
   return std::move(Result);
 }
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 99963bab09cb..0c1c4eaefa99 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Linker/Linker.h"
 #include "llvm/Option/Arg.h"
 #include "llvm/Option/ArgList.h"
 #include "llvm/Option/OptTable.h"
@@ -539,7 +540,13 @@ static bool ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, InputKind IK,
   Opts.EmitOpenCLArgMetadata = Args.hasArg(OPT_cl_kernel_arg_info);
   Opts.CompressDebugSections = Args.hasArg(OPT_compress_debug_sections);
   Opts.DebugCompilationDir = Args.getLastArgValue(OPT_fdebug_compilation_dir);
-  Opts.LinkBitcodeFile = Args.getLastArgValue(OPT_mlink_bitcode_file);
+  for (auto A : Args.filtered(OPT_mlink_bitcode_file, OPT_mlink_cuda_bitcode)) {
+    unsigned LinkFlags = llvm::Linker::Flags::None;
+    if (A->getOption().matches(OPT_mlink_cuda_bitcode))
+      LinkFlags = llvm::Linker::Flags::LinkOnlyNeeded |
+                  llvm::Linker::Flags::InternalizeLinkedSymbols;
+    Opts.LinkBitcodeFiles.push_back(std::make_pair(LinkFlags, A->getValue()));
+  }
   Opts.SanitizeCoverageType =
       getLastArgIntValue(Args, OPT_fsanitize_coverage_type, 0, Diags);
   Opts.SanitizeCoverageIndirectCalls =
@@ -1394,9 +1401,6 @@ static void ParseLangArgs(LangOptions &Opts, ArgList &Args, InputKind IK,
   if (Args.hasArg(OPT_fcuda_is_device))
     Opts.CUDAIsDevice = 1;
 
-  if (Args.hasArg(OPT_fcuda_uses_libdevice))
-    Opts.CUDAUsesLibDevice = 1;
-
   if (Args.hasArg(OPT_fcuda_allow_host_calls_from_host_device))
     Opts.CUDAAllowHostCallsFromHostDevice = 1;
 
diff --git a/clang/test/CodeGen/link-bitcode-file.c b/clang/test/CodeGen/link-bitcode-file.c
index 92b1a88ffb2d..7810fe1d2941 100644
--- a/clang/test/CodeGen/link-bitcode-file.c
+++ b/clang/test/CodeGen/link-bitcode-file.c
@@ -1,6 +1,12 @@
 // RUN: %clang_cc1 -triple i386-pc-linux-gnu -DBITCODE -emit-llvm-bc -o %t.bc %s
-// RUN: %clang_cc1 -triple i386-pc-linux-gnu -mlink-bitcode-file %t.bc -O3 -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-NO-BC %s
-// RUN: not %clang_cc1 -triple i386-pc-linux-gnu -DBITCODE -mlink-bitcode-file %t.bc -O3 -emit-llvm -o - %s 2>&1 | FileCheck -check-prefix=CHECK-BC %s
+// RUN: %clang_cc1 -triple i386-pc-linux-gnu -DBITCODE2 -emit-llvm-bc -o %t-2.bc %s
+// RUN: %clang_cc1 -triple i386-pc-linux-gnu -mlink-bitcode-file %t.bc \
+// RUN:     -O3 -emit-llvm -o - %s | FileCheck -check-prefix=CHECK-NO-BC %s
+// RUN: %clang_cc1 -triple i386-pc-linux-gnu -O3 -emit-llvm -o - \
+// RUN:     -mlink-bitcode-file %t.bc -mlink-bitcode-file %t-2.bc %s \
+// RUN:     | FileCheck -check-prefix=CHECK-NO-BC -check-prefix=CHECK-NO-BC2 %s
+// RUN: not %clang_cc1 -triple i386-pc-linux-gnu -DBITCODE -O3 -emit-llvm -o - \
+// RUN:     -mlink-bitcode-file %t.bc %s 2>&1 | FileCheck -check-prefix=CHECK-BC %s
 // Make sure we deal with failure to load the file.
 // RUN: not %clang_cc1 -triple i386-pc-linux-gnu -mlink-bitcode-file no-such-file.bc \
 // RUN:    -emit-llvm -o - %s 2>&1 | FileCheck -check-prefix=CHECK-NO-FILE %s
@@ -9,11 +15,15 @@ int f(void);
 
 #ifdef BITCODE
 
+extern int f2(void);
 // CHECK-BC: fatal error: cannot link module {{.*}}'f': symbol multiply defined
 int f(void) {
+  f2();
   return 42;
 }
 
+#elif BITCODE2
+int f2(void) { return 43; }
 #else
 
 // CHECK-NO-BC-LABEL: define i32 @g
@@ -23,6 +33,7 @@ int g(void) {
 }
 
 // CHECK-NO-BC-LABEL: define i32 @f
+// CHECK-NO-BC2-LABEL: define i32 @f2
 
 #endif
 
diff --git a/clang/test/CodeGenCUDA/Inputs/device-code-2.ll b/clang/test/CodeGenCUDA/Inputs/device-code-2.ll
new file mode 100644
index 000000000000..8fde3b13ec79
--- /dev/null
+++ b/clang/test/CodeGenCUDA/Inputs/device-code-2.ll
@@ -0,0 +1,16 @@
+; Simple bit of IR to mimic CUDA's libdevice.
+
+target triple = "nvptx-unknown-cuda"
+
+define double @__nv_sin(double %a) {
+       ret double 1.0
+}
+
+define double @__nv_exp(double %a) {
+       ret double 3.0
+}
+
+define double @__unused(double %a) {
+       ret double 2.0
+}
+
diff --git a/clang/test/CodeGenCUDA/link-device-bitcode.cu b/clang/test/CodeGenCUDA/link-device-bitcode.cu
index 45e5bcff995f..de3d39c20b49 100644
--- a/clang/test/CodeGenCUDA/link-device-bitcode.cu
+++ b/clang/test/CodeGenCUDA/link-device-bitcode.cu
@@ -6,13 +6,21 @@
 // Prepare bitcode file to link with
 // RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t.bc \
 // RUN:    %S/Inputs/device-code.ll
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -emit-llvm-bc -o %t-2.bc \
+// RUN:    %S/Inputs/device-code-2.ll
 //
 // Make sure function in device-code gets linked in and internalized.
 // RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
-// RUN:    -mlink-bitcode-file %t.bc -fcuda-uses-libdevice -emit-llvm \
+// RUN:    -mlink-cuda-bitcode %t.bc  -emit-llvm \
 // RUN:    -disable-llvm-passes -o - %s \
 // RUN:    | FileCheck %s -check-prefix CHECK-IR
 //
+// Make sure we can link two bitcode files.
+// RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
+// RUN:    -mlink-cuda-bitcode %t.bc -mlink-cuda-bitcode %t-2.bc \
+// RUN:    -emit-llvm -disable-llvm-passes -o - %s \
+// RUN:    | FileCheck %s -check-prefix CHECK-IR -check-prefix CHECK-IR-2
+//
 // Make sure function in device-code gets linked but is not internalized
 // without -fcuda-uses-libdevice
 // RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
@@ -22,7 +30,7 @@
 //
 // Make sure NVVMReflect pass is enabled in NVPTX back-end.
 // RUN: %clang_cc1 -triple nvptx-unknown-cuda -fcuda-is-device \
-// RUN:    -mlink-bitcode-file %t.bc -fcuda-uses-libdevice -S -o /dev/null %s \
+// RUN:    -mlink-cuda-bitcode %t.bc -S -o /dev/null %s \
 // RUN:    -backend-option -debug-pass=Structure 2>&1 \
 // RUN:    | FileCheck %s -check-prefix CHECK-REFLECT
 
@@ -52,5 +60,11 @@ __global__ __attribute__((used)) void kernel(float *out, float *in) {
 // CHECK-IR: call i32 @__nvvm_reflect
 // CHECK-IR: ret float
 
+// Make sure we've linked in and internalized only needed functions
+// from the second bitcode file.
+// CHECK-IR-2-LABEL: define internal double @__nv_sin
+// CHECK-IR-2-LABEL: define internal double @__nv_exp
+// CHECK-IR-2-NOT: double @__unused
+
 // Verify that NVVMReflect pass is among the passes run by NVPTX back-end.
 // CHECK-REFLECT: Replace occurrences of __nvvm_reflect() calls with 0/1