[OpenMP] Update target directive codegen to use 4.5 implicit data mappings.

Summary: This patch implements the 4.5 specification for the implicit data maps. OpenMP 4.5 specification changes the default way data is captured into a target region. All the non-aggregate kinds are passed by value by default. This required activating the capturing by value during SEMA for the target region. All the non-aggregate values that can be encoded in the size of a pointer are properly casted and forwarded to the runtime library. On top of fixing the previous weird behavior for mapping pointers in nested data regions (an explicit map was always required), this also improves performance as the number of allocations/transactions to the device per non-aggregate map are reduced from two to only one - instead of passing a reference and the value, only the value passed. Explicit maps will be added later on once firstprivate, private, and map clauses' SEMA and parsing are available. Reviewers: hfinkel, rjmccall, ABataev Subscribers: cfe-commits, carlo.bertolli Differential Revision: http://reviews.llvm.org/D14940 llvm-svn: 254521
2015-12-02 17:44:43 +00:00 · 2015-12-02 17:44:43 +00:00 · 4af1b7b693
parent 32beedc982
commit 4af1b7b693
12 changed files with 1635 additions and 265 deletions
--- a/clang/include/clang/AST/Stmt.h
+++ b/clang/include/clang/AST/Stmt.h
@ -1986,6 +1986,7 @@ public:
  enum VariableCaptureKind {
    VCK_This,
    VCK_ByRef,
+    VCK_ByCopy,
    VCK_VLAType,
  };

@ -2005,21 +2006,7 @@ public:
    /// \param Var The variable being captured, or null if capturing this.
    ///
    Capture(SourceLocation Loc, VariableCaptureKind Kind,
-            VarDecl *Var = nullptr)
-      : VarAndKind(Var, Kind), Loc(Loc) {
-      switch (Kind) {
-      case VCK_This:
-        assert(!Var && "'this' capture cannot have a variable!");
-        break;
-      case VCK_ByRef:
-        assert(Var && "capturing by reference must have a variable!");
-        break;
-      case VCK_VLAType:
-        assert(!Var &&
-               "Variable-length array type capture cannot have a variable!");
-        break;
-      }
-    }
+            VarDecl *Var = nullptr);

    /// \brief Determine the kind of capture.
    VariableCaptureKind getCaptureKind() const { return VarAndKind.getInt(); }
@ -2031,9 +2018,14 @@ public:
    /// \brief Determine whether this capture handles the C++ 'this' pointer.
    bool capturesThis() const { return getCaptureKind() == VCK_This; }

-    /// \brief Determine whether this capture handles a variable.
+    /// \brief Determine whether this capture handles a variable (by reference).
    bool capturesVariable() const { return getCaptureKind() == VCK_ByRef; }

+    /// \brief Determine whether this capture handles a variable by copy.
+    bool capturesVariableByCopy() const {
+      return getCaptureKind() == VCK_ByCopy;
+    }
+
    /// \brief Determine whether this capture handles a variable-length array
    /// type.
    bool capturesVariableArrayType() const {
@ -2044,7 +2036,7 @@ public:
    ///
    /// This operation is only valid if this capture captures a variable.
    VarDecl *getCapturedVar() const {
-      assert(capturesVariable() &&
+      assert((capturesVariable() || capturesVariableByCopy()) &&
             "No variable available for 'this' or VAT capture");
      return VarAndKind.getPointer();
    }
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@ -7751,6 +7751,12 @@ private:
  ExprResult VerifyPositiveIntegerConstantInClause(Expr *Op,
                                                   OpenMPClauseKind CKind);
 public:
+  /// \brief Return true if the provided declaration \a VD should be captured by
+  /// reference in the provided scope \a RSI. This will take into account the
+  /// semantics of the directive and associated clauses.
+  bool IsOpenMPCapturedByRef(VarDecl *VD,
+                             const sema::CapturedRegionScopeInfo *RSI);
+
  /// \brief Check if the specified variable is used in one of the private
  /// clauses (private, firstprivate, lastprivate, reduction etc.) in OpenMP
  /// constructs.
--- a/clang/lib/AST/Stmt.cpp
+++ b/clang/lib/AST/Stmt.cpp
@ -945,6 +945,33 @@ SEHFinallyStmt* SEHFinallyStmt::Create(const ASTContext &C, SourceLocation Loc,
  return new(C)SEHFinallyStmt(Loc,Block);
 }

+CapturedStmt::Capture::Capture(SourceLocation Loc, VariableCaptureKind Kind,
+                               VarDecl *Var)
+    : VarAndKind(Var, Kind), Loc(Loc) {
+  switch (Kind) {
+  case VCK_This:
+    assert(!Var && "'this' capture cannot have a variable!");
+    break;
+  case VCK_ByRef:
+    assert(Var && "capturing by reference must have a variable!");
+    break;
+  case VCK_ByCopy:
+    assert(Var && "capturing by copy must have a variable!");
+    assert(
+        (Var->getType()->isScalarType() || (Var->getType()->isReferenceType() &&
+                                            Var->getType()
+                                                ->castAs<ReferenceType>()
+                                                ->getPointeeType()
+                                                ->isScalarType())) &&
+        "captures by copy are expected to have a scalar type!");
+    break;
+  case VCK_VLAType:
+    assert(!Var &&
+           "Variable-length array type capture cannot have a variable!");
+    break;
+  }
+}
+
 CapturedStmt::Capture *CapturedStmt::getStoredCaptures() const {
  unsigned Size = sizeof(CapturedStmt) + sizeof(Stmt *) * (NumCaptures + 1);

--- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp
@ -3180,7 +3180,7 @@ CGOpenMPRuntime::emitTargetOutlinedFunction(const OMPExecutableDirective &D,
  CodeGenFunction CGF(CGM, true);
  CGOpenMPTargetRegionInfo CGInfo(CS, CodeGen);
  CodeGenFunction::CGCapturedStmtRAII CapInfoRAII(CGF, &CGInfo);
-  return CGF.GenerateOpenMPCapturedStmtFunction(CS, /*UseOnlyReferences=*/true);
+  return CGF.GenerateOpenMPCapturedStmtFunction(CS);
 }

 void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
@ -3195,6 +3195,10 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
    OMP_MAP_TO = 0x01,
    /// \brief Allocate memory on the device and move data from device to host.
    OMP_MAP_FROM = 0x02,
+    /// \brief The element passed to the device is a pointer.
+    OMP_MAP_PTR = 0x20,
+    /// \brief Pass the element to the device by value.
+    OMP_MAP_BYCOPY = 0x80,
  };

  enum OpenMPOffloadingReservedDeviceIDs {
@ -3203,6 +3207,8 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
    OMP_DEVICEID_UNDEF = -1,
  };

+  auto &Ctx = CGF.getContext();
+
  // Fill up the arrays with the all the captured variables.
  SmallVector<llvm::Value *, 16> BasePointers;
  SmallVector<llvm::Value *, 16> Pointers;
@ -3225,27 +3231,61 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
    llvm::Value *Size;
    unsigned MapType;

+    // VLA sizes are passed to the outlined region by copy.
    if (CI->capturesVariableArrayType()) {
      BasePointer = Pointer = *CV;
      Size = getTypeSize(CGF, RI->getType());
+      // Copy to the device as an argument. No need to retrieve it.
+      MapType = OMP_MAP_BYCOPY;
      hasVLACaptures = true;
-      // VLA sizes don't need to be copied back from the device.
-      MapType = OMP_MAP_TO;
    } else if (CI->capturesThis()) {
      BasePointer = Pointer = *CV;
      const PointerType *PtrTy = cast<PointerType>(RI->getType().getTypePtr());
      Size = getTypeSize(CGF, PtrTy->getPointeeType());
      // Default map type.
      MapType = OMP_MAP_TO | OMP_MAP_FROM;
+    } else if (CI->capturesVariableByCopy()) {
+      MapType = OMP_MAP_BYCOPY;
+      if (!RI->getType()->isAnyPointerType()) {
+        // If the field is not a pointer, we need to save the actual value and
+        // load it as a void pointer.
+        auto DstAddr = CGF.CreateMemTemp(
+            Ctx.getUIntPtrType(),
+            Twine(CI->getCapturedVar()->getName()) + ".casted");
+        LValue DstLV = CGF.MakeAddrLValue(DstAddr, Ctx.getUIntPtrType());
+
+        auto *SrcAddrVal = CGF.EmitScalarConversion(
+            DstAddr.getPointer(), Ctx.getPointerType(Ctx.getUIntPtrType()),
+            Ctx.getPointerType(RI->getType()), SourceLocation());
+        LValue SrcLV =
+            CGF.MakeNaturalAlignAddrLValue(SrcAddrVal, RI->getType());
+
+        // Store the value using the source type pointer.
+        CGF.EmitStoreThroughLValue(RValue::get(*CV), SrcLV);
+
+        // Load the value using the destination type pointer.
+        BasePointer = Pointer =
+            CGF.EmitLoadOfLValue(DstLV, SourceLocation()).getScalarVal();
+      } else {
+        MapType |= OMP_MAP_PTR;
+        BasePointer = Pointer = *CV;
+      }
+      Size = getTypeSize(CGF, RI->getType());
    } else {
+      assert(CI->capturesVariable() && "Expected captured reference.");
      BasePointer = Pointer = *CV;

      const ReferenceType *PtrTy =
          cast<ReferenceType>(RI->getType().getTypePtr());
      QualType ElementType = PtrTy->getPointeeType();
      Size = getTypeSize(CGF, ElementType);
-      // Default map type.
-      MapType = OMP_MAP_TO | OMP_MAP_FROM;
+      // The default map type for a scalar/complex type is 'to' because by
+      // default the value doesn't have to be retrieved. For an aggregate type,
+      // the default is 'tofrom'.
+      MapType = ElementType->isAggregateType() ? (OMP_MAP_TO | OMP_MAP_FROM)
+                                               : OMP_MAP_TO;
+      if (ElementType->isAnyPointerType())
+        MapType |= OMP_MAP_PTR;
    }

    BasePointers.push_back(BasePointer);
@ -3256,7 +3296,7 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,

  // Keep track on whether the host function has to be executed.
  auto OffloadErrorQType =
-      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true);
+      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true);
  auto OffloadError = CGF.MakeAddrLValue(
      CGF.CreateMemTemp(OffloadErrorQType, ".run_host_version"),
      OffloadErrorQType);
@ -3264,7 +3304,7 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
                        OffloadError);

  // Fill up the pointer arrays and transfer execution to the device.
-  auto &&ThenGen = [this, &BasePointers, &Pointers, &Sizes, &MapTypes,
+  auto &&ThenGen = [this, &Ctx, &BasePointers, &Pointers, &Sizes, &MapTypes,
                    hasVLACaptures, Device, OffloadError,
                    OffloadErrorQType](CodeGenFunction &CGF) {
    unsigned PointerNumVal = BasePointers.size();
@ -3276,8 +3316,8 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,

    if (PointerNumVal) {
      llvm::APInt PointerNumAP(32, PointerNumVal, /*isSigned=*/true);
-      QualType PointerArrayType = CGF.getContext().getConstantArrayType(
-          CGF.getContext().VoidPtrTy, PointerNumAP, ArrayType::Normal,
+      QualType PointerArrayType = Ctx.getConstantArrayType(
+          Ctx.VoidPtrTy, PointerNumAP, ArrayType::Normal,
          /*IndexTypeQuals=*/0);

      BasePointersArray =
@ -3289,8 +3329,8 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
      // sizes, otherwise we need to fill up the arrays as we do for the
      // pointers.
      if (hasVLACaptures) {
-        QualType SizeArrayType = CGF.getContext().getConstantArrayType(
-            CGF.getContext().getSizeType(), PointerNumAP, ArrayType::Normal,
+        QualType SizeArrayType = Ctx.getConstantArrayType(
+            Ctx.getSizeType(), PointerNumAP, ArrayType::Normal,
            /*IndexTypeQuals=*/0);
        SizesArray =
            CGF.CreateMemTemp(SizeArrayType, ".offload_sizes").getPointer();
@ -3323,29 +3363,41 @@ void CGOpenMPRuntime::emitTargetCall(CodeGenFunction &CGF,
      MapTypesArray = MapTypesArrayGbl;

      for (unsigned i = 0; i < PointerNumVal; ++i) {
+
+        llvm::Value *BPVal = BasePointers[i];
+        if (BPVal->getType()->isPointerTy())
+          BPVal = CGF.Builder.CreateBitCast(BPVal, CGM.VoidPtrTy);
+        else {
+          assert(BPVal->getType()->isIntegerTy() &&
+                 "If not a pointer, the value type must be an integer.");
+          BPVal = CGF.Builder.CreateIntToPtr(BPVal, CGM.VoidPtrTy);
+        }
        llvm::Value *BP = CGF.Builder.CreateConstInBoundsGEP2_32(
            llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal),
            BasePointersArray, 0, i);
-        Address BPAddr(BP, CGM.getContext().getTypeAlignInChars(
-                               CGM.getContext().VoidPtrTy));
-        CGF.Builder.CreateStore(
-            CGF.Builder.CreateBitCast(BasePointers[i], CGM.VoidPtrTy), BPAddr);
+        Address BPAddr(BP, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
+        CGF.Builder.CreateStore(BPVal, BPAddr);

+        llvm::Value *PVal = Pointers[i];
+        if (PVal->getType()->isPointerTy())
+          PVal = CGF.Builder.CreateBitCast(PVal, CGM.VoidPtrTy);
+        else {
+          assert(PVal->getType()->isIntegerTy() &&
+                 "If not a pointer, the value type must be an integer.");
+          PVal = CGF.Builder.CreateIntToPtr(PVal, CGM.VoidPtrTy);
+        }
        llvm::Value *P = CGF.Builder.CreateConstInBoundsGEP2_32(
            llvm::ArrayType::get(CGM.VoidPtrTy, PointerNumVal), PointersArray,
            0, i);
-        Address PAddr(P, CGM.getContext().getTypeAlignInChars(
-                             CGM.getContext().VoidPtrTy));
-        CGF.Builder.CreateStore(
-            CGF.Builder.CreateBitCast(Pointers[i], CGM.VoidPtrTy), PAddr);
+        Address PAddr(P, Ctx.getTypeAlignInChars(Ctx.VoidPtrTy));
+        CGF.Builder.CreateStore(PVal, PAddr);

        if (hasVLACaptures) {
          llvm::Value *S = CGF.Builder.CreateConstInBoundsGEP2_32(
              llvm::ArrayType::get(CGM.SizeTy, PointerNumVal), SizesArray,
              /*Idx0=*/0,
              /*Idx1=*/i);
-          Address SAddr(S, CGM.getContext().getTypeAlignInChars(
-                               CGM.getContext().getSizeType()));
+          Address SAddr(S, Ctx.getTypeAlignInChars(Ctx.getSizeType()));
          CGF.Builder.CreateStore(CGF.Builder.CreateIntCast(
                                      Sizes[i], CGM.SizeTy, /*isSigned=*/true),
                                  SAddr);
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@ -21,8 +21,7 @@ using namespace clang;
 using namespace CodeGen;

 void CodeGenFunction::GenerateOpenMPCapturedVars(
-    const CapturedStmt &S, SmallVectorImpl<llvm::Value *> &CapturedVars,
-    bool UseOnlyReferences) {
+    const CapturedStmt &S, SmallVectorImpl<llvm::Value *> &CapturedVars) {
  const RecordDecl *RD = S.getCapturedRecordDecl();
  auto CurField = RD->field_begin();
  auto CurCap = S.captures().begin();
@ -32,26 +31,46 @@ void CodeGenFunction::GenerateOpenMPCapturedVars(
    if (CurField->hasCapturedVLAType()) {
      auto VAT = CurField->getCapturedVLAType();
      auto *Val = VLASizeMap[VAT->getSizeExpr()];
-      // If we need to use only references, create a temporary location for the
-      // size of the VAT.
-      if (UseOnlyReferences) {
-        LValue LV =
-            MakeAddrLValue(CreateMemTemp(CurField->getType(), "__vla_size_ref"),
-                           CurField->getType());
-        EmitStoreThroughLValue(RValue::get(Val), LV);
-        Val = LV.getAddress().getPointer();
-      }
      CapturedVars.push_back(Val);
    } else if (CurCap->capturesThis())
      CapturedVars.push_back(CXXThisValue);
-    else
+    else if (CurCap->capturesVariableByCopy())
+      CapturedVars.push_back(
+          EmitLoadOfLValue(EmitLValue(*I), SourceLocation()).getScalarVal());
+    else {
+      assert(CurCap->capturesVariable() && "Expected capture by reference.");
      CapturedVars.push_back(EmitLValue(*I).getAddress().getPointer());
+    }
  }
 }

+static Address castValueFromUintptr(CodeGenFunction &CGF, QualType DstType,
+                                    StringRef Name, LValue AddrLV,
+                                    bool isReferenceType = false) {
+  ASTContext &Ctx = CGF.getContext();
+
+  auto *CastedPtr = CGF.EmitScalarConversion(
+      AddrLV.getAddress().getPointer(), Ctx.getUIntPtrType(),
+      Ctx.getPointerType(DstType), SourceLocation());
+  auto TmpAddr =
+      CGF.MakeNaturalAlignAddrLValue(CastedPtr, Ctx.getPointerType(DstType))
+          .getAddress();
+
+  // If we are dealing with references we need to return the address of the
+  // reference instead of the reference of the value.
+  if (isReferenceType) {
+    QualType RefType = Ctx.getLValueReferenceType(DstType);
+    auto *RefVal = TmpAddr.getPointer();
+    TmpAddr = CGF.CreateMemTemp(RefType, Twine(Name) + ".ref");
+    auto TmpLVal = CGF.MakeAddrLValue(TmpAddr, RefType);
+    CGF.EmitScalarInit(RefVal, TmpLVal);
+  }
+
+  return TmpAddr;
+}
+
 llvm::Function *
-CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
-                                                    bool UseOnlyReferences) {
+CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S) {
  assert(
      CapturedStmtInfo &&
      "CapturedStmtInfo should be set when generating the captured function");
@ -69,7 +88,17 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
    QualType ArgType = FD->getType();
    IdentifierInfo *II = nullptr;
    VarDecl *CapVar = nullptr;
-    if (I->capturesVariable()) {
+
+    // If this is a capture by copy and the type is not a pointer, the outlined
+    // function argument type should be uintptr and the value properly casted to
+    // uintptr. This is necessary given that the runtime library is only able to
+    // deal with pointers. We can pass in the same way the VLA type sizes to the
+    // outlined function.
+    if ((I->capturesVariableByCopy() && !ArgType->isAnyPointerType()) ||
+        I->capturesVariableArrayType())
+      ArgType = Ctx.getUIntPtrType();
+
+    if (I->capturesVariable() || I->capturesVariableByCopy()) {
      CapVar = I->getCapturedVar();
      II = CapVar->getIdentifier();
    } else if (I->capturesThis())
@ -77,9 +106,6 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
    else {
      assert(I->capturesVariableArrayType());
      II = &getContext().Idents.get("vla");
-      if (UseOnlyReferences)
-        ArgType = getContext().getLValueReferenceType(
-            ArgType, /*SpelledAsLValue=*/false);
    }
    if (ArgType->isVariablyModifiedType())
      ArgType = getContext().getVariableArrayDecayedType(ArgType);
@ -111,15 +137,24 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
  unsigned Cnt = CD->getContextParamPosition();
  I = S.captures().begin();
  for (auto *FD : RD->fields()) {
+    // If we are capturing a pointer by copy we don't need to do anything, just
+    // use the value that we get from the arguments.
+    if (I->capturesVariableByCopy() && FD->getType()->isAnyPointerType()) {
+      setAddrOfLocalVar(I->getCapturedVar(), GetAddrOfLocalVar(Args[Cnt]));
+      ++Cnt, ++I;
+      continue;
+    }
+
    LValue ArgLVal =
        MakeAddrLValue(GetAddrOfLocalVar(Args[Cnt]), Args[Cnt]->getType(),
                       AlignmentSource::Decl);
    if (FD->hasCapturedVLAType()) {
-      if (UseOnlyReferences)
-        ArgLVal = EmitLoadOfReferenceLValue(
-            ArgLVal.getAddress(), ArgLVal.getType()->castAs<ReferenceType>());
+      LValue CastedArgLVal =
+          MakeAddrLValue(castValueFromUintptr(*this, FD->getType(),
+                                              Args[Cnt]->getName(), ArgLVal),
+                         FD->getType(), AlignmentSource::Decl);
      auto *ExprArg =
-          EmitLoadOfLValue(ArgLVal, SourceLocation()).getScalarVal();
+          EmitLoadOfLValue(CastedArgLVal, SourceLocation()).getScalarVal();
      auto VAT = FD->getCapturedVLAType();
      VLASizeMap[VAT->getSizeExpr()] = ExprArg;
    } else if (I->capturesVariable()) {
@ -132,6 +167,15 @@ CodeGenFunction::GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
      }
      setAddrOfLocalVar(
          Var, Address(ArgAddr.getPointer(), getContext().getDeclAlign(Var)));
+    } else if (I->capturesVariableByCopy()) {
+      assert(!FD->getType()->isAnyPointerType() &&
+             "Not expecting a captured pointer.");
+      auto *Var = I->getCapturedVar();
+      QualType VarTy = Var->getType();
+      setAddrOfLocalVar(I->getCapturedVar(),
+                        castValueFromUintptr(*this, FD->getType(),
+                                             Args[Cnt]->getName(), ArgLVal,
+                                             VarTy->isReferenceType()));
    } else {
      // If 'this' is captured, load it into CXXThisValue.
      assert(I->capturesThis());
@ -2480,7 +2524,7 @@ void CodeGenFunction::EmitOMPTargetDirective(const OMPTargetDirective &S) {
  const CapturedStmt &CS = *cast<CapturedStmt>(S.getAssociatedStmt());

  llvm::SmallVector<llvm::Value *, 16> CapturedVars;
-  GenerateOpenMPCapturedVars(CS, CapturedVars, /*UseOnlyReferences=*/true);
+  GenerateOpenMPCapturedVars(CS, CapturedVars);

  // Emit target region as a standalone region.
  auto &&CodeGen = [&CS](CodeGenFunction &CGF) {
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@ -2200,12 +2200,9 @@ public:
  llvm::Function *EmitCapturedStmt(const CapturedStmt &S, CapturedRegionKind K);
  llvm::Function *GenerateCapturedStmtFunction(const CapturedStmt &S);
  Address GenerateCapturedStmtArgument(const CapturedStmt &S);
-  llvm::Function *
-  GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S,
-                                     bool UseOnlyReferences = false);
+  llvm::Function *GenerateOpenMPCapturedStmtFunction(const CapturedStmt &S);
  void GenerateOpenMPCapturedVars(const CapturedStmt &S,
-                                  SmallVectorImpl<llvm::Value *> &CapturedVars,
-                                  bool UseOnlyReferences = false);
+                                  SmallVectorImpl<llvm::Value *> &CapturedVars);
  /// \brief Perform element by element copying of arrays with type \a
  /// OriginalType from \a SrcAddr to \a DestAddr using copying procedure
  /// generated by \a CopyGen.
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@ -12621,10 +12621,15 @@ static bool isVariableAlreadyCapturedInScopeInfo(CapturingScopeInfo *CSI, VarDec
      
    // Compute the type of an expression that refers to this variable.
    DeclRefType = CaptureType.getNonReferenceType();
-      
+
+    // Similarly to mutable captures in lambda, all the OpenMP captures by copy
+    // are mutable in the sense that user can change their value - they are
+    // private instances of the captured declarations.
    const CapturingScopeInfo::Capture &Cap = CSI->getCapture(Var);
    if (Cap.isCopyCapture() &&
-        !(isa<LambdaScopeInfo>(CSI) && cast<LambdaScopeInfo>(CSI)->Mutable))
+        !(isa<LambdaScopeInfo>(CSI) && cast<LambdaScopeInfo>(CSI)->Mutable) &&
+        !(isa<CapturedRegionScopeInfo>(CSI) &&
+          cast<CapturedRegionScopeInfo>(CSI)->CapRegionKind == CR_OpenMP))
      DeclRefType.addConst();
    return true;
  }
@ -12812,9 +12817,17 @@ static bool captureInCapturedRegion(CapturedRegionScopeInfo *RSI,
  // By default, capture variables by reference.
  bool ByRef = true;
  // Using an LValue reference type is consistent with Lambdas (see below).
-  if (S.getLangOpts().OpenMP && S.IsOpenMPCapturedVar(Var))
-    DeclRefType = DeclRefType.getUnqualifiedType();
-  CaptureType = S.Context.getLValueReferenceType(DeclRefType);
+  if (S.getLangOpts().OpenMP) {
+    ByRef = S.IsOpenMPCapturedByRef(Var, RSI);
+    if (S.IsOpenMPCapturedVar(Var))
+      DeclRefType = DeclRefType.getUnqualifiedType();
+  }
+
+  if (ByRef)
+    CaptureType = S.Context.getLValueReferenceType(DeclRefType);
+  else
+    CaptureType = DeclRefType;
+
  Expr *CopyExpr = nullptr;
  if (BuildAndDiagnose) {
    // The current implementation assumes that all variables are captured
--- a/clang/lib/Sema/SemaOpenMP.cpp
+++ b/clang/lib/Sema/SemaOpenMP.cpp
@ -222,6 +222,8 @@ public:
      return Stack[Stack.size() - 2].Directive;
    return OMPD_unknown;
  }
+  /// \brief Return the directive associated with the provided scope.
+  OpenMPDirectiveKind getDirectiveForScope(const Scope *S) const;

  /// \brief Set default data sharing attribute to none.
  void setDefaultDSANone(SourceLocation Loc) {
@ -729,12 +731,107 @@ bool DSAStackTy::hasDirective(NamedDirectivesPredicate DPred, bool FromParent) {
  return false;
 }

+OpenMPDirectiveKind DSAStackTy::getDirectiveForScope(const Scope *S) const {
+  for (auto I = Stack.rbegin(), EE = Stack.rend(); I != EE; ++I)
+    if (I->CurScope == S)
+      return I->Directive;
+  return OMPD_unknown;
+}
+
 void Sema::InitDataSharingAttributesStack() {
  VarDataSharingAttributesStack = new DSAStackTy(*this);
 }

 #define DSAStack static_cast<DSAStackTy *>(VarDataSharingAttributesStack)

+bool Sema::IsOpenMPCapturedByRef(VarDecl *VD,
+                                 const CapturedRegionScopeInfo *RSI) {
+  assert(LangOpts.OpenMP && "OpenMP is not allowed");
+
+  auto &Ctx = getASTContext();
+  bool IsByRef = true;
+
+  // Find the directive that is associated with the provided scope.
+  auto DKind = DSAStack->getDirectiveForScope(RSI->TheScope);
+  auto Ty = VD->getType();
+
+  if (isOpenMPTargetDirective(DKind)) {
+    // This table summarizes how a given variable should be passed to the device
+    // given its type and the clauses where it appears. This table is based on
+    // the description in OpenMP 4.5 [2.10.4, target Construct] and
+    // OpenMP 4.5 [2.15.5, Data-mapping Attribute Rules and Clauses].
+    //
+    // =========================================================================
+    // | type |  defaultmap   | pvt | first | is_device_ptr |    map   | res.  |
+    // |      |(tofrom:scalar)|     |  pvt  |               |          |       |
+    // =========================================================================
+    // | scl  |               |     |       |       -       |          | bycopy|
+    // | scl  |               |  -  |   x   |       -       |     -    | bycopy|
+    // | scl  |               |  x  |   -   |       -       |     -    | null  |
+    // | scl  |       x       |     |       |       -       |          | byref |
+    // | scl  |       x       |  -  |   x   |       -       |     -    | bycopy|
+    // | scl  |       x       |  x  |   -   |       -       |     -    | null  |
+    // | scl  |               |  -  |   -   |       -       |     x    | byref |
+    // | scl  |       x       |  -  |   -   |       -       |     x    | byref |
+    //
+    // | agg  |      n.a.     |     |       |       -       |          | byref |
+    // | agg  |      n.a.     |  -  |   x   |       -       |     -    | byref |
+    // | agg  |      n.a.     |  x  |   -   |       -       |     -    | null  |
+    // | agg  |      n.a.     |  -  |   -   |       -       |     x    | byref |
+    // | agg  |      n.a.     |  -  |   -   |       -       |    x[]   | byref |
+    //
+    // | ptr  |      n.a.     |     |       |       -       |          | bycopy|
+    // | ptr  |      n.a.     |  -  |   x   |       -       |     -    | bycopy|
+    // | ptr  |      n.a.     |  x  |   -   |       -       |     -    | null  |
+    // | ptr  |      n.a.     |  -  |   -   |       -       |     x    | byref |
+    // | ptr  |      n.a.     |  -  |   -   |       -       |    x[]   | bycopy|
+    // | ptr  |      n.a.     |  -  |   -   |       x       |          | bycopy|
+    // | ptr  |      n.a.     |  -  |   -   |       x       |     x    | bycopy|
+    // | ptr  |      n.a.     |  -  |   -   |       x       |    x[]   | bycopy|
+    // =========================================================================
+    // Legend:
+    //  scl - scalar
+    //  ptr - pointer
+    //  agg - aggregate
+    //  x - applies
+    //  - - invalid in this combination
+    //  [] - mapped with an array section
+    //  byref - should be mapped by reference
+    //  byval - should be mapped by value
+    //  null - initialize a local variable to null on the device
+    //
+    // Observations:
+    //  - All scalar declarations that show up in a map clause have to be passed
+    //    by reference, because they may have been mapped in the enclosing data
+    //    environment.
+    //  - If the scalar value does not fit the size of uintptr, it has to be
+    //    passed by reference, regardless the result in the table above.
+    //  - For pointers mapped by value that have either an implicit map or an
+    //    array section, the runtime library may pass the NULL value to the
+    //    device instead of the value passed to it by the compiler.
+
+    // FIXME: Right now, only implicit maps are implemented. Properly mapping
+    // values requires having the map, private, and firstprivate clauses SEMA
+    // and parsing in place, which we don't yet.
+
+    if (Ty->isReferenceType())
+      Ty = Ty->castAs<ReferenceType>()->getPointeeType();
+    IsByRef = !Ty->isScalarType();
+  }
+
+  // When passing data by value, we need to make sure it fits the uintptr size
+  // and alignment, because the runtime library only deals with uintptr types.
+  // If it does not fit the uintptr size, we need to pass the data by reference
+  // instead.
+  if (!IsByRef &&
+      (Ctx.getTypeSizeInChars(Ty) >
+           Ctx.getTypeSizeInChars(Ctx.getUIntPtrType()) ||
+       Ctx.getDeclAlign(VD) > Ctx.getTypeAlignInChars(Ctx.getUIntPtrType())))
+    IsByRef = true;
+
+  return IsByRef;
+}
+
 bool Sema::IsOpenMPCapturedVar(VarDecl *VD) {
  assert(LangOpts.OpenMP && "OpenMP is not allowed");
  VD = VD->getCanonicalDecl();
@ -5015,7 +5112,13 @@ StmtResult Sema::ActOnOpenMPTargetDirective(ArrayRef<OMPClause *> Clauses,
  if (!AStmt)
    return StmtError();

-  assert(isa<CapturedStmt>(AStmt) && "Captured statement expected");
+  CapturedStmt *CS = cast<CapturedStmt>(AStmt);
+  // 1.2.2 OpenMP Language Terminology
+  // Structured block - An executable statement with a single entry at the
+  // top and a single exit at the bottom.
+  // The point of exit cannot be a branch out of the structured block.
+  // longjmp() and throw() must not violate the entry/exit criteria.
+  CS->getCapturedDecl()->setNothrow();

  // OpenMP [2.16, Nesting of Regions]
  // If specified, a teams construct must be contained within a target
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@ -3803,11 +3803,10 @@ static void buildCapturedStmtCaptureList(
      continue;
    }

-    assert(Cap->isReferenceCapture() &&
-           "non-reference capture not yet implemented");
-
    Captures.push_back(CapturedStmt::Capture(Cap->getLocation(),
-                                             CapturedStmt::VCK_ByRef,
+                                             Cap->isReferenceCapture()
+                                                 ? CapturedStmt::VCK_ByRef
+                                                 : CapturedStmt::VCK_ByCopy,
                                             Cap->getVariable()));
    CaptureInits.push_back(Cap->getInitExpr());
  }
--- a/clang/test/OpenMP/target_codegen.cpp
+++ b/clang/test/OpenMP/target_codegen.cpp
@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
@ -16,15 +16,15 @@
 // sizes.

 // CHECK-DAG: [[SIZET2:@.+]] = private unnamed_addr constant [1 x i{{32|64}}] [i[[SZ:32|64]] 2]
-// CHECK-DAG: [[MAPT2:@.+]] = private unnamed_addr constant [1 x i32] [i32 3]
+// CHECK-DAG: [[MAPT2:@.+]] = private unnamed_addr constant [1 x i32] [i32 128]
 // CHECK-DAG: [[SIZET3:@.+]] = private unnamed_addr constant [2 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2]
-// CHECK-DAG: [[MAPT3:@.+]] = private unnamed_addr constant [2 x i32] [i32 3, i32 3]
-// CHECK-DAG: [[MAPT4:@.+]] = private unnamed_addr constant [9 x i32] [i32 3, i32 3, i32 1, i32 3, i32 3, i32 1, i32 1, i32 3, i32 3]
+// CHECK-DAG: [[MAPT3:@.+]] = private unnamed_addr constant [2 x i32] [i32 128, i32 128]
+// CHECK-DAG: [[MAPT4:@.+]] = private unnamed_addr constant [9 x i32] [i32 128, i32 3, i32 128, i32 3, i32 3, i32 128, i32 128, i32 3, i32 3]
 // CHECK-DAG: [[SIZET5:@.+]] = private unnamed_addr constant [3 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 40]
-// CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i32] [i32 3, i32 3, i32 3]
+// CHECK-DAG: [[MAPT5:@.+]] = private unnamed_addr constant [3 x i32] [i32 128, i32 128, i32 3]
 // CHECK-DAG: [[SIZET6:@.+]] = private unnamed_addr constant [4 x i[[SZ]]] [i[[SZ]] 4, i[[SZ]] 2, i[[SZ]] 1, i[[SZ]] 40]
-// CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i32] [i32 3, i32 3, i32 3, i32 3]
-// CHECK-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i32] [i32 3, i32 3, i32 1, i32 1, i32 3]
+// CHECK-DAG: [[MAPT6:@.+]] = private unnamed_addr constant [4 x i32] [i32 128, i32 128, i32 128, i32 3]
+// CHECK-DAG: [[MAPT7:@.+]] = private unnamed_addr constant [5 x i32] [i32 3, i32 128, i32 128, i32 128, i32 3]
 // CHECK-DAG: @{{.*}} = private constant i8 0
 // CHECK-DAG: @{{.*}} = private constant i8 0
 // CHECK-DAG: @{{.*}} = private constant i8 0
@ -66,7 +66,7 @@ int foo(int n) {
  // CHECK:       store i32 -1, i32* [[RHV]], align 4
  // CHECK:       [[RET2:%.+]] = load i32, i32* [[RHV]], align 4
  // CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0
-  // CHECK:       call void [[HVT1:@.+]](i32* {{[^,]+}})
+  // CHECK:       call void [[HVT1:@.+]](i[[SZ]] {{[^,]+}})
  #pragma omp target if(0)
  {
    a += 1;
@ -79,15 +79,15 @@ int foo(int n) {
  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [1 x i8*], [1 x i8*]* [[PR]], i32 0, i32 [[IDX0]]
  // CHECK-DAG:   store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]]
  // CHECK-DAG:   store i8* [[P0:%[^,]+]], i8** [[PADDR0]]
-  // CHECK-DAG:   [[BP0]] = bitcast i16* %{{.+}} to i8*
-  // CHECK-DAG:   [[P0]] = bitcast i16* %{{.+}} to i8*
+  // CHECK-DAG:   [[BP0]] = inttoptr i[[SZ]] %{{.+}} to i8*
+  // CHECK-DAG:   [[P0]] = inttoptr i[[SZ]] %{{.+}} to i8*

  // CHECK:       store i32 [[RET]], i32* [[RHV:%.+]], align 4
  // CHECK:       [[RET2:%.+]] = load i32, i32* [[RHV]], align 4
  // CHECK-NEXT:  [[ERROR:%.+]] = icmp ne i32 [[RET2]], 0
  // CHECK-NEXT:  br i1 [[ERROR]], label %[[FAIL:[^,]+]], label %[[END:[^,]+]]
  // CHECK:       [[FAIL]]
-  // CHECK:       call void [[HVT2:@.+]](i16* {{[^,]+}})
+  // CHECK:       call void [[HVT2:@.+]](i[[SZ]] {{[^,]+}})
  // CHECK-NEXT:  br label %[[END]]
  // CHECK:       [[END]]
  #pragma omp target if(1)
@ -106,15 +106,15 @@ int foo(int n) {
  // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 0
  // CHECK-DAG:   store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]]
  // CHECK-DAG:   store i8* [[P0:%[^,]+]], i8** [[PADDR0]]
-  // CHECK-DAG:   [[BP0]] = bitcast i32* %{{.+}} to i8*
-  // CHECK-DAG:   [[P0]] = bitcast i32* %{{.+}} to i8*
+  // CHECK-DAG:   [[BP0]] = inttoptr i[[SZ]] %{{.+}} to i8*
+  // CHECK-DAG:   [[P0]] = inttoptr i[[SZ]] %{{.+}} to i8*

  // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[BP]], i32 0, i32 1
  // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [2 x i8*], [2 x i8*]* [[P]], i32 0, i32 1
  // CHECK-DAG:   store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]]
  // CHECK-DAG:   store i8* [[P1:%[^,]+]], i8** [[PADDR1]]
-  // CHECK-DAG:   [[BP1]] = bitcast i16* %{{.+}} to i8*
-  // CHECK-DAG:   [[P1]] = bitcast i16* %{{.+}} to i8*
+  // CHECK-DAG:   [[BP1]] = inttoptr i[[SZ]] %{{.+}} to i8*
+  // CHECK-DAG:   [[P1]] = inttoptr i[[SZ]] %{{.+}} to i8*
  // CHECK:       store i32 [[RET]], i32* [[RHV:%.+]], align 4
  // CHECK-NEXT:  br label %[[IFEND:.+]]

@ -137,12 +137,17 @@ int foo(int n) {
  }

  // We capture 3 VLA sizes in this target region
-  // CHECK:       store i[[SZ]] [[BNELEMSIZE:%.+]], i[[SZ]]* [[VLA0:%[^,]+]]
-  // CHECK:       store i[[SZ]] 5, i[[SZ]]* [[VLA1:%[^,]+]]
-  // CHECK:       store i[[SZ]] [[CNELEMSIZE1:%.+]], i[[SZ]]* [[VLA2:%[^,]+]]
+  // CHECK-64:       [[A_VAL:%.+]] = load i32, i32* %{{.+}},
+  // CHECK-64:       [[A_ADDR:%.+]] = bitcast i[[SZ]]* [[A_CADDR:%.+]] to i32*
+  // CHECK-64:       store i32 [[A_VAL]], i32* [[A_ADDR]],
+  // CHECK-64:       [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]],

-  // CHECK:       [[BNSIZE:%.+]] = mul nuw i[[SZ]] [[BNELEMSIZE]], 4
-  // CHECK:       [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[CNELEMSIZE1]]
+  // CHECK-32:       [[A_VAL:%.+]] = load i32, i32* %{{.+}},
+  // CHECK-32:       store i32 [[A_VAL]], i32* [[A_CADDR:%.+]],
+  // CHECK-32:       [[A_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[A_CADDR]],
+
+  // CHECK:       [[BNSIZE:%.+]] = mul nuw i[[SZ]] [[VLA0:%.+]], 4
+  // CHECK:       [[CNELEMSIZE2:%.+]] = mul nuw i[[SZ]] 5, [[VLA1:%.+]]
  // CHECK:       [[CNSIZE:%.+]] = mul nuw i[[SZ]] [[CNELEMSIZE2]], 8

  // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 20
@ -183,26 +188,24 @@ int foo(int n) {

  // The names below are not necessarily consistent with the names used for the
  // addresses above as some are repeated.
-  // CHECK-DAG:   [[BP0:%[^,]+]] = bitcast i[[SZ]]* [[VLA0]] to i8*
-  // CHECK-DAG:   [[P0:%[^,]+]] = bitcast i[[SZ]]* [[VLA0]] to i8*
+  // CHECK-DAG:   [[BP0:%[^,]+]] = inttoptr i[[SZ]] [[VLA0]] to i8*
+  // CHECK-DAG:   [[P0:%[^,]+]] = inttoptr i[[SZ]] [[VLA0]] to i8*
  // CHECK-DAG:   store i8* [[BP0]], i8** {{%[^,]+}}
  // CHECK-DAG:   store i8* [[P0]], i8** {{%[^,]+}}
  // CHECK-DAG:   store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}}

-  // CHECK-DAG:   [[BP1:%[^,]+]] = bitcast i[[SZ]]* [[VLA1]] to i8*
-  // CHECK-DAG:   [[P1:%[^,]+]] = bitcast i[[SZ]]* [[VLA1]] to i8*
+  // CHECK-DAG:   [[BP1:%[^,]+]] = inttoptr i[[SZ]] [[VLA1]] to i8*
+  // CHECK-DAG:   [[P1:%[^,]+]] = inttoptr i[[SZ]] [[VLA1]] to i8*
  // CHECK-DAG:   store i8* [[BP1]], i8** {{%[^,]+}}
  // CHECK-DAG:   store i8* [[P1]], i8** {{%[^,]+}}
  // CHECK-DAG:   store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}}

-  // CHECK-DAG:   [[BP2:%[^,]+]] = bitcast i[[SZ]]* [[VLA2]] to i8*
-  // CHECK-DAG:   [[P2:%[^,]+]] = bitcast i[[SZ]]* [[VLA2]] to i8*
-  // CHECK-DAG:   store i8* [[BP2]], i8** {{%[^,]+}}
-  // CHECK-DAG:   store i8* [[P2]], i8** {{%[^,]+}}
+  // CHECK-DAG:   store i8* inttoptr (i[[SZ]] 5 to i8*), i8** {{%[^,]+}}
+  // CHECK-DAG:   store i8* inttoptr (i[[SZ]] 5 to i8*), i8** {{%[^,]+}}
  // CHECK-DAG:   store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}}

-  // CHECK-DAG:   [[BP3:%[^,]+]] = bitcast i32* %{{.+}} to i8*
-  // CHECK-DAG:   [[P3:%[^,]+]] = bitcast i32* %{{.+}} to i8*
+  // CHECK-DAG:   [[BP3:%[^,]+]] = inttoptr i[[SZ]] [[A_CVAL]] to i8*
+  // CHECK-DAG:   [[P3:%[^,]+]] = inttoptr i[[SZ]] [[A_CVAL]] to i8*
  // CHECK-DAG:   store i8* [[BP3]], i8** {{%[^,]+}}
  // CHECK-DAG:   store i8* [[P3]], i8** {{%[^,]+}}
  // CHECK-DAG:   store i[[SZ]] 4, i[[SZ]]* {{%[^,]+}}
@ -265,67 +268,67 @@ int foo(int n) {

 // CHECK:       define internal void [[HVT0]]()

-// CHECK:       define internal void [[HVT1]](i32* dereferenceable(4) %{{.+}})
+// CHECK:       define internal void [[HVT1]](i[[SZ]] %{{.+}})
 // Create stack storage and store argument in there.
-// CHECK:       [[A_ADDR:%.+]] = alloca i32*, align
-// CHECK:       store i32* %{{.+}}, i32** [[A_ADDR]], align
-// CHECK:       [[A_ADDR2:%.+]] = load i32*, i32** [[A_ADDR]], align
-// CHECK:       load i32, i32* [[A_ADDR2]], align
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK-64:    [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i32*
+// CHECK-64:    load i32, i32* [[AA_CADDR]], align
+// CHECK-32:    load i32, i32* [[AA_ADDR]], align

-// CHECK:       define internal void [[HVT2]](i16* dereferenceable(2) %{{.+}})
+// CHECK:       define internal void [[HVT2]](i[[SZ]] %{{.+}})
 // Create stack storage and store argument in there.
-// CHECK:       [[AA_ADDR:%.+]] = alloca i16*, align
-// CHECK:       store i16* %{{.+}}, i16** [[AA_ADDR]], align
-// CHECK:       [[AA_ADDR2:%.+]] = load i16*, i16** [[AA_ADDR]], align
-// CHECK:       load i16, i16* [[AA_ADDR2]], align
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK:       [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*
+// CHECK:       load i16, i16* [[AA_CADDR]], align

 // CHECK:       define internal void [[HVT3]]
 // Create stack storage and store argument in there.
-// CHECK-DAG:   [[A_ADDR:%.+]] = alloca i32*, align
-// CHECK-DAG:   [[AA_ADDR:%.+]] = alloca i16*, align
-// CHECK-DAG:   store i32* %{{.+}}, i32** [[A_ADDR]], align
-// CHECK-DAG:   store i16* %{{.+}}, i16** [[AA_ADDR]], align
-// CHECK-DAG:   [[A_ADDR2:%.+]] = load i32*, i32** [[A_ADDR]], align
-// CHECK-DAG:   [[AA_ADDR2:%.+]] = load i16*, i16** [[AA_ADDR]], align
-// CHECK-DAG:   load i32, i32* [[A_ADDR2]], align
-// CHECK-DAG:   load i16, i16* [[AA_ADDR2]], align
+// CHECK:       [[A_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK:       [[AA_ADDR:%.+]] = alloca i[[SZ]], align
+// CHECK-DAG:   store i[[SZ]] %{{.+}}, i[[SZ]]* [[A_ADDR]], align
+// CHECK-DAG:   store i[[SZ]] %{{.+}}, i[[SZ]]* [[AA_ADDR]], align
+// CHECK-64-DAG:[[A_CADDR:%.+]] = bitcast i[[SZ]]* [[A_ADDR]] to i32*
+// CHECK-DAG:   [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*
+// CHECK-64-DAG:load i32, i32* [[A_CADDR]], align
+// CHECK-32-DAG:load i32, i32* [[A_ADDR]], align
+// CHECK-DAG:   load i16, i16* [[AA_CADDR]], align

 // CHECK:       define internal void [[HVT4]]
 // Create local storage for each capture.
-// CHECK-DAG:   [[LOCAL_A:%.+]] = alloca i32*
-// CHECK-DAG:   [[LOCAL_B:%.+]] = alloca [10 x float]*
-// CHECK-DAG:   [[LOCAL_VLA1:%.+]] = alloca i[[SZ]]*
-// CHECK-DAG:   [[LOCAL_BN:%.+]] = alloca float*
-// CHECK-DAG:   [[LOCAL_C:%.+]] = alloca [5 x [10 x double]]*
-// CHECK-DAG:   [[LOCAL_VLA2:%.+]] = alloca i[[SZ]]*
-// CHECK-DAG:   [[LOCAL_VLA3:%.+]] = alloca i[[SZ]]*
-// CHECK-DAG:   [[LOCAL_CN:%.+]] = alloca double*
-// CHECK-DAG:   [[LOCAL_D:%.+]] = alloca [[TT]]*
-// CHECK-DAG:   store i32* [[ARG_A:%.+]], i32** [[LOCAL_A]]
+// CHECK:       [[LOCAL_A:%.+]] = alloca i[[SZ]]
+// CHECK:       [[LOCAL_B:%.+]] = alloca [10 x float]*
+// CHECK:       [[LOCAL_VLA1:%.+]] = alloca i[[SZ]]
+// CHECK:       [[LOCAL_BN:%.+]] = alloca float*
+// CHECK:       [[LOCAL_C:%.+]] = alloca [5 x [10 x double]]*
+// CHECK:       [[LOCAL_VLA2:%.+]] = alloca i[[SZ]]
+// CHECK:       [[LOCAL_VLA3:%.+]] = alloca i[[SZ]]
+// CHECK:       [[LOCAL_CN:%.+]] = alloca double*
+// CHECK:       [[LOCAL_D:%.+]] = alloca [[TT]]*
+// CHECK-DAG:   store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
 // CHECK-DAG:   store [10 x float]* [[ARG_B:%.+]], [10 x float]** [[LOCAL_B]]
-// CHECK-DAG:   store i[[SZ]]* [[ARG_VLA1:%.+]], i[[SZ]]** [[LOCAL_VLA1]]
+// CHECK-DAG:   store i[[SZ]] [[ARG_VLA1:%.+]], i[[SZ]]* [[LOCAL_VLA1]]
 // CHECK-DAG:   store float* [[ARG_BN:%.+]], float** [[LOCAL_BN]]
 // CHECK-DAG:   store [5 x [10 x double]]* [[ARG_C:%.+]], [5 x [10 x double]]** [[LOCAL_C]]
-// CHECK-DAG:   store i[[SZ]]* [[ARG_VLA2:%.+]], i[[SZ]]** [[LOCAL_VLA2]]
-// CHECK-DAG:   store i[[SZ]]* [[ARG_VLA3:%.+]], i[[SZ]]** [[LOCAL_VLA3]]
+// CHECK-DAG:   store i[[SZ]] [[ARG_VLA2:%.+]], i[[SZ]]* [[LOCAL_VLA2]]
+// CHECK-DAG:   store i[[SZ]] [[ARG_VLA3:%.+]], i[[SZ]]* [[LOCAL_VLA3]]
 // CHECK-DAG:   store double* [[ARG_CN:%.+]], double** [[LOCAL_CN]]
 // CHECK-DAG:   store [[TT]]* [[ARG_D:%.+]], [[TT]]** [[LOCAL_D]]

-// CHECK-DAG:   [[REF_A:%.+]] = load i32*, i32** [[LOCAL_A]],
+// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
 // CHECK-DAG:   [[REF_B:%.+]] = load [10 x float]*, [10 x float]** [[LOCAL_B]],
-// CHECK-DAG:   [[REF_VLA1:%.+]] = load i[[SZ]]*, i[[SZ]]** [[LOCAL_VLA1]],
-// CHECK-DAG:   [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[REF_VLA1]],
+// CHECK-DAG:   [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]],
 // CHECK-DAG:   [[REF_BN:%.+]] = load float*, float** [[LOCAL_BN]],
 // CHECK-DAG:   [[REF_C:%.+]] = load [5 x [10 x double]]*, [5 x [10 x double]]** [[LOCAL_C]],
-// CHECK-DAG:   [[REF_VLA2:%.+]] = load i[[SZ]]*, i[[SZ]]** [[LOCAL_VLA2]],
-// CHECK-DAG:   [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[REF_VLA2]],
-// CHECK-DAG:   [[REF_VLA3:%.+]] = load i[[SZ]]*, i[[SZ]]** [[LOCAL_VLA3]],
-// CHECK-DAG:   [[VAL_VLA3:%.+]] = load i[[SZ]], i[[SZ]]* [[REF_VLA3]],
+// CHECK-DAG:   [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]],
+// CHECK-DAG:   [[VAL_VLA3:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA3]],
 // CHECK-DAG:   [[REF_CN:%.+]] = load double*, double** [[LOCAL_CN]],
 // CHECK-DAG:   [[REF_D:%.+]] = load [[TT]]*, [[TT]]** [[LOCAL_D]],

 // Use captures.
-// CHECK-DAG:   load i32, i32* [[REF_A]]
+// CHECK-64-DAG:   load i32, i32* [[REF_A]]
+// CHECK-32-DAG:   load i32, i32* [[LOCAL_A]]
 // CHECK-DAG:   getelementptr inbounds [10 x float], [10 x float]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
 // CHECK-DAG:   getelementptr inbounds float, float* [[REF_BN]], i[[SZ]] 3
 // CHECK-DAG:   getelementptr inbounds [5 x [10 x double]], [5 x [10 x double]]* [[REF_C]], i[[SZ]] 0, i[[SZ]] 1
@ -406,10 +409,16 @@ int bar(int n){
 //
 // CHECK: define {{.*}}[[FS1]]
 //
+// CHECK:          i8* @llvm.stacksave()
+// CHECK-64:       [[B_ADDR:%.+]] = bitcast i[[SZ]]* [[B_CADDR:%.+]] to i32*
+// CHECK-64:       store i32 %{{.+}}, i32* [[B_ADDR]],
+// CHECK-64:       [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_CADDR]],
+
+// CHECK-32:       store i32 %{{.+}}, i32* [[B_ADDR:%.+]],
+// CHECK-32:       [[B_CVAL:%.+]] = load i[[SZ]], i[[SZ]]* [[B_ADDR]],
+
 // We capture 2 VLA sizes in this target region
-// CHECK:       store i[[SZ]] 2, i[[SZ]]* [[VLA0:%[^,]+]]
-// CHECK:       store i[[SZ]] [[CELEMSIZE1:%.+]], i[[SZ]]* [[VLA1:%[^,]+]]
-// CHECK:       [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[CELEMSIZE1]]
+// CHECK:       [[CELEMSIZE2:%.+]] = mul nuw i[[SZ]] 2, [[VLA0:%.+]]
 // CHECK:       [[CSIZE:%.+]] = mul nuw i[[SZ]] [[CELEMSIZE2]], 2

 // CHECK:       [[IF:%.+]] = icmp sgt i32 {{[^,]+}}, 60
@ -434,20 +443,18 @@ int bar(int n){

 // The names below are not necessarily consistent with the names used for the
 // addresses above as some are repeated.
-// CHECK-DAG:   [[BP0:%[^,]+]] = bitcast i[[SZ]]* [[VLA0]] to i8*
-// CHECK-DAG:   [[P0:%[^,]+]] = bitcast i[[SZ]]* [[VLA0]] to i8*
+// CHECK-DAG:   [[BP0:%[^,]+]] = inttoptr i[[SZ]] [[VLA0]] to i8*
+// CHECK-DAG:   [[P0:%[^,]+]] = inttoptr i[[SZ]] [[VLA0]] to i8*
 // CHECK-DAG:   store i8* [[BP0]], i8** {{%[^,]+}}
 // CHECK-DAG:   store i8* [[P0]], i8** {{%[^,]+}}
 // CHECK-DAG:   store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}}

-// CHECK-DAG:   [[BP1:%[^,]+]] = bitcast i[[SZ]]* [[VLA1]] to i8*
-// CHECK-DAG:   [[P1:%[^,]+]] = bitcast i[[SZ]]* [[VLA1]] to i8*
-// CHECK-DAG:   store i8* [[BP1]], i8** {{%[^,]+}}
-// CHECK-DAG:   store i8* [[P1]], i8** {{%[^,]+}}
+// CHECK-DAG:   store i8* inttoptr (i[[SZ]] 2 to i8*), i8** {{%[^,]+}}
+// CHECK-DAG:   store i8* inttoptr (i[[SZ]] 2 to i8*), i8** {{%[^,]+}}
 // CHECK-DAG:   store i[[SZ]] {{4|8}}, i[[SZ]]* {{%[^,]+}}

-// CHECK-DAG:   [[BP2:%[^,]+]] = bitcast i32* %{{.+}} to i8*
-// CHECK-DAG:   [[P2:%[^,]+]] = bitcast i32* %{{.+}} to i8*
+// CHECK-DAG:   [[BP2:%[^,]+]] = inttoptr i[[SZ]] [[B_CVAL]] to i8*
+// CHECK-DAG:   [[P2:%[^,]+]] = inttoptr i[[SZ]] [[B_CVAL]] to i8*
 // CHECK-DAG:   store i8* [[BP2]], i8** {{%[^,]+}}
 // CHECK-DAG:   store i8* [[P2]], i8** {{%[^,]+}}
 // CHECK-DAG:   store i[[SZ]] 4, i[[SZ]]* {{%[^,]+}}
@ -488,15 +495,15 @@ int bar(int n){
 // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 0
 // CHECK-DAG:   store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]]
 // CHECK-DAG:   store i8* [[P0:%[^,]+]], i8** [[PADDR0]]
-// CHECK-DAG:   [[BP0]] = bitcast i32* %{{.+}} to i8*
-// CHECK-DAG:   [[P0]] = bitcast i32* %{{.+}} to i8*
+// CHECK-DAG:   [[BP0]] = inttoptr i[[SZ]] [[VAL0:%.+]] to i8*
+// CHECK-DAG:   [[P0]] = inttoptr i[[SZ]] [[VAL0]] to i8*

 // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 1
 // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 1
 // CHECK-DAG:   store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]]
 // CHECK-DAG:   store i8* [[P1:%[^,]+]], i8** [[PADDR1]]
-// CHECK-DAG:   [[BP1]] = bitcast i16* %{{.+}} to i8*
-// CHECK-DAG:   [[P1]] = bitcast i16* %{{.+}} to i8*
+// CHECK-DAG:   [[BP1]] = inttoptr i[[SZ]] [[VAL1:%.+]] to i8*
+// CHECK-DAG:   [[P1]] = inttoptr i[[SZ]] [[VAL1]] to i8*

 // CHECK-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[BP]], i32 0, i32 2
 // CHECK-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [4 x i8*], [4 x i8*]* [[P]], i32 0, i32 2
@ -540,15 +547,15 @@ int bar(int n){
 // CHECK-DAG:   [[PADDR0:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 0
 // CHECK-DAG:   store i8* [[BP0:%[^,]+]], i8** [[BPADDR0]]
 // CHECK-DAG:   store i8* [[P0:%[^,]+]], i8** [[PADDR0]]
-// CHECK-DAG:   [[BP0]] = bitcast i32* %{{.+}} to i8*
-// CHECK-DAG:   [[P0]] = bitcast i32* %{{.+}} to i8*
+// CHECK-DAG:   [[BP0]] = inttoptr i[[SZ]] [[VAL0:%.+]] to i8*
+// CHECK-DAG:   [[P0]] = inttoptr i[[SZ]] [[VAL0]] to i8*

 // CHECK-DAG:   [[BPADDR1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 1
 // CHECK-DAG:   [[PADDR1:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 1
 // CHECK-DAG:   store i8* [[BP1:%[^,]+]], i8** [[BPADDR1]]
 // CHECK-DAG:   store i8* [[P1:%[^,]+]], i8** [[PADDR1]]
-// CHECK-DAG:   [[BP1]] = bitcast i16* %{{.+}} to i8*
-// CHECK-DAG:   [[P1]] = bitcast i16* %{{.+}} to i8*
+// CHECK-DAG:   [[BP1]] = inttoptr i[[SZ]] [[VAL1:%.+]] to i8*
+// CHECK-DAG:   [[P1]] = inttoptr i[[SZ]] [[VAL1]] to i8*

 // CHECK-DAG:   [[BPADDR2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[BP]], i32 0, i32 2
 // CHECK-DAG:   [[PADDR2:%.+]] = getelementptr inbounds [3 x i8*], [3 x i8*]* [[P]], i32 0, i32 2
@ -580,65 +587,66 @@ int bar(int n){

 // CHECK:       define internal void [[HVT7]]
 // Create local storage for each capture.
-// CHECK-DAG:   [[LOCAL_THIS:%.+]] = alloca [[S1]]*
-// CHECK-DAG:   [[LOCAL_B:%.+]] = alloca i32*
-// CHECK-DAG:   [[LOCAL_VLA1:%.+]] = alloca i[[SZ]]*
-// CHECK-DAG:   [[LOCAL_VLA2:%.+]] = alloca i[[SZ]]*
-// CHECK-DAG:   [[LOCAL_C:%.+]] = alloca i16*
+// CHECK:       [[LOCAL_THIS:%.+]] = alloca [[S1]]*
+// CHECK:       [[LOCAL_B:%.+]] = alloca i[[SZ]]
+// CHECK:       [[LOCAL_VLA1:%.+]] = alloca i[[SZ]]
+// CHECK:       [[LOCAL_VLA2:%.+]] = alloca i[[SZ]]
+// CHECK:       [[LOCAL_C:%.+]] = alloca i16*
 // CHECK-DAG:   store [[S1]]* [[ARG_THIS:%.+]], [[S1]]** [[LOCAL_THIS]]
-// CHECK-DAG:   store i32* [[ARG_B:%.+]], i32** [[LOCAL_B]]
-// CHECK-DAG:   store i[[SZ]]* [[ARG_VLA1:%.+]], i[[SZ]]** [[LOCAL_VLA1]]
-// CHECK-DAG:   store i[[SZ]]* [[ARG_VLA2:%.+]], i[[SZ]]** [[LOCAL_VLA2]]
+// CHECK-DAG:   store i[[SZ]] [[ARG_B:%.+]], i[[SZ]]* [[LOCAL_B]]
+// CHECK-DAG:   store i[[SZ]] [[ARG_VLA1:%.+]], i[[SZ]]* [[LOCAL_VLA1]]
+// CHECK-DAG:   store i[[SZ]] [[ARG_VLA2:%.+]], i[[SZ]]* [[LOCAL_VLA2]]
 // CHECK-DAG:   store i16* [[ARG_C:%.+]], i16** [[LOCAL_C]]
 // Store captures in the context.
 // CHECK-DAG:   [[REF_THIS:%.+]] = load [[S1]]*, [[S1]]** [[LOCAL_THIS]],
-// CHECK-DAG:   [[REF_B:%.+]] = load i32*, i32** [[LOCAL_B]],
-// CHECK-DAG:   [[REF_VLA1:%.+]] = load i[[SZ]]*, i[[SZ]]** [[LOCAL_VLA1]],
-// CHECK-DAG:   [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[REF_VLA1]],
-// CHECK-DAG:   [[REF_VLA2:%.+]] = load i[[SZ]]*, i[[SZ]]** [[LOCAL_VLA2]],
-// CHECK-DAG:   [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[REF_VLA2]],
+// CHECK-64-DAG:[[REF_B:%.+]] = bitcast i[[SZ]]* [[LOCAL_B]] to i32*
+// CHECK-DAG:   [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]],
+// CHECK-DAG:   [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]],
 // CHECK-DAG:   [[REF_C:%.+]] = load i16*, i16** [[LOCAL_C]],
 // Use captures.
 // CHECK-DAG:   getelementptr inbounds [[S1]], [[S1]]* [[REF_THIS]], i32 0, i32 0
-// CHECK-DAG:   load i32, i32* [[REF_B]]
+// CHECK-64-DAG:load i32, i32* [[REF_B]]
+// CHECK-32-DAG:load i32, i32* [[LOCAL_B]]
 // CHECK-DAG:   getelementptr inbounds i16, i16* [[REF_C]], i[[SZ]] %{{.+}}


 // CHECK:       define internal void [[HVT6]]
 // Create local storage for each capture.
-// CHECK-DAG:   [[LOCAL_A:%.+]] = alloca i32*
-// CHECK-DAG:   [[LOCAL_AA:%.+]] = alloca i16*
-// CHECK-DAG:   [[LOCAL_AAA:%.+]] = alloca i8*
-// CHECK-DAG:   [[LOCAL_B:%.+]] = alloca [10 x i32]*
-// CHECK-DAG:   store i32* [[ARG_A:%.+]], i32** [[LOCAL_A]]
-// CHECK-DAG:   store i16* [[ARG_AA:%.+]], i16** [[LOCAL_AA]]
-// CHECK-DAG:   store i8* [[ARG_AAA:%.+]], i8** [[LOCAL_AAA]]
+// CHECK:       [[LOCAL_A:%.+]] = alloca i[[SZ]]
+// CHECK:       [[LOCAL_AA:%.+]] = alloca i[[SZ]]
+// CHECK:       [[LOCAL_AAA:%.+]] = alloca i[[SZ]]
+// CHECK:       [[LOCAL_B:%.+]] = alloca [10 x i32]*
+// CHECK-DAG:   store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+// CHECK-DAG:   store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
+// CHECK-DAG:   store i[[SZ]] [[ARG_AAA:%.+]], i[[SZ]]* [[LOCAL_AAA]]
 // CHECK-DAG:   store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
 // Store captures in the context.
-// CHECK-DAG:   [[REF_A:%.+]] = load i32*, i32** [[LOCAL_A]],
-// CHECK-DAG:   [[REF_AA:%.+]] = load i16*, i16** [[LOCAL_AA]],
-// CHECK-DAG:   [[REF_AAA:%.+]] = load i8*, i8** [[LOCAL_AAA]],
-// CHECK-DAG:   [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
+// CHECK-64-DAG:   [[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+// CHECK-DAG:      [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
+// CHECK-DAG:      [[REF_AAA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AAA]] to i8*
+// CHECK-DAG:      [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
 // Use captures.
-// CHECK-DAG:   load i32, i32* [[REF_A]]
-// CHECK-DAG:   load i16, i16* [[REF_AA]]
-// CHECK-DAG:   load i8, i8* [[REF_AAA]]
-// CHECK-DAG:   getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
+// CHECK-64-DAG:   load i32, i32* [[REF_A]]
+// CHECK-DAG:      load i16, i16* [[REF_AA]]
+// CHECK-DAG:      load i8, i8* [[REF_AAA]]
+// CHECK-32-DAG:   load i32, i32* [[LOCAL_A]]
+// CHECK-DAG:      getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2

 // CHECK:       define internal void [[HVT5]]
 // Create local storage for each capture.
-// CHECK-DAG:   [[LOCAL_A:%.+]] = alloca i32*
-// CHECK-DAG:   [[LOCAL_AA:%.+]] = alloca i16*
-// CHECK-DAG:   [[LOCAL_B:%.+]] = alloca [10 x i32]*
-// CHECK-DAG:   store i32* [[ARG_A:%.+]], i32** [[LOCAL_A]]
-// CHECK-DAG:   store i16* [[ARG_AA:%.+]], i16** [[LOCAL_AA]]
+// CHECK:       [[LOCAL_A:%.+]] = alloca i[[SZ]]
+// CHECK:       [[LOCAL_AA:%.+]] = alloca i[[SZ]]
+// CHECK:       [[LOCAL_B:%.+]] = alloca [10 x i32]*
+// CHECK-DAG:   store i[[SZ]] [[ARG_A:%.+]], i[[SZ]]* [[LOCAL_A]]
+// CHECK-DAG:   store i[[SZ]] [[ARG_AA:%.+]], i[[SZ]]* [[LOCAL_AA]]
 // CHECK-DAG:   store [10 x i32]* [[ARG_B:%.+]], [10 x i32]** [[LOCAL_B]]
 // Store captures in the context.
-// CHECK-DAG:   [[REF_A:%.+]] = load i32*, i32** [[LOCAL_A]],
-// CHECK-DAG:   [[REF_AA:%.+]] = load i16*, i16** [[LOCAL_AA]],
+// CHECK-64-DAG:[[REF_A:%.+]] = bitcast i[[SZ]]* [[LOCAL_A]] to i32*
+// CHECK-DAG:   [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
 // CHECK-DAG:   [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
 // Use captures.
-// CHECK-DAG:   load i32, i32* [[REF_A]]
+// CHECK-64-DAG:   load i32, i32* [[REF_A]]
+// CHECK-32-DAG:   load i32, i32* [[LOCAL_A]]
 // CHECK-DAG:   load i16, i16* [[REF_AA]]
 // CHECK-DAG:   getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
 #endif
--- a/clang/test/OpenMP/target_codegen_global_capture.cpp
+++ b/clang/test/OpenMP/target_codegen_global_capture.cpp
@ -1,9 +1,9 @@
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
-// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -emit-llvm %s -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
 // RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple i386-unknown-unknown -emit-pch -o %t %s
-// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s
+// RUN: %clang_cc1 -fopenmp -x c++ -triple i386-unknown-unknown -std=c++11 -include-pch %t -verify %s -emit-llvm -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
 // expected-no-diagnostics
 #ifndef HEADER
 #define HEADER
@ -41,27 +41,88 @@ int foo(short a, short b, short c, short d){
  static float Sc = 7.0;
  static float Sd = 8.0;

-  // CHECK-DAG: [[REFB:%.+]] = bitcast i16* [[LB]] to i8*
-  // CHECK-DAG: store i8* [[REFB]], i8** [[GEPB:%.+]], align
-  // CHECK-DAG: [[REFC:%.+]] = bitcast i16* [[LC]] to i8*
-  // CHECK-DAG: store i8* [[REFC]], i8** [[GEPC:%.+]], align
-  // CHECK-DAG: [[REFD:%.+]] = bitcast i16* [[LD]] to i8*
-  // CHECK-DAG: store i8* [[REFD]], i8** [[GEPD:%.+]], align
-  // CHECK-DAG: store i8* bitcast (double* [[GB]] to i8*), i8** [[GEPGB:%.+]], align
-  // CHECK-DAG: store i8* bitcast (double* [[GC]] to i8*), i8** [[GEPGC:%.+]], align
-  // CHECK-DAG: store i8* bitcast (double* [[GD]] to i8*), i8** [[GEPGD:%.+]], align
-  // CHECK-DAG: store i8* bitcast (float* [[FB]] to i8*), i8** [[GEPFB:%.+]], align
-  // CHECK-DAG: store i8* bitcast (float* [[FC]] to i8*), i8** [[GEPFC:%.+]], align
-  // CHECK-DAG: store i8* bitcast (float* [[FD]] to i8*), i8** [[GEPFD:%.+]], align
-  // CHECK-DAG: [[GEPB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-  // CHECK-DAG: [[GEPC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-  // CHECK-DAG: [[GEPD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-  // CHECK-DAG: [[GEPGB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-  // CHECK-DAG: [[GEPGC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-  // CHECK-DAG: [[GEPGD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-  // CHECK-DAG: [[GEPFB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-  // CHECK-DAG: [[GEPFC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-  // CHECK-DAG: [[GEPFD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
+  // CHECK-DAG:    [[VALLB:%.+]] = load i16, i16* [[LB]],
+  // CHECK-64-DAG: [[VALGB:%.+]] = load double, double* @Gb,
+  // CHECK-DAG:    [[VALFB:%.+]] = load float, float* @_ZZ3foossssE2Sb,
+  // CHECK-64-DAG: [[VALGC:%.+]] = load double, double* @Gc,
+  // CHECK-DAG:    [[VALLC:%.+]] = load i16, i16* [[LC]],
+  // CHECK-DAG:    [[VALFC:%.+]] = load float, float* @_ZZ3foossssE2Sc,
+  // CHECK-DAG:    [[VALLD:%.+]] = load i16, i16* [[LD]],
+  // CHECK-64-DAG: [[VALGD:%.+]] = load double, double* @Gd,
+  // CHECK-DAG:    [[VALFD:%.+]] = load float, float* @_ZZ3foossssE2Sd,
+
+  // 3 local vars being captured.
+
+  // CHECK-DAG: store i16 [[VALLB]], i16* [[CONVLB:%.+]],
+  // CHECK-DAG: [[CONVLB]] = bitcast i[[sz:64|32]]* [[CADDRLB:%.+]] to i16*
+  // CHECK-DAG: [[CVALLB:%.+]] = load i[[sz]], i[[sz]]* [[CADDRLB]],
+  // CHECK-DAG: [[CPTRLB:%.+]] = inttoptr i[[sz]] [[CVALLB]] to i8*
+  // CHECK-DAG: store i8* [[CPTRLB]], i8** [[GEPLB:%.+]],
+  // CHECK-DAG: [[GEPLB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+  // CHECK-DAG: store i16 [[VALLC]], i16* [[CONVLC:%.+]],
+  // CHECK-DAG: [[CONVLC]] = bitcast i[[sz]]* [[CADDRLC:%.+]] to i16*
+  // CHECK-DAG: [[CVALLC:%.+]] = load i[[sz]], i[[sz]]* [[CADDRLC]],
+  // CHECK-DAG: [[CPTRLC:%.+]] = inttoptr i[[sz]] [[CVALLC]] to i8*
+  // CHECK-DAG: store i8* [[CPTRLC]], i8** [[GEPLC:%.+]],
+  // CHECK-DAG: [[GEPLC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+  // CHECK-DAG: store i16 [[VALLD]], i16* [[CONVLD:%.+]],
+  // CHECK-DAG: [[CONVLD]] = bitcast i[[sz]]* [[CADDRLD:%.+]] to i16*
+  // CHECK-DAG: [[CVALLD:%.+]] = load i[[sz]], i[[sz]]* [[CADDRLD]],
+  // CHECK-DAG: [[CPTRLD:%.+]] = inttoptr i[[sz]] [[CVALLD]] to i8*
+  // CHECK-DAG: store i8* [[CPTRLD]], i8** [[GEPLD:%.+]],
+  // CHECK-DAG: [[GEPLD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+  // 3 static vars being captured.
+
+  // CHECK-DAG: store float [[VALFB]], float* [[CONVFB:%.+]],
+  // CHECK-DAG: [[CONVFB]] = bitcast i[[sz]]* [[CADDRFB:%.+]] to float*
+  // CHECK-DAG: [[CVALFB:%.+]] = load i[[sz]], i[[sz]]* [[CADDRFB]],
+  // CHECK-DAG: [[CPTRFB:%.+]] = inttoptr i[[sz]] [[CVALFB]] to i8*
+  // CHECK-DAG: store i8* [[CPTRFB]], i8** [[GEPFB:%.+]],
+  // CHECK-DAG: [[GEPFB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+  // CHECK-DAG: store float [[VALFC]], float* [[CONVFC:%.+]],
+  // CHECK-DAG: [[CONVFC]] = bitcast i[[sz]]* [[CADDRFC:%.+]] to float*
+  // CHECK-DAG: [[CVALFC:%.+]] = load i[[sz]], i[[sz]]* [[CADDRFC]],
+  // CHECK-DAG: [[CPTRFC:%.+]] = inttoptr i[[sz]] [[CVALFC]] to i8*
+  // CHECK-DAG: store i8* [[CPTRFC]], i8** [[GEPFC:%.+]],
+  // CHECK-DAG: [[GEPFC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+  // CHECK-DAG: store float [[VALFD]], float* [[CONVFD:%.+]],
+  // CHECK-DAG: [[CONVFD]] = bitcast i[[sz]]* [[CADDRFD:%.+]] to float*
+  // CHECK-DAG: [[CVALFD:%.+]] = load i[[sz]], i[[sz]]* [[CADDRFD]],
+  // CHECK-DAG: [[CPTRFD:%.+]] = inttoptr i[[sz]] [[CVALFD]] to i8*
+  // CHECK-DAG: store i8* [[CPTRFD]], i8** [[GEPFD:%.+]],
+  // CHECK-DAG: [[GEPFD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+  // 3 static global vars being captured.
+
+  // CHECK-64-DAG: store double [[VALGB]], double* [[CONVGB:%.+]],
+  // CHECK-64-DAG: [[CONVGB]] = bitcast i[[sz]]* [[CADDRGB:%.+]] to double*
+  // CHECK-64-DAG: [[CVALGB:%.+]] = load i[[sz]], i[[sz]]* [[CADDRGB]],
+  // CHECK-64-DAG: [[CPTRGB:%.+]] = inttoptr i[[sz]] [[CVALGB]] to i8*
+  // CHECK-64-DAG: store i8* [[CPTRGB]], i8** [[GEPGB:%.+]],
+  // CHECK-32-DAG: store i8* bitcast (double* @Gb to i8*), i8** [[GEPGB:%.+]],
+  // CHECK-DAG: [[GEPGB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+  // CHECK-64-DAG: store double [[VALGC]], double* [[CONVGC:%.+]],
+  // CHECK-64-DAG: [[CONVGC]] = bitcast i[[sz]]* [[CADDRGC:%.+]] to double*
+  // CHECK-64-DAG: [[CVALGC:%.+]] = load i[[sz]], i[[sz]]* [[CADDRGC]],
+  // CHECK-64-DAG: [[CPTRGC:%.+]] = inttoptr i[[sz]] [[CVALGC]] to i8*
+  // CHECK-64-DAG: store i8* [[CPTRGC]], i8** [[GEPGC:%.+]],
+  // CHECK-32-DAG: store i8* bitcast (double* @Gc to i8*), i8** [[GEPGC:%.+]],
+  // CHECK-DAG: [[GEPGC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+  // CHECK-64-DAG: store double [[VALGD]], double* [[CONVGD:%.+]],
+  // CHECK-64-DAG: [[CONVGD]] = bitcast i[[sz]]* [[CADDRGD:%.+]] to double*
+  // CHECK-64-DAG: [[CVALGD:%.+]] = load i[[sz]], i[[sz]]* [[CADDRGD]],
+  // CHECK-64-DAG: [[CPTRGD:%.+]] = inttoptr i[[sz]] [[CVALGD]] to i8*
+  // CHECK-64-DAG: store i8* [[CPTRGD]], i8** [[GEPGD:%.+]],
+  // CHECK-32-DAG: store i8* bitcast (double* @Gd to i8*), i8** [[GEPGD:%.+]],
+  // CHECK-DAG: [[GEPGD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
  // CHECK: call i32 @__tgt_target
  // CHECK: call void [[OFFLOADF:@.+]](
  // Capture b, Gb, Sb, Gc, c, Sc, d, Gd, Sd
@ -71,7 +132,7 @@ int foo(short a, short b, short c, short d){
    Gb += 1.0;
    Sb += 1.0;

-    // CHECK: define internal void [[OFFLOADF]]({{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}})
+    // CHECK: define internal void [[OFFLOADF]]({{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}})
    // The parallel region only uses 3 captures.
    // CHECK:     call {{.*}}@__kmpc_fork_call(%ident_t* {{.+}}, i32 {{.+}}, void (i32*, i32*, ...)* bitcast ({{.*}}[[PARF:@.+]] to {{.*}}), {{.+}}* %{{.+}}, {{.+}}* %{{.+}}, {{.+}}* %{{.+}})
    // CHECK:     call void @.omp_outlined.(i32* %{{.+}}, i32* %{{.+}}, {{.+}}* %{{.+}}, {{.+}}* %{{.+}}, {{.+}}* %{{.+}})
@ -106,45 +167,98 @@ int bar(short a, short b, short c, short d){
  // CHECK: call void {{.*}}@__kmpc_fork_call(%ident_t* {{.+}}, i32 {{.+}}, void (i32*, i32*, ...)* bitcast ({{.*}}[[PARF:@.+]] to {{.*}}), i16* %{{.+}}, i16* %{{.+}}, i16* %{{.+}}, i16* %{{.+}})
  // CHECK: define internal void [[PARF]](i32* noalias %{{.*}}, i32* noalias %{{.*}}, i16* dereferenceable(2) [[A:%.+]], i16* dereferenceable(2) [[B:%.+]], i16* dereferenceable(2) [[C:%.+]], i16* dereferenceable(2) [[D:%.+]])
  // Capture a, b, c, d
+  // CHECK: [[ALLOCLA:%.+]] = alloca i16
+  // CHECK: [[ALLOCLB:%.+]] = alloca i16
+  // CHECK: [[ALLOCLC:%.+]] = alloca i16
+  // CHECK: [[ALLOCLD:%.+]] = alloca i16
+  // CHECK: [[LLA:%.+]] = load i16*, i16** [[ALLOCLA]],
+  // CHECK: [[LLB:%.+]] = load i16*, i16** [[ALLOCLB]],
+  // CHECK: [[LLC:%.+]] = load i16*, i16** [[ALLOCLC]],
+  // CHECK: [[LLD:%.+]] = load i16*, i16** [[ALLOCLD]],
  #pragma omp parallel
  {
-    // CHECK: [[ADRA:%.+]] = alloca i16*, align
-    // CHECK: [[ADRB:%.+]] = alloca i16*, align
-    // CHECK: [[ADRC:%.+]] = alloca i16*, align
-    // CHECK: [[ADRD:%.+]] = alloca i16*, align
-    // CHECK: store i16* [[A]], i16** [[ADRA]], align
-    // CHECK: store i16* [[B]], i16** [[ADRB]], align
-    // CHECK: store i16* [[C]], i16** [[ADRC]], align
-    // CHECK: store i16* [[D]], i16** [[ADRD]], align
-    // CHECK: [[REFA:%.+]] = load i16*, i16** [[ADRA]],
-    // CHECK: [[REFB:%.+]] = load i16*, i16** [[ADRB]],
-    // CHECK: [[REFC:%.+]] = load i16*, i16** [[ADRC]],
-    // CHECK: [[REFD:%.+]] = load i16*, i16** [[ADRD]],
+    // CHECK-DAG:    [[VALLB:%.+]] = load i16, i16* [[LLB]],
+    // CHECK-64-DAG: [[VALGB:%.+]] = load double, double* @Gb,
+    // CHECK-DAG:    [[VALFB:%.+]] = load float, float* @_ZZ3barssssE2Sb,
+    // CHECK-64-DAG: [[VALGC:%.+]] = load double, double* @Gc,
+    // CHECK-DAG:    [[VALLC:%.+]] = load i16, i16* [[LLC]],
+    // CHECK-DAG:    [[VALFC:%.+]] = load float, float* @_ZZ3barssssE2Sc,
+    // CHECK-DAG:    [[VALLD:%.+]] = load i16, i16* [[LLD]],
+    // CHECK-64-DAG: [[VALGD:%.+]] = load double, double* @Gd,
+    // CHECK-DAG:    [[VALFD:%.+]] = load float, float* @_ZZ3barssssE2Sd,

-    // CHECK: load float, float* [[BA]]
+    // 3 local vars being captured.

-    // CHECK-DAG: [[CSTB:%.+]] = bitcast i16* [[REFB]] to i8*
-    // CHECK-DAG: [[CSTC:%.+]] = bitcast i16* [[REFC]] to i8*
-    // CHECK-DAG: [[CSTD:%.+]] = bitcast i16* [[REFD]] to i8*
-    // CHECK-DAG: store i8* [[CSTB]], i8** [[GEPB:%.+]], align
-    // CHECK-DAG: store i8* [[CSTC]], i8** [[GEPC:%.+]], align
-    // CHECK-DAG: store i8* [[CSTD]], i8** [[GEPD:%.+]], align
-    // CHECK-DAG: store i8* bitcast (double* [[GB]] to i8*), i8** [[GEPGB:%.+]], align
-    // CHECK-DAG: store i8* bitcast (double* [[GC]] to i8*), i8** [[GEPGC:%.+]], align
-    // CHECK-DAG: store i8* bitcast (double* [[GD]] to i8*), i8** [[GEPGD:%.+]], align
-    // CHECK-DAG: store i8* bitcast (float* [[BB]] to i8*), i8** [[GEPBB:%.+]], align
-    // CHECK-DAG: store i8* bitcast (float* [[BC]] to i8*), i8** [[GEPBC:%.+]], align
-    // CHECK-DAG: store i8* bitcast (float* [[BD]] to i8*), i8** [[GEPBD:%.+]], align
+    // CHECK-DAG: store i16 [[VALLB]], i16* [[CONVLB:%.+]],
+    // CHECK-DAG: [[CONVLB]] = bitcast i[[sz:64|32]]* [[CADDRLB:%.+]] to i16*
+    // CHECK-DAG: [[CVALLB:%.+]] = load i[[sz]], i[[sz]]* [[CADDRLB]],
+    // CHECK-DAG: [[CPTRLB:%.+]] = inttoptr i[[sz]] [[CVALLB]] to i8*
+    // CHECK-DAG: store i8* [[CPTRLB]], i8** [[GEPLB:%.+]],
+    // CHECK-DAG: [[GEPLB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-DAG: store i16 [[VALLC]], i16* [[CONVLC:%.+]],
+    // CHECK-DAG: [[CONVLC]] = bitcast i[[sz]]* [[CADDRLC:%.+]] to i16*
+    // CHECK-DAG: [[CVALLC:%.+]] = load i[[sz]], i[[sz]]* [[CADDRLC]],
+    // CHECK-DAG: [[CPTRLC:%.+]] = inttoptr i[[sz]] [[CVALLC]] to i8*
+    // CHECK-DAG: store i8* [[CPTRLC]], i8** [[GEPLC:%.+]],
+    // CHECK-DAG: [[GEPLC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-DAG: store i16 [[VALLD]], i16* [[CONVLD:%.+]],
+    // CHECK-DAG: [[CONVLD]] = bitcast i[[sz]]* [[CADDRLD:%.+]] to i16*
+    // CHECK-DAG: [[CVALLD:%.+]] = load i[[sz]], i[[sz]]* [[CADDRLD]],
+    // CHECK-DAG: [[CPTRLD:%.+]] = inttoptr i[[sz]] [[CVALLD]] to i8*
+    // CHECK-DAG: store i8* [[CPTRLD]], i8** [[GEPLD:%.+]],
+    // CHECK-DAG: [[GEPLD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // 3 static vars being captured.
+
+    // CHECK-DAG: store float [[VALFB]], float* [[CONVFB:%.+]],
+    // CHECK-DAG: [[CONVFB]] = bitcast i[[sz]]* [[CADDRFB:%.+]] to float*
+    // CHECK-DAG: [[CVALFB:%.+]] = load i[[sz]], i[[sz]]* [[CADDRFB]],
+    // CHECK-DAG: [[CPTRFB:%.+]] = inttoptr i[[sz]] [[CVALFB]] to i8*
+    // CHECK-DAG: store i8* [[CPTRFB]], i8** [[GEPFB:%.+]],
+    // CHECK-DAG: [[GEPFB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-DAG: store float [[VALFC]], float* [[CONVFC:%.+]],
+    // CHECK-DAG: [[CONVFC]] = bitcast i[[sz]]* [[CADDRFC:%.+]] to float*
+    // CHECK-DAG: [[CVALFC:%.+]] = load i[[sz]], i[[sz]]* [[CADDRFC]],
+    // CHECK-DAG: [[CPTRFC:%.+]] = inttoptr i[[sz]] [[CVALFC]] to i8*
+    // CHECK-DAG: store i8* [[CPTRFC]], i8** [[GEPFC:%.+]],
+    // CHECK-DAG: [[GEPFC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-DAG: store float [[VALFD]], float* [[CONVFD:%.+]],
+    // CHECK-DAG: [[CONVFD]] = bitcast i[[sz]]* [[CADDRFD:%.+]] to float*
+    // CHECK-DAG: [[CVALFD:%.+]] = load i[[sz]], i[[sz]]* [[CADDRFD]],
+    // CHECK-DAG: [[CPTRFD:%.+]] = inttoptr i[[sz]] [[CVALFD]] to i8*
+    // CHECK-DAG: store i8* [[CPTRFD]], i8** [[GEPFD:%.+]],
+    // CHECK-DAG: [[GEPFD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // 3 static global vars being captured.
+
+    // CHECK-64-DAG: store double [[VALGB]], double* [[CONVGB:%.+]],
+    // CHECK-64-DAG: [[CONVGB]] = bitcast i[[sz]]* [[CADDRGB:%.+]] to double*
+    // CHECK-64-DAG: [[CVALGB:%.+]] = load i[[sz]], i[[sz]]* [[CADDRGB]],
+    // CHECK-64-DAG: [[CPTRGB:%.+]] = inttoptr i[[sz]] [[CVALGB]] to i8*
+    // CHECK-64-DAG: store i8* [[CPTRGB]], i8** [[GEPGB:%.+]],
+    // CHECK-32-DAG: store i8* bitcast (double* @Gb to i8*), i8** [[GEPGB:%.+]],
+    // CHECK-DAG: [[GEPGB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-64-DAG: store double [[VALGC]], double* [[CONVGC:%.+]],
+    // CHECK-64-DAG: [[CONVGC]] = bitcast i[[sz]]* [[CADDRGC:%.+]] to double*
+    // CHECK-64-DAG: [[CVALGC:%.+]] = load i[[sz]], i[[sz]]* [[CADDRGC]],
+    // CHECK-64-DAG: [[CPTRGC:%.+]] = inttoptr i[[sz]] [[CVALGC]] to i8*
+    // CHECK-64-DAG: store i8* [[CPTRGC]], i8** [[GEPGC:%.+]],
+    // CHECK-32-DAG: store i8* bitcast (double* @Gc to i8*), i8** [[GEPGC:%.+]],
+    // CHECK-DAG: [[GEPGC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}
+
+    // CHECK-64-DAG: store double [[VALGD]], double* [[CONVGD:%.+]],
+    // CHECK-64-DAG: [[CONVGD]] = bitcast i[[sz]]* [[CADDRGD:%.+]] to double*
+    // CHECK-64-DAG: [[CVALGD:%.+]] = load i[[sz]], i[[sz]]* [[CADDRGD]],
+    // CHECK-64-DAG: [[CPTRGD:%.+]] = inttoptr i[[sz]] [[CVALGD]] to i8*
+    // CHECK-64-DAG: store i8* [[CPTRGD]], i8** [[GEPGD:%.+]],
+    // CHECK-32-DAG: store i8* bitcast (double* @Gd to i8*), i8** [[GEPGD:%.+]],
+    // CHECK-DAG: [[GEPGD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{[0-8]}}

-    // CHECK-DAG: [[GEPB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-    // CHECK-DAG: [[GEPC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-    // CHECK-DAG: [[GEPD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-    // CHECK-DAG: [[GEPGB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-    // CHECK-DAG: [[GEPGC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-    // CHECK-DAG: [[GEPGD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-    // CHECK-DAG: [[GEPBB]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-    // CHECK-DAG: [[GEPBC]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
-    // CHECK-DAG: [[GEPBD]] = getelementptr inbounds [9 x i8*], [9 x i8*]* %{{.+}}, i32 0, i32 {{.+}}
    // CHECK: call i32 @__tgt_target
    // CHECK: call void [[OFFLOADF:@.+]](
    // Capture b, Gb, Sb, Gc, c, Sc, d, Gd, Sd
@ -154,7 +268,7 @@ int bar(short a, short b, short c, short d){
      Gb += 1.0;
      Sb += 1.0;

-      // CHECK: define internal void [[OFFLOADF]]({{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}}, {{.+}}* {{.*}}%{{.+}})
+      // CHECK: define internal void [[OFFLOADF]]({{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}}, {{.+}} {{.*}}%{{.+}})
      // CHECK: call void {{.*}}@__kmpc_fork_call(%ident_t* {{.+}}, i32 {{.+}}, void (i32*, i32*, ...)* bitcast ({{.*}}[[PARF:@.+]] to {{.*}})

      // CHECK: define internal void [[PARF]](i32* noalias %{{.*}}, i32* noalias %{{.*}}, {{.+}}* dereferenceable({{.+}}) %{{.+}}, {{.+}}* dereferenceable({{.+}}) %{{.+}}, {{.+}}* dereferenceable({{.+}}) %{{.+}})
--- a/clang/test/OpenMP/target_map_codegen.cpp
+++ b/clang/test/OpenMP/target_map_codegen.cpp