Revert r273313 "[NVPTX] Improve lowering of byval args of device functions."

The change causes llvm crash in some unoptimized builds. llvm-svn: 274163
2016-06-29 20:51:15 +00:00 · 2016-06-29 20:51:15 +00:00 · 4d5d7be8cc
parent a5da256f92
commit 4d5d7be8cc
5 changed files with 23 additions and 105 deletions
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@ -662,58 +662,6 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
          TM.is64Bit() ? NVPTX::cvta_to_local_yes_64 : NVPTX::cvta_to_local_yes;
      break;
    case ADDRESS_SPACE_PARAM:
-      if (Src.getOpcode() == NVPTXISD::MoveParam) {
-        // TL;DR: addrspacecast MoveParam to param space is a no-op.
-        //
-        // Longer version is that this particular change is both an
-        // optimization and a work-around for a problem in ptxas for
-        // sm_50+.
-        //
-        // Background:
-        // According to PTX ISA v7.5 5.1.6.4: "...whenever a formal
-        // parameter has its address taken within the called function
-        // [..] the parameter will be copied to the stack if
-        // necessary, and so the address will be in the .local state
-        // space and is accessed via ld.local and st.local
-        // instructions."
-        //
-        // Bug part: 'copied to the stack if necessary' part is
-        // currently broken for byval arguments with alignment < 4 for
-        // sm_50+ in all public versions of CUDA. Taking the address of
-        // such an argument results in SASS code that attempts to store
-        // the value using a 32-bit write to an unaligned address.
-        //
-        // Optimization: On top of the overhead of spill-to-stack, we
-        // also convert that address from local to generic space,
-        // which may result in further overhead. All of it is in most
-        // cases unnecessary, as we only need the value of the
-        // variable, and it can be accessed directly by using the
-        // argument symbol. We'll use the address of the local copy if
-        // it's needed (see step (a) below).
-        //
-        // In order for this bit to do its job we need to:
-        //
-        // a) Let LLVM do the spilling and make sure the IR does not
-        // access byval arguments directly. Instead, the argument
-        // pointer (which is in the default address space) is
-        // addrspacecast'ed to param space, and the data it points to
-        // is copied to allocated local space (i.e. we let compiler
-        // spill it, which gives us an opportunity to optimize the
-        // spill away later if nobody needs its address). Then all
-        // uses of arg pointer are replaced with a pointer to local
-        // copy of the arg. All this is done in NVPTXLowerKernelArgs.
-        //
-        // b) LowerFormalArguments() lowers the argument to
-        // NVPTXISD::MoveParam without any space conversion.
-        //
-        // c) And the final step is done by the code below.
-        // It replaces the addrspacecast (MoveParam) from step (a)
-        // with the arg symbol itself. This can then be used for
-        // [symbol + offset] addressing.
-
-        ReplaceNode(N, Src.getOperand(0).getNode());
-        return;
-      }
      Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64
                         : NVPTX::nvvm_ptr_gen_to_param;
      break;
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@ -2078,6 +2078,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
  SDValue Root = DAG.getRoot();
  std::vector<SDValue> OutChains;

+  bool isKernel = llvm::isKernelFunction(*F);
  bool isABI = (STI.getSmVersion() >= 20);
  assert(isABI && "Non-ABI compilation is not supported");
  if (!isABI)
@ -2111,8 +2112,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
            theArgs[i],
            (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
                                     : nullptr))) {
-      assert(llvm::isKernelFunction(*F) &&
-             "Only kernels can have image/sampler params");
+      assert(isKernel && "Only kernels can have image/sampler params");
      InVals.push_back(DAG.getConstant(i + 1, dl, MVT::i32));
      continue;
    }
@ -2334,11 +2334,6 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
    // machine instruction fails because TargetExternalSymbol
    // (not lowered) is target dependent, and CopyToReg assumes
    // the source is lowered.
-    //
-    // Byval arguments for regular function are lowered the same way
-    // as for kernels in order to generate better code and work around
-    // a known issue in ptxas. See comments in
-    // NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the gory details.
    EVT ObjectVT = getValueType(DL, Ty);
    assert(ObjectVT == Ins[InsIdx].VT &&
           "Ins type did not match function type");
@ -2346,7 +2341,14 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
    SDValue p = DAG.getNode(NVPTXISD::MoveParam, dl, ObjectVT, Arg);
    if (p.getNode())
      p.getNode()->setIROrder(idx + 1);
-    InVals.push_back(p);
+    if (isKernel)
+      InVals.push_back(p);
+    else {
+      SDValue p2 = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, dl, ObjectVT,
+          DAG.getConstant(Intrinsic::nvvm_ptr_local_to_gen, dl, MVT::i32), p);
+      InVals.push_back(p2);
+    }
  }

  // Clang will check explicit VarArg and issue error if any. However, Clang
--- a/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerKernelArgs.cpp
@ -101,11 +101,6 @@ namespace {
 class NVPTXLowerKernelArgs : public FunctionPass {
  bool runOnFunction(Function &F) override;

-  // Kernels and regular device functions treat byval arguments
-  // differently.
-  bool runOnKernelFunction(Function &F);
-  bool runOnDeviceFunction(Function &F);
-
  // handle byval parameters
  void handleByValParam(Argument *Arg);
  // Knowing Ptr must point to the global address space, this function
@ -197,7 +192,11 @@ void NVPTXLowerKernelArgs::markPointerAsGlobal(Value *Ptr) {
 // =============================================================================
 // Main function for this pass.
 // =============================================================================
-bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) {
+bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
+  // Skip non-kernels. See the comments at the top of this file.
+  if (!isKernelFunction(F))
+    return false;
+
  if (TM && TM->getDrvInterface() == NVPTX::CUDA) {
    // Mark pointers in byval structs as global.
    for (auto &B : F) {
@ -229,19 +228,6 @@ bool NVPTXLowerKernelArgs::runOnKernelFunction(Function &F) {
  return true;
 }

-// See comments in NVPTXDAGToDAGISel::SelectAddrSpaceCast() for the
-// details on why we need to spill byval arguments into local memory.
-bool NVPTXLowerKernelArgs::runOnDeviceFunction(Function &F) {
-  for (Argument &Arg : F.args())
-    if (Arg.getType()->isPointerTy() && Arg.hasByValAttr())
-      handleByValParam(&Arg);
-  return true;
-}
-
-bool NVPTXLowerKernelArgs::runOnFunction(Function &F) {
-  return isKernelFunction(F) ? runOnKernelFunction(F) : runOnDeviceFunction(F);
-}
-
 FunctionPass *
 llvm::createNVPTXLowerKernelArgsPass(const NVPTXTargetMachine *TM) {
  return new NVPTXLowerKernelArgs(TM);
--- a/llvm/test/CodeGen/NVPTX/bug21465.ll
+++ b/llvm/test/CodeGen/NVPTX/bug21465.ll
@ -15,7 +15,7 @@ entry:
  %b = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
  %0 = load i32, i32* %b, align 4
 ; PTX-NOT: ld.param.u32 {{%r[0-9]+}}, [{{%rd[0-9]+}}]
-; PTX: ld.param.u32 [[value:%r[0-9]+]], [_Z11TakesStruct1SPi_param_0+4]
+; PTX: ld.param.u32 [[value:%r[0-9]+]], [{{%rd[0-9]+}}+4]
  store i32 %0, i32* %output, align 4
 ; PTX-NEXT: st.global.u32 [{{%rd[0-9]+}}], [[value]]
  ret void
--- a/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-kernel-ptr-arg.ll
@ -28,38 +28,20 @@ define void @kernel2(float addrspace(1)* %input, float addrspace(1)* %output) {

 %struct.S = type { i32*, i32* }

-define void @ptr_in_byval_kernel(%struct.S* byval %input, i32* %output) {
-; CHECK-LABEL: .visible .entry ptr_in_byval_kernel(
-; CHECK: ld.param.u64 	%[[optr:rd.*]], [ptr_in_byval_kernel_param_1]
-; CHECK: cvta.to.global.u64 %[[optr_g:.*]], %[[optr]];
-; CHECK: ld.param.u64 	%[[iptr:rd.*]], [ptr_in_byval_kernel_param_0+8]
-; CHECK: cvta.to.global.u64 %[[iptr_g:.*]], %[[iptr]];
+define void @ptr_in_byval(%struct.S* byval %input, i32* %output) {
+; CHECK-LABEL: .visible .entry ptr_in_byval(
+; CHECK: cvta.to.global.u64
+; CHECK: cvta.to.global.u64
  %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
  %b = load i32*, i32** %b_ptr, align 4
  %v = load i32, i32* %b, align 4
-; CHECK: ld.global.u32 %[[val:.*]], [%[[iptr_g]]]
+; CHECK: ld.global.u32
  store i32 %v, i32* %output, align 4
-; CHECK: st.global.u32 [%[[optr_g]]], %[[val]]
-  ret void
-}
-
-; Regular functions lower byval arguments differently. We need to make
-; sure that we're loading byval argument data using [symbol+offset].
-; There's also no assumption that all pointers within are in global space.
-define void @ptr_in_byval_func(%struct.S* byval %input, i32* %output) {
-; CHECK-LABEL: .visible .func ptr_in_byval_func(
-; CHECK: ld.param.u64 	%[[optr:rd.*]], [ptr_in_byval_func_param_1]
-; CHECK: ld.param.u64 	%[[iptr:rd.*]], [ptr_in_byval_func_param_0+8]
-  %b_ptr = getelementptr inbounds %struct.S, %struct.S* %input, i64 0, i32 1
-  %b = load i32*, i32** %b_ptr, align 4
-  %v = load i32, i32* %b, align 4
-; CHECK: ld.u32 %[[val:.*]], [%[[iptr]]]
-  store i32 %v, i32* %output, align 4
-; CHECK: st.u32 [%[[optr]]], %[[val]]
+; CHECK: st.global.u32
  ret void
 }

 !nvvm.annotations = !{!0, !1, !2}
 !0 = !{void (float*, float*)* @kernel, !"kernel", i32 1}
 !1 = !{void (float addrspace(1)*, float addrspace(1)*)* @kernel2, !"kernel", i32 1}
-!2 = !{void (%struct.S*, i32*)* @ptr_in_byval_kernel, !"kernel", i32 1}
+!2 = !{void (%struct.S*, i32*)* @ptr_in_byval, !"kernel", i32 1}