diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp index 810e0e013e83..c3391d087b75 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp @@ -56,6 +56,16 @@ enum OpenMPRTLFunctionNVPTX { /// lane_offset, int16_t shortCircuit), /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num)); OMPRTL_NVPTX__kmpc_parallel_reduce_nowait, + /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, + /// int32_t num_vars, size_t reduce_size, void *reduce_data, + /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t + /// lane_offset, int16_t shortCircuit), + /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), + /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad, + /// int32_t index, int32_t width), + /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t + /// index, int32_t width, int32_t reduce)) + OMPRTL_NVPTX__kmpc_teams_reduce_nowait, /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); OMPRTL_NVPTX__kmpc_end_reduce_nowait }; @@ -125,6 +135,9 @@ enum MachineConfiguration : unsigned { /// computed as log_2(WarpSize). LaneIDBits = 5, LaneIDMask = WarpSize - 1, + + /// Global memory alignment for performance. + GlobalMemoryAlignment = 256, }; enum NamedBarrier : unsigned { @@ -694,6 +707,49 @@ CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait"); break; } + case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: { + // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, + // int32_t num_vars, size_t reduce_size, void *reduce_data, + // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t + // lane_offset, int16_t shortCircuit), + // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num), + // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad, + // int32_t index, int32_t width), + // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, + // int32_t index, int32_t width, int32_t reduce)) + llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty, + CGM.Int16Ty, CGM.Int16Ty}; + auto *ShuffleReduceFnTy = + llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams, + /*isVarArg=*/false); + llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty}; + auto *InterWarpCopyFnTy = + llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams, + /*isVarArg=*/false); + llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy, + CGM.Int32Ty, CGM.Int32Ty}; + auto *CopyToScratchpadFnTy = + llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams, + /*isVarArg=*/false); + llvm::Type *LoadReduceTypeParams[] = { + CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty}; + auto *LoadReduceFnTy = + llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams, + /*isVarArg=*/false); + llvm::Type *TypeParams[] = {CGM.Int32Ty, + CGM.Int32Ty, + CGM.SizeTy, + CGM.VoidPtrTy, + ShuffleReduceFnTy->getPointerTo(), + InterWarpCopyFnTy->getPointerTo(), + CopyToScratchpadFnTy->getPointerTo(), + LoadReduceFnTy->getPointerTo()}; + llvm::FunctionType *FnTy = + llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false); + RTLFn = CGM.CreateRuntimeFunction( + FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait"); + break; + } case OMPRTL_NVPTX__kmpc_end_reduce_nowait: { // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid); llvm::Type *TypeParams[] = {CGM.Int32Ty}; @@ -966,24 +1022,39 @@ enum CopyAction : unsigned { RemoteLaneToThread, // ThreadCopy: Make a copy of a Reduce list on the thread's stack. ThreadCopy, + // ThreadToScratchpad: Copy a team-reduced array to the scratchpad. + ThreadToScratchpad, + // ScratchpadToThread: Copy from a scratchpad array in global memory + // containing team-reduced data to a thread's stack. + ScratchpadToThread, }; } // namespace +struct CopyOptionsTy { + llvm::Value *RemoteLaneOffset; + llvm::Value *ScratchpadIndex; + llvm::Value *ScratchpadWidth; +}; + /// Emit instructions to copy a Reduce list, which contains partially /// aggregated values, in the specified direction. -static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, - QualType ReductionArrayTy, - ArrayRef Privates, - Address SrcBase, Address DestBase, - llvm::Value *RemoteLaneOffset = nullptr) { +static void emitReductionListCopy( + CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy, + ArrayRef Privates, Address SrcBase, Address DestBase, + CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) { auto &CGM = CGF.CGM; auto &C = CGM.getContext(); auto &Bld = CGF.Builder; + auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset; + auto *ScratchpadIndex = CopyOptions.ScratchpadIndex; + auto *ScratchpadWidth = CopyOptions.ScratchpadWidth; + // Iterates, element-by-element, through the source Reduce list and // make a copy. unsigned Idx = 0; + unsigned Size = Privates.size(); for (auto &Private : Privates) { Address SrcElementAddr = Address::invalid(); Address DestElementAddr = Address::invalid(); @@ -993,6 +1064,10 @@ static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, // Set to true to update the pointer in the dest Reduce list to a // newly created element. bool UpdateDestListPtr = false; + // Increment the src or dest pointer to the scratchpad, for each + // new element. + bool IncrScratchpadSrc = false; + bool IncrScratchpadDest = false; switch (Action) { case RemoteLaneToThread: { @@ -1036,6 +1111,59 @@ static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, DestElemAddr, CGF.ConvertTypeForMem(Private->getType())); break; } + case ThreadToScratchpad: { + // Step 1.1: Get the address for the src element in the Reduce list. + Address SrcElementPtrAddr = + Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize()); + llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar( + SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + SrcElementAddr = + Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType())); + + // Step 1.2: Get the address for dest element: + // address = base + index * ElementSizeInChars. + unsigned ElementSizeInChars = + C.getTypeSizeInChars(Private->getType()).getQuantity(); + auto *CurrentOffset = + Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars), + ScratchpadIndex); + auto *ScratchPadElemAbsolutePtrVal = + Bld.CreateAdd(DestBase.getPointer(), CurrentOffset); + ScratchPadElemAbsolutePtrVal = + Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); + Address ScratchpadPtr = + Address(ScratchPadElemAbsolutePtrVal, + C.getTypeAlignInChars(Private->getType())); + DestElementAddr = Bld.CreateElementBitCast( + ScratchpadPtr, CGF.ConvertTypeForMem(Private->getType())); + IncrScratchpadDest = true; + break; + } + case ScratchpadToThread: { + // Step 1.1: Get the address for the src element in the scratchpad. + // address = base + index * ElementSizeInChars. + unsigned ElementSizeInChars = + C.getTypeSizeInChars(Private->getType()).getQuantity(); + auto *CurrentOffset = + Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars), + ScratchpadIndex); + auto *ScratchPadElemAbsolutePtrVal = + Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset); + ScratchPadElemAbsolutePtrVal = + Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy); + SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal, + C.getTypeAlignInChars(Private->getType())); + IncrScratchpadSrc = true; + + // Step 1.2: Create a temporary to store the element in the destination + // Reduce list. + DestElementPtrAddr = + Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize()); + DestElementAddr = + CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element"); + UpdateDestListPtr = true; + break; + } } // Regardless of src and dest of copy, we emit the load of src @@ -1069,10 +1197,262 @@ static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF, C.VoidPtrTy); } + // Step 4.1: Increment SrcBase/DestBase so that it points to the starting + // address of the next element in scratchpad memory, unless we're currently + // processing the last one. Memory alignment is also taken care of here. + if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) { + llvm::Value *ScratchpadBasePtr = + IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer(); + unsigned ElementSizeInChars = + C.getTypeSizeInChars(Private->getType()).getQuantity(); + ScratchpadBasePtr = Bld.CreateAdd( + ScratchpadBasePtr, + Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get( + CGM.SizeTy, ElementSizeInChars))); + + // Take care of global memory alignment for performance + ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr, + llvm::ConstantInt::get(CGM.SizeTy, 1)); + ScratchpadBasePtr = Bld.CreateSDiv( + ScratchpadBasePtr, + llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); + ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr, + llvm::ConstantInt::get(CGM.SizeTy, 1)); + ScratchpadBasePtr = Bld.CreateMul( + ScratchpadBasePtr, + llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment)); + + if (IncrScratchpadDest) + DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign()); + else /* IncrScratchpadSrc = true */ + SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign()); + } + Idx++; } } +/// This function emits a helper that loads data from the scratchpad array +/// and (optionally) reduces it with the input operand. +/// +/// load_and_reduce(local, scratchpad, index, width, should_reduce) +/// reduce_data remote; +/// for elem in remote: +/// remote.elem = Scratchpad[elem_id][index] +/// if (should_reduce) +/// local = local @ remote +/// else +/// local = remote +llvm::Value *emitReduceScratchpadFunction(CodeGenModule &CGM, + ArrayRef Privates, + QualType ReductionArrayTy, + llvm::Value *ReduceFn) { + auto &C = CGM.getContext(); + auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true); + + // Destination of the copy. + ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.VoidPtrTy); + // Base address of the scratchpad array, with each element storing a + // Reduce list per team. + ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.VoidPtrTy); + // A source index into the scratchpad array. + ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, Int32Ty); + // Row width of an element in the scratchpad array, typically + // the number of teams. + ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, Int32Ty); + // If should_reduce == 1, then it's load AND reduce, + // If should_reduce == 0 (or otherwise), then it only loads (+ copy). + // The latter case is used for initialization. + ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, Int32Ty); + + FunctionArgList Args; + Args.push_back(&ReduceListArg); + Args.push_back(&ScratchPadArg); + Args.push_back(&IndexArg); + Args.push_back(&WidthArg); + Args.push_back(&ShouldReduceArg); + + auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + "_omp_reduction_load_and_reduce", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI); + CodeGenFunction CGF(CGM); + // We don't need debug information in this function as nothing here refers to + // user code. + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args); + + auto &Bld = CGF.Builder; + + // Get local Reduce list pointer. + Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); + Address ReduceListAddr( + Bld.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, + C.VoidPtrTy, SourceLocation()), + CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), + CGF.getPointerAlign()); + + Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg); + llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar( + AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + + Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg); + llvm::Value *IndexVal = + Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, + Int32Ty, SourceLocation()), + CGM.SizeTy, /*isSigned=*/true); + + Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg); + llvm::Value *WidthVal = + Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, + Int32Ty, SourceLocation()), + CGM.SizeTy, /*isSigned=*/true); + + Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg); + llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar( + AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, SourceLocation()); + + // The absolute ptr address to the base addr of the next element to copy. + llvm::Value *CumulativeElemBasePtr = + Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy); + Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign()); + + // Create a Remote Reduce list to store the elements read from the + // scratchpad array. + Address RemoteReduceList = + CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list"); + + // Assemble remote Reduce list from scratchpad array. + emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates, + SrcDataAddr, RemoteReduceList, + {/*RemoteLaneOffset=*/nullptr, + /*ScratchpadIndex=*/IndexVal, + /*ScratchpadWidth=*/WidthVal}); + + llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then"); + llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else"); + llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont"); + + auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1)); + Bld.CreateCondBr(CondReduce, ThenBB, ElseBB); + + CGF.EmitBlock(ThenBB); + // We should reduce with the local Reduce list. + // reduce_function(LocalReduceList, RemoteReduceList) + llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + ReduceListAddr.getPointer(), CGF.VoidPtrTy); + llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast( + RemoteReduceList.getPointer(), CGF.VoidPtrTy); + CGF.EmitCallOrInvoke(ReduceFn, {LocalDataPtr, RemoteDataPtr}); + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(ElseBB); + // No reduction; just copy: + // Local Reduce list = Remote Reduce list. + emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates, + RemoteReduceList, ReduceListAddr); + Bld.CreateBr(MergeBB); + + CGF.EmitBlock(MergeBB); + + CGF.FinishFunction(); + return Fn; +} + +/// This function emits a helper that stores reduced data from the team +/// master to a scratchpad array in global memory. +/// +/// for elem in Reduce List: +/// scratchpad[elem_id][index] = elem +/// +llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM, + ArrayRef Privates, + QualType ReductionArrayTy) { + + auto &C = CGM.getContext(); + auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true); + + // Source of the copy. + ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.VoidPtrTy); + // Base address of the scratchpad array, with each element storing a + // Reduce list per team. + ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, C.VoidPtrTy); + // A destination index into the scratchpad array, typically the team + // identifier. + ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, Int32Ty); + // Row width of an element in the scratchpad array, typically + // the number of teams. + ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, SourceLocation(), + /*Id=*/nullptr, Int32Ty); + + FunctionArgList Args; + Args.push_back(&ReduceListArg); + Args.push_back(&ScratchPadArg); + Args.push_back(&IndexArg); + Args.push_back(&WidthArg); + + auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args); + auto *Fn = llvm::Function::Create( + CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage, + "_omp_reduction_copy_to_scratchpad", &CGM.getModule()); + CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI); + CodeGenFunction CGF(CGM); + // We don't need debug information in this function as nothing here refers to + // user code. + CGF.disableDebugInfo(); + CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args); + + auto &Bld = CGF.Builder; + + Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg); + Address SrcDataAddr( + Bld.CreatePointerBitCastOrAddrSpaceCast( + CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false, + C.VoidPtrTy, SourceLocation()), + CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()), + CGF.getPointerAlign()); + + Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg); + llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar( + AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation()); + + Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg); + llvm::Value *IndexVal = + Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, + Int32Ty, SourceLocation()), + CGF.SizeTy, /*isSigned=*/true); + + Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg); + llvm::Value *WidthVal = + Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, + Int32Ty, SourceLocation()), + CGF.SizeTy, /*isSigned=*/true); + + // The absolute ptr address to the base addr of the next element to copy. + llvm::Value *CumulativeElemBasePtr = + Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy); + Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign()); + + emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates, + SrcDataAddr, DestDataAddr, + {/*RemoteLaneOffset=*/nullptr, + /*ScratchpadIndex=*/IndexVal, + /*ScratchpadWidth=*/WidthVal}); + + CGF.FinishFunction(); + return Fn; +} + /// This function emits a helper that gathers Reduce lists from the first /// lane of every active warp to lanes in the first warp. /// @@ -1402,7 +1782,9 @@ emitShuffleAndReduceFunction(CodeGenModule &CGM, // hosted on the thread's stack. emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates, LocalReduceList, RemoteReduceList, - RemoteLaneOffsetArgVal); + {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal, + /*ScratchpadIndex=*/nullptr, + /*ScratchpadWidth=*/nullptr}); // The actions to be performed on the Remote Reduce list is dependent // on the algorithm version. @@ -1575,6 +1957,23 @@ emitShuffleAndReduceFunction(CodeGenModule &CGM, /// reduced variables across warps. It tunnels, through CUDA /// shared memory, the thread-private data of type 'ReduceData' /// from lane 0 of each warp to a lane in the first warp. +/// 4. Call the OpenMP runtime on the GPU to reduce across teams. +/// The last team writes the global reduced value to memory. +/// +/// ret = __kmpc_nvptx_teams_reduce_nowait(..., +/// reduceData, shuffleReduceFn, interWarpCpyFn, +/// scratchpadCopyFn, loadAndReduceFn) +/// +/// 'scratchpadCopyFn' is a helper that stores reduced +/// data from the team master to a scratchpad array in +/// global memory. +/// +/// 'loadAndReduceFn' is a helper that loads data from +/// the scratchpad array and reduces it with the input +/// operand. +/// +/// These compiler generated functions hide address +/// calculation and alignment information from the runtime. /// 5. if ret == 1: /// The team master of the last team stores the reduced /// result to the globals in memory. @@ -1696,6 +2095,21 @@ emitShuffleAndReduceFunction(CodeGenModule &CGM, /// a mathematical sense) the problem of reduction across warp masters in /// a block to the problem of warp reduction. /// +/// +/// Inter-Team Reduction +/// +/// Once a team has reduced its data to a single value, it is stored in +/// a global scratchpad array. Since each team has a distinct slot, this +/// can be done without locking. +/// +/// The last team to write to the scratchpad array proceeds to reduce the +/// scratchpad array. One or more workers in the last team use the helper +/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e., +/// the k'th worker reduces every k'th element. +/// +/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to +/// reduce across workers and compute a globally reduced value. +/// void CGOpenMPRuntimeNVPTX::emitReduction( CodeGenFunction &CGF, SourceLocation Loc, ArrayRef Privates, ArrayRef LHSExprs, ArrayRef RHSExprs, @@ -1704,7 +2118,10 @@ void CGOpenMPRuntimeNVPTX::emitReduction( return; bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind); - assert(ParallelReduction && "Invalid reduction selection in emitReduction."); + bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind); + // FIXME: Add support for simd reduction. + assert((TeamsReduction || ParallelReduction) && + "Invalid reduction selection in emitReduction."); auto &C = CGM.getContext(); @@ -1777,6 +2194,25 @@ void CGOpenMPRuntimeNVPTX::emitReduction( Args); } + if (TeamsReduction) { + auto *ScratchPadCopyFn = + emitCopyToScratchpad(CGM, Privates, ReductionArrayTy); + auto *LoadAndReduceFn = emitReduceScratchpadFunction( + CGM, Privates, ReductionArrayTy, ReductionFn); + + llvm::Value *Args[] = {ThreadId, + CGF.Builder.getInt32(RHSExprs.size()), + ReductionArrayTySize, + RL, + ShuffleAndReduceFn, + InterWarpCopyFn, + ScratchPadCopyFn, + LoadAndReduceFn}; + Res = CGF.EmitRuntimeCall( + createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait), + Args); + } + // 5. Build switch(res) auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default"); auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1); diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 0af8dc49b3dd..2f64da29123c 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -3553,10 +3553,14 @@ void CodeGenFunction::EmitOMPTeamsDirective(const OMPTeamsDirective &S) { OMPPrivateScope PrivateScope(CGF); (void)CGF.EmitOMPFirstprivateClause(S, PrivateScope); CGF.EmitOMPPrivateClause(S, PrivateScope); + CGF.EmitOMPReductionClauseInit(S, PrivateScope); (void)PrivateScope.Privatize(); CGF.EmitStmt(cast(S.getAssociatedStmt())->getCapturedStmt()); + CGF.EmitOMPReductionClauseFinal(S, /*ReductionKind=*/OMPD_teams); }; emitCommonOMPTeamsDirective(*this, S, OMPD_teams, CodeGen); + emitPostUpdateForReductionClause( + *this, S, [](CodeGenFunction &) -> llvm::Value * { return nullptr; }); } static void emitTargetTeamsRegion(CodeGenFunction &CGF, PrePostActionTy &Action, diff --git a/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp new file mode 100644 index 000000000000..ae129ebfae4d --- /dev/null +++ b/clang/test/OpenMP/nvptx_teams_reduction_codegen.cpp @@ -0,0 +1,1143 @@ +// Test target codegen - host bc file has to be created first. +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64 +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc +// RUN: %clang_cc1 -verify -fopenmp -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// RUN: %clang_cc1 -verify -fopenmp -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32 +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +// Check for the data transfer medium in shared memory to transfer the reduction list to the first warp. +// CHECK-DAG: [[TRANSFER_STORAGE:@.+]] = common addrspace([[SHARED_ADDRSPACE:[0-9]+]]) global [32 x i64] + +// Check that the execution mode of all 3 target regions is set to Generic Mode. +// CHECK-DAG: {{@__omp_offloading_.+l27}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l33}}_exec_mode = weak constant i8 1 +// CHECK-DAG: {{@__omp_offloading_.+l40}}_exec_mode = weak constant i8 1 + +template +tx ftemplate(int n) { + int a; + short b; + tx c; + float d; + double e; + + #pragma omp target + #pragma omp teams reduction(+: e) + { + e += 5; + } + + #pragma omp target + #pragma omp teams reduction(^: c) reduction(*: d) + { + c ^= 2; + d *= 33; + } + + #pragma omp target + #pragma omp teams reduction(|: a) reduction(max: b) + { + a |= 1; + b = 99 > b ? 99 : b; + } + + return a+b+c+d+e; +} + +int bar(int n){ + int a = 0; + + a += ftemplate(n); + + return a; +} + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l27}}_worker() + + // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+template.+l27]]( + // + // CHECK: {{call|invoke}} void [[T1]]_worker() + // + // CHECK: call void @__kmpc_kernel_init( + // + // CHECK: store double {{[0\.e\+]+}}, double* [[E:%.+]], align + // CHECK: [[EV:%.+]] = load double, double* [[E]], align + // CHECK: [[ADD:%.+]] = fadd double [[EV]], 5 + // CHECK: store double [[ADD]], double* [[E]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [1 x i8*]* [[RL:%.+]], i[[SZ:32|64]] 0, i{{32|64}} 0 + // CHECK: [[E_CAST:%.+]] = bitcast double* [[E]] to i8* + // CHECK: store i8* [[E_CAST]], i8** [[PTR1]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 1, i[[SZ]] {{4|8}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]]) + // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 + // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] + // + // CHECK: [[IFLABEL]] + // CHECK: [[E_INV:%.+]] = load double, double* [[E_IN:%.+]], align + // CHECK: [[EV:%.+]] = load double, double* [[E]], align + // CHECK: [[ADD:%.+]] = fadd double [[E_INV]], [[EV]] + // CHECK: store double [[ADD]], double* [[E_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[EXIT]] + // + // CHECK: [[EXIT]] + // CHECK: call void @__kmpc_kernel_deinit() + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR_RHS_VOID:%.+]] = load i8*, i8** [[VAR_RHS_REF]], + // CHECK: [[VAR_RHS:%.+]] = bitcast i8* [[VAR_RHS_VOID]] to double* + // + // CHECK: [[VAR_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR_LHS_VOID:%.+]] = load i8*, i8** [[VAR_LHS_REF]], + // CHECK: [[VAR_LHS:%.+]] = bitcast i8* [[VAR_LHS_VOID]] to double* + // + // CHECK: [[VAR_LHS_VAL:%.+]] = load double, double* [[VAR_LHS]], + // CHECK: [[VAR_RHS_VAL:%.+]] = load double, double* [[VAR_RHS]], + // CHECK: [[RES:%.+]] = fadd double [[VAR_LHS_VAL]], [[VAR_RHS_VAL]] + // CHECK: store double [[RES]], double* [[VAR_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT:%.+]] = alloca double + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = bitcast double [[ELT_VAL]] to i64 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT_VAL64:%.+]] = call i64 @__kmpc_shuffle_int64(i64 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT_VAL:%.+]] = bitcast i64 [[REMOTE_ELT_VAL64]] to double + // + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align + // CHECK: [[REMOTE_ELT_VOID:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* + // CHECK: store i8* [[REMOTE_ELT_VOID]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store double [[ELT_VAL]], double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to double addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load double, double addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: store double [[MEDIUM_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + // + // Copy to scratchpad function + // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32) + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 8, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[ELT_VAL:%.+]] = load double, double* [[ELT]], align + // CHECK: store double [[ELT_VAL]], double* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: ret + + // + // Load and reduce function + // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT:%.+]] = alloca double + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 8, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to double* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[SCRATCHPAD_ELT_PTR]], align + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[REMOTE_ELT]], align + // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast double* [[REMOTE_ELT]] to i8* + // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[REDUCE:%.+]] = icmp eq i32 [[SHOULD_REDUCE]], 1 + // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // Copy element from remote reduce list + // CHECK: [[REDUCE_ELSE]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to double* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load double, double* [[REMOTE_ELT]], align + // CHECK: store double [[REMOTE_ELT_VAL]], double* [[ELT]], align + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // CHECK: ret + + + + + + + + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l33}}_worker() + + // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+template.+l33]]( + // + // CHECK: {{call|invoke}} void [[T2]]_worker() + // + // CHECK: call void @__kmpc_kernel_init( + // + // CHECK: store float {{1\.[0e\+]+}}, float* [[D:%.+]], align + // CHECK: [[C_VAL:%.+]] = load i8, i8* [[C:%.+]], align + // CHECK: [[CONV:%.+]] = sext i8 [[C_VAL]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[CONV]], 2 + // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[TRUNC]], i8* [[C]], align + // CHECK: [[DV:%.+]] = load float, float* [[D]], align + // CHECK: [[MUL:%.+]] = fmul float [[DV]], {{[0-9e\.\+]+}} + // CHECK: store float [[MUL]], float* [[D]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: store i8* [[C]], i8** [[PTR1]], align + // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[D_CAST:%.+]] = bitcast float* [[D]] to i8* + // CHECK: store i8* [[D_CAST]], i8** [[PTR2]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]]) + // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 + // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] + // + // CHECK: [[IFLABEL]] + // CHECK: [[C_INV8:%.+]] = load i8, i8* [[C_IN:%.+]], align + // CHECK: [[C_INV:%.+]] = sext i8 [[C_INV8]] to i32 + // CHECK: [[CV8:%.+]] = load i8, i8* [[C]], align + // CHECK: [[CV:%.+]] = sext i8 [[CV8]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[C_INV]], [[CV]] + // CHECK: [[TRUNC:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[TRUNC]], i8* [[C_IN]], align + // CHECK: [[D_INV:%.+]] = load float, float* [[D_IN:%.+]], align + // CHECK: [[DV:%.+]] = load float, float* [[D]], align + // CHECK: [[MUL:%.+]] = fmul float [[D_INV]], [[DV]] + // CHECK: store float [[MUL]], float* [[D_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[EXIT]] + // + // CHECK: [[EXIT]] + // CHECK: call void @__kmpc_kernel_deinit() + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR1_RHS:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], + // + // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR1_LHS:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], + // + // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], + // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to float* + // + // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], + // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to float* + // + // CHECK: [[VAR1_LHS_VAL8:%.+]] = load i8, i8* [[VAR1_LHS]], + // CHECK: [[VAR1_LHS_VAL:%.+]] = sext i8 [[VAR1_LHS_VAL8]] to i32 + // CHECK: [[VAR1_RHS_VAL8:%.+]] = load i8, i8* [[VAR1_RHS]], + // CHECK: [[VAR1_RHS_VAL:%.+]] = sext i8 [[VAR1_RHS_VAL8]] to i32 + // CHECK: [[XOR:%.+]] = xor i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] + // CHECK: [[RES:%.+]] = trunc i32 [[XOR]] to i8 + // CHECK: store i8 [[RES]], i8* [[VAR1_LHS]], + // + // CHECK: [[VAR2_LHS_VAL:%.+]] = load float, float* [[VAR2_LHS]], + // CHECK: [[VAR2_RHS_VAL:%.+]] = load float, float* [[VAR2_RHS]], + // CHECK: [[RES:%.+]] = fmul float [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] + // CHECK: store float [[RES]], float* [[VAR2_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca float + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // + // CHECK: [[ELT_CAST:%.+]] = sext i8 [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT1_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT1_VAL:%.+]] = trunc i32 [[REMOTE_ELT1_VAL32]] to i8 + // + // CHECK: store i8 [[REMOTE_ELT1_VAL]], i8* [[REMOTE_ELT1]], align + // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = bitcast float [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT2_VAL:%.+]] = bitcast i32 [[REMOTE_ELT2_VAL32]] to float + // + // CHECK: store float [[REMOTE_ELT2_VAL]], float* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align + // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align + // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i8 [[ELT_VAL]], i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i8 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i8, i8 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: store i8 [[MEDIUM_ELT_VAL]], i8* [[ELT_VOID]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store float [[ELT_VAL]], float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to float addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load float, float addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: store float [[MEDIUM_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + // + // Copy to scratchpad function + // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32) + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 1, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[ELT_VAL:%.+]] = load i8, i8* [[ELT_VOID]], align + // CHECK: store i8 [[ELT_VAL]], i8* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 1 + // CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]] + // CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1 + // CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256 + // CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256 + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[ELT_VAL:%.+]] = load float, float* [[ELT]], align + // CHECK: store float [[ELT_VAL]], float* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: ret + + // + // Load and reduce function + // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i8 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca float + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 1, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[SCRATCHPAD_ELT_PTR_VOID]], align + // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[REMOTE_ELT1]], align + // CHECK: store i8* [[REMOTE_ELT1]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 1 + // CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]] + // CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1 + // CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256 + // CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256 + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to float* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[SCRATCHPAD_ELT_PTR]], align + // CHECK: store float [[REMOTE_ELT_VAL]], float* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast float* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[REDUCE:%.+]] = icmp eq i32 [[SHOULD_REDUCE]], 1 + // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // Copy element from remote reduce list + // CHECK: [[REDUCE_ELSE]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i8, i8* [[REMOTE_ELT_VOID]], align + // CHECK: store i8 [[REMOTE_ELT_VAL]], i8* [[ELT_VOID]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to float* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load float, float* [[REMOTE_ELT]], align + // CHECK: store float [[REMOTE_ELT_VAL]], float* [[ELT]], align + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // CHECK: ret + + + + + + + + + + + // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l40}}_worker() + + // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+template.+l40]]( + // + // CHECK: {{call|invoke}} void [[T3]]_worker() + // + // CHECK: call void @__kmpc_kernel_init( + // + // CHECK: store i32 0, i32* [[A:%.+]], align + // CHECK: store i16 -32768, i16* [[B:%.+]], align + // CHECK: [[A_VAL:%.+]] = load i32, i32* [[A:%.+]], align + // CHECK: [[OR:%.+]] = or i32 [[A_VAL]], 1 + // CHECK: store i32 [[OR]], i32* [[A]], align + // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align + // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 + // CHECK: [[CMP:%.+]] = icmp sgt i32 99, [[BV]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[BV:%.+]] = load i16, i16* [[B]], align + // CHECK: [[MAX:%.+]] = sext i16 [[BV]] to i32 + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[B_LVALUE:%.+]] = phi i32 [ 99, %[[DO_MAX]] ], [ [[MAX]], %[[MAX_ELSE]] ] + // CHECK: [[TRUNC:%.+]] = trunc i32 [[B_LVALUE]] to i16 + // CHECK: store i16 [[TRUNC]], i16* [[B]], align + // CHECK: [[PTR1:%.+]] = getelementptr inbounds [[RLT:.+]], [2 x i8*]* [[RL:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[A_CAST:%.+]] = bitcast i32* [[A]] to i8* + // CHECK: store i8* [[A_CAST]], i8** [[PTR1]], align + // CHECK: [[PTR2:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RL]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[B_CAST:%.+]] = bitcast i16* [[B]] to i8* + // CHECK: store i8* [[B_CAST]], i8** [[PTR2]], align + // CHECK: [[ARG_RL:%.+]] = bitcast [[RLT]]* [[RL]] to i8* + // CHECK: [[RET:%.+]] = call i32 @__kmpc_nvptx_teams_reduce_nowait(i32 {{.+}}, i32 2, i[[SZ]] {{8|16}}, i8* [[ARG_RL]], void (i8*, i16, i16, i16)* [[SHUFFLE_REDUCE_FN:@.+]], void (i8*, i32)* [[WARP_COPY_FN:@.+]], void (i8*, i8*, i32, i32)* [[SCRATCH_COPY_FN:@.+]], void (i8*, i8*, i32, i32, i32)* [[LOAD_REDUCE_FN:@.+]]) + // CHECK: [[COND:%.+]] = icmp eq i32 [[RET]], 1 + // CHECK: br i1 [[COND]], label {{%?}}[[IFLABEL:.+]], label {{%?}}[[EXIT:.+]] + // + // CHECK: [[IFLABEL]] + // CHECK: [[A_INV:%.+]] = load i32, i32* [[A_IN:%.+]], align + // CHECK: [[AV:%.+]] = load i32, i32* [[A]], align + // CHECK: [[OR:%.+]] = or i32 [[A_INV]], [[AV]] + // CHECK: store i32 [[OR]], i32* [[A_IN]], align + // CHECK: [[B_INV16:%.+]] = load i16, i16* [[B_IN:%.+]], align + // CHECK: [[B_INV:%.+]] = sext i16 [[B_INV16]] to i32 + // CHECK: [[BV16:%.+]] = load i16, i16* [[B]], align + // CHECK: [[BV:%.+]] = sext i16 [[BV16]] to i32 + // CHECK: [[CMP:%.+]] = icmp sgt i32 [[B_INV]], [[BV]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: [[MAX1:%.+]] = load i16, i16* [[B_IN]], align + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[MAX2:%.+]] = load i16, i16* [[B]], align + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[B_MAX:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] + // CHECK: store i16 [[B_MAX]], i16* [[B_IN]], align + // CHECK: call void @__kmpc_nvptx_end_reduce_nowait( + // CHECK: br label %[[EXIT]] + // + // CHECK: [[EXIT]] + // CHECK: call void @__kmpc_kernel_deinit() + + // + // Reduction function + // CHECK: define internal void [[REDUCTION_FUNC:@.+]](i8*, i8*) + // CHECK: [[VAR1_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR1_RHS_VOID:%.+]] = load i8*, i8** [[VAR1_RHS_REF]], + // CHECK: [[VAR1_RHS:%.+]] = bitcast i8* [[VAR1_RHS_VOID]] to i32* + // + // CHECK: [[VAR1_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[VAR1_LHS_VOID:%.+]] = load i8*, i8** [[VAR1_LHS_REF]], + // CHECK: [[VAR1_LHS:%.+]] = bitcast i8* [[VAR1_LHS_VOID]] to i32* + // + // CHECK: [[VAR2_RHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_RHS]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[VAR2_RHS_VOID:%.+]] = load i8*, i8** [[VAR2_RHS_REF]], + // CHECK: [[VAR2_RHS:%.+]] = bitcast i8* [[VAR2_RHS_VOID]] to i16* + // + // CHECK: [[VAR2_LHS_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST_LHS]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[VAR2_LHS_VOID:%.+]] = load i8*, i8** [[VAR2_LHS_REF]], + // CHECK: [[VAR2_LHS:%.+]] = bitcast i8* [[VAR2_LHS_VOID]] to i16* + // + // CHECK: [[VAR1_LHS_VAL:%.+]] = load i32, i32* [[VAR1_LHS]], + // CHECK: [[VAR1_RHS_VAL:%.+]] = load i32, i32* [[VAR1_RHS]], + // CHECK: [[OR:%.+]] = or i32 [[VAR1_LHS_VAL]], [[VAR1_RHS_VAL]] + // CHECK: store i32 [[OR]], i32* [[VAR1_LHS]], + // + // CHECK: [[VAR2_LHS_VAL16:%.+]] = load i16, i16* [[VAR2_LHS]], + // CHECK: [[VAR2_LHS_VAL:%.+]] = sext i16 [[VAR2_LHS_VAL16]] to i32 + // CHECK: [[VAR2_RHS_VAL16:%.+]] = load i16, i16* [[VAR2_RHS]], + // CHECK: [[VAR2_RHS_VAL:%.+]] = sext i16 [[VAR2_RHS_VAL16]] to i32 + // + // CHECK: [[CMP:%.+]] = icmp sgt i32 [[VAR2_LHS_VAL]], [[VAR2_RHS_VAL]] + // CHECK: br i1 [[CMP]], label {{%?}}[[DO_MAX:.+]], label {{%?}}[[MAX_ELSE:.+]] + // + // CHECK: [[DO_MAX]] + // CHECK: [[MAX1:%.+]] = load i16, i16* [[VAR2_LHS]], align + // CHECK: br label {{%?}}[[MAX_CONT:.+]] + // + // CHECK: [[MAX_ELSE]] + // CHECK: [[MAX2:%.+]] = load i16, i16* [[VAR2_RHS]], align + // CHECK: br label {{%?}}[[MAX_CONT]] + // + // CHECK: [[MAX_CONT]] + // CHECK: [[MAXV:%.+]] = phi i16 [ [[MAX1]], %[[DO_MAX]] ], [ [[MAX2]], %[[MAX_ELSE]] ] + // CHECK: store i16 [[MAXV]], i16* [[VAR2_LHS]], + // CHECK: ret void + + // + // Shuffle and reduce function + // CHECK: define internal void [[SHUFFLE_REDUCE_FN]](i8*, i16 {{.*}}, i16 {{.*}}, i16 {{.*}}) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 + // + // CHECK: [[LANEID:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[LANEOFFSET:%.+]] = load i16, i16* {{.+}}, align + // CHECK: [[ALGVER:%.+]] = load i16, i16* {{.+}}, align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT1_VAL:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_VAL]], i16 [[LANEOFFSET]], i16 [[WS]]) + // + // CHECK: store i32 [[REMOTE_ELT1_VAL]], i32* [[REMOTE_ELT1]], align + // CHECK: [[REMOTE_ELT1C:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* + // CHECK: store i8* [[REMOTE_ELT1C]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // + // CHECK: [[ELT_CAST:%.+]] = sext i16 [[ELT_VAL]] to i32 + // CHECK: [[WS32:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[WS:%.+]] = trunc i32 [[WS32]] to i16 + // CHECK: [[REMOTE_ELT2_VAL32:%.+]] = call i32 @__kmpc_shuffle_int32(i32 [[ELT_CAST]], i16 [[LANEOFFSET]], i16 [[WS]]) + // CHECK: [[REMOTE_ELT2_VAL:%.+]] = trunc i32 [[REMOTE_ELT2_VAL32]] to i16 + // + // CHECK: store i16 [[REMOTE_ELT2_VAL]], i16* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT2C:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT2C]], i8** [[REMOTE_ELT_REF]], align + // + // Condition to reduce + // CHECK: [[CONDALG0:%.+]] = icmp eq i16 [[ALGVER]], 0 + // + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp ult i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[CONDALG1:%.+]] = and i1 [[COND1]], [[COND2]] + // + // CHECK: [[COND3:%.+]] = icmp eq i16 [[ALGVER]], 2 + // CHECK: [[COND4:%.+]] = and i16 [[LANEID]], 1 + // CHECK: [[COND5:%.+]] = icmp eq i16 [[COND4]], 0 + // CHECK: [[COND6:%.+]] = and i1 [[COND3]], [[COND5]] + // CHECK: [[COND7:%.+]] = icmp sgt i16 [[LANEOFFSET]], 0 + // CHECK: [[CONDALG2:%.+]] = and i1 [[COND6]], [[COND7]] + // + // CHECK: [[COND8:%.+]] = or i1 [[CONDALG0]], [[CONDALG1]] + // CHECK: [[SHOULD_REDUCE:%.+]] = or i1 [[COND8]], [[CONDALG2]] + // CHECK: br i1 [[SHOULD_REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // CHECK: [[REDUCE_ELSE]] + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // Now check if we should just copy over the remote reduction list + // CHECK: [[COND1:%.+]] = icmp eq i16 [[ALGVER]], 1 + // CHECK: [[COND2:%.+]] = icmp uge i16 [[LANEID]], [[LANEOFFSET]] + // CHECK: [[SHOULD_COPY:%.+]] = and i1 [[COND1]], [[COND2]] + // CHECK: br i1 [[SHOULD_COPY]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // CHECK: [[DO_COPY]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align + // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align + // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // CHECK: [[COPY_CONT]] + // CHECK: void + + // + // Inter warp copy function + // CHECK: define internal void [[WARP_COPY_FN]](i8*, i32) + // CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31 + // CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5 + // CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i32 [[ELT_VAL]], i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i32 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i32, i32 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: store i32 [[MEDIUM_ELT_VAL]], i32* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0 + // CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]] + // + // [[DO_COPY]] + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[WARPID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: store i16 [[ELT_VAL]], i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: br label {{%?}}[[COPY_CONT:.+]] + // + // CHECK: [[COPY_ELSE]] + // CHECK: br label {{%?}}[[COPY_CONT]] + // + // Barrier after copy to shared memory storage medium. + // CHECK: [[COPY_CONT]] + // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize() + // CHECK: [[ACTIVE_THREADS:%.+]] = mul nsw i32 [[ACTIVE_WARPS:%.+]], [[WS]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // + // Read into warp 0. + // CHECK: [[IS_W0_ACTIVE_THREAD:%.+]] = icmp ult i32 [[TID:%.+]], [[ACTIVE_WARPS]] + // CHECK: br i1 [[IS_W0_ACTIVE_THREAD]], label {{%?}}[[DO_READ:.+]], label {{%?}}[[READ_ELSE:.+]] + // + // CHECK: [[DO_READ]] + // CHECK: [[MEDIUM_ELT64:%.+]] = getelementptr inbounds [32 x i64], [32 x i64] addrspace([[SHARED_ADDRSPACE]])* [[TRANSFER_STORAGE]], i64 0, i32 [[TID]] + // CHECK: [[MEDIUM_ELT:%.+]] = bitcast i64 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT64]] to i16 addrspace([[SHARED_ADDRSPACE]])* + // CHECK: [[MEDIUM_ELT_VAL:%.+]] = load i16, i16 addrspace([[SHARED_ADDRSPACE]])* [[MEDIUM_ELT]], align + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: store i16 [[MEDIUM_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[READ_CONT:.+]] + // + // CHECK: [[READ_ELSE]] + // CHECK: br label {{%?}}[[READ_CONT]] + // + // CHECK: [[READ_CONT]] + // CHECK: call void @llvm.nvvm.barrier(i32 1, i32 [[ACTIVE_THREADS]]) + // CHECK: ret + + // + // Copy to scratchpad function + // CHECK: define internal void [[SCRATCH_COPY_FN]](i8*, i8*, i32, i32) + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[ELT_VAL:%.+]] = load i32, i32* [[ELT]], align + // CHECK: store i32 [[ELT_VAL]], i32* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 4 + // CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]] + // CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1 + // CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256 + // CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256 + // + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 2, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16* + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[ELT_VAL:%.+]] = load i16, i16* [[ELT]], align + // CHECK: store i16 [[ELT_VAL]], i16* [[SCRATCHPAD_ELT_PTR]], align + // + // CHECK: ret + + // + // Load and reduce function + // CHECK: define internal void [[LOAD_REDUCE_FN]](i8*, i8*, i32, i32, i32) + // CHECK: [[REMOTE_RED_LIST:%.+]] = alloca [[RLT]], align + // CHECK: [[REMOTE_ELT1:%.+]] = alloca i32 + // CHECK: [[REMOTE_ELT2:%.+]] = alloca i16 + // CHECK: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]* + // CHECK: [[SCRATCHPAD_PTR:%.+]] = load i8*, i8** {{.+}}, align + // CHECK-64: [[TEAM32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[TEAM:%.+]] = sext i32 [[TEAM32]] to i64 + // CHECK-32: [[TEAM:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS32:%.+]] = load i32, i32* {{.+}}, align + // CHECK-64: [[NUM_TEAMS:%.+]] = sext i32 [[NUM_TEAMS32]] to i64 + // CHECK-32: [[NUM_TEAMS:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SHOULD_REDUCE:%.+]] = load i32, i32* {{.+}}, align + // CHECK: [[SCRATCHPAD:%.+]] = ptrtoint i8* [[SCRATCHPAD_PTR]] to i[[SZ]] + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 4, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i32* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[SCRATCHPAD_ELT_PTR]], align + // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[REMOTE_ELT1]], align + // CHECK: [[REMOTE_ELT1_PTR:%.+]] = bitcast i32* [[REMOTE_ELT1]] to i8* + // CHECK: store i8* [[REMOTE_ELT1_PTR]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[OF:%.+]] = mul i[[SZ]] [[NUM_TEAMS]], 4 + // CHECK: [[POS1:%.+]] = add i[[SZ]] [[SCRATCHPAD]], [[OF]] + // CHECK: [[POS2:%.+]] = sub i[[SZ]] [[POS1]], 1 + // CHECK: [[POS3:%.+]] = sdiv i[[SZ]] [[POS2]], 256 + // CHECK: [[POS4:%.+]] = add i[[SZ]] [[POS3]], 1 + // CHECK: [[SCRATCHPAD_NEXT:%.+]] = mul i[[SZ]] [[POS4]], 256 + // + // CHECK: [[P:%.+]] = mul i[[SZ]] 2, [[TEAM]] + // CHECK: [[SCRATCHPAD_ELT_PTR64:%.+]] = add i[[SZ]] [[SCRATCHPAD_NEXT]], [[P]] + // CHECK: [[SCRATCHPAD_ELT_PTR_VOID:%.+]] = inttoptr i[[SZ]] [[SCRATCHPAD_ELT_PTR64]] to i8* + + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[SCRATCHPAD_ELT_PTR:%.+]] = bitcast i8* [[SCRATCHPAD_ELT_PTR_VOID]] to i16* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[SCRATCHPAD_ELT_PTR]], align + // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[REMOTE_ELT2]], align + // CHECK: [[REMOTE_ELT_PTR:%.+]] = bitcast i16* [[REMOTE_ELT2]] to i8* + // CHECK: store i8* [[REMOTE_ELT_PTR]], i8** [[REMOTE_ELT_REF]], align + // + // CHECK: [[REDUCE:%.+]] = icmp eq i32 [[SHOULD_REDUCE]], 1 + // CHECK: br i1 [[REDUCE]], label {{%?}}[[DO_REDUCE:.+]], label {{%?}}[[REDUCE_ELSE:.+]] + // + // CHECK: [[DO_REDUCE]] + // CHECK: [[RED_LIST1_VOID:%.+]] = bitcast [[RLT]]* [[RED_LIST]] to i8* + // CHECK: [[RED_LIST2_VOID:%.+]] = bitcast [[RLT]]* [[REMOTE_RED_LIST]] to i8* + // CHECK: call void [[REDUCTION_FUNC]](i8* [[RED_LIST1_VOID]], i8* [[RED_LIST2_VOID]]) + // CHECK: br label {{%?}}[[REDUCE_CONT:.+]] + // + // Copy element from remote reduce list + // CHECK: [[REDUCE_ELSE]] + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 0 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i32* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i32, i32* [[REMOTE_ELT]], align + // CHECK: store i32 [[REMOTE_ELT_VAL]], i32* [[ELT]], align + // + // CHECK: [[REMOTE_ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[REMOTE_RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[REMOTE_ELT_VOID:%.+]] = load i8*, i8** [[REMOTE_ELT_REF]], + // CHECK: [[ELT_REF:%.+]] = getelementptr inbounds [[RLT]], [[RLT]]* [[RED_LIST:%.+]], i[[SZ]] 0, i[[SZ]] 1 + // CHECK: [[ELT_VOID:%.+]] = load i8*, i8** [[ELT_REF]], + // CHECK: [[ELT:%.+]] = bitcast i8* [[ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT:%.+]] = bitcast i8* [[REMOTE_ELT_VOID]] to i16* + // CHECK: [[REMOTE_ELT_VAL:%.+]] = load i16, i16* [[REMOTE_ELT]], align + // CHECK: store i16 [[REMOTE_ELT_VAL]], i16* [[ELT]], align + // CHECK: br label {{%?}}[[REDUCE_CONT]] + // + // CHECK: [[REDUCE_CONT]] + // CHECK: ret + + +#endif