From 8e45acfc3884a89d776c5c850c71ae80f645db99 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Fri, 17 Mar 2017 23:56:58 +0000 Subject: [PATCH] [AMDGPU] Add address space based alias analysis pass This is direct port of HSAILAliasAnalysis pass, just cleaned for style and renamed. Differential Revision: https://reviews.llvm.org/D31103 llvm-svn: 298172 --- llvm/lib/Target/AMDGPU/AMDGPU.h | 3 + .../lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 117 ++++++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h | 86 +++++++++++++ .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 16 +++ llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + .../CodeGen/AMDGPU/amdgpu.private-memory.ll | 12 +- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 6 +- .../AMDGPU/si-triv-disjoint-mem-access.ll | 2 +- .../AMDGPU/split-vector-memoperand-offsets.ll | 2 +- .../CodeGen/AMDGPU/vectorize-global-local.ll | 80 ++++++++++++ 10 files changed, 314 insertions(+), 11 deletions(-) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h create mode 100644 llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 3e269fad71fa..1cc68c7e242a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -119,6 +119,9 @@ extern char &SIDebuggerInsertNopsID; void initializeSIInsertWaitsPass(PassRegistry&); extern char &SIInsertWaitsID; +ImmutablePass *createAMDGPUAAWrapperPass(); +void initializeAMDGPUAAWrapperPassPass(PassRegistry&); + Target &getTheAMDGPUTarget(); Target &getTheGCNTarget(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp new file mode 100644 index 000000000000..127b26397946 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -0,0 +1,117 @@ +//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==// +// +// The LLVM 
Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This is the AMDGPU address space based alias analysis pass. +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUAliasAnalysis.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +using namespace llvm; + +#define DEBUG_TYPE "amdgpu-aa" + +// Register this pass... +char AMDGPUAAWrapperPass::ID = 0; +INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa", + "AMDGPU Address space based Alias Analysis", false, true) + +ImmutablePass *llvm::createAMDGPUAAWrapperPass() { + return new AMDGPUAAWrapperPass(); +} + +void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); +} + +AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA, + const MemoryLocation &LocB) { + // This array is indexed by the AMDGPUAS::AddressSpaces + // enum elements PRIVATE_ADDRESS ... 
to FLAT_ADDRESS + // NOTE(review): enum presumably comes from "AMDGPU.h" - confirm. + static const AliasResult ASAliasRules[5][5] = { + /* Private Global Constant Group Flat */ + /* Private */ {MayAlias, NoAlias , NoAlias , NoAlias , MayAlias}, + /* Global */ {NoAlias , MayAlias, NoAlias , NoAlias , MayAlias}, + /* Constant */ {NoAlias , NoAlias , MayAlias, NoAlias , MayAlias}, + /* Group */ {NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}, + /* Flat */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias} + }; + unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace(); + unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace(); + if (asA > AMDGPUAS::AddressSpaces::FLAT_ADDRESS || + asB > AMDGPUAS::AddressSpaces::FLAT_ADDRESS) + report_fatal_error("Pointer address space out of range"); + + AliasResult Result = ASAliasRules[asA][asB]; + if (Result == NoAlias) return Result; + + if (isa(LocA.Ptr) && isa(LocB.Ptr)) { + Type *T1 = cast(LocA.Ptr->getType())->getElementType(); + Type *T2 = cast(LocB.Ptr->getType())->getElementType(); + + if ((T1->isVectorTy() && !T2->isVectorTy()) || + (T2->isVectorTy() && !T1->isVectorTy())) + return NoAlias; + } + // Forward the query to the next alias analysis. + return AAResultBase::alias(LocA, LocB); +} + +bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, + bool OrLocal) { + const Value *Base = GetUnderlyingObject(Loc.Ptr, DL); + + if (Base->getType()->getPointerAddressSpace() == + AMDGPUAS::AddressSpaces::CONSTANT_ADDRESS) { + return true; + } + + if (const GlobalVariable *GV = dyn_cast(Base)) { + if (GV->isConstant()) + return true; + } else if (const Argument *Arg = dyn_cast(Base)) { + const Function *F = Arg->getParent(); + + // Only assume constant memory for arguments on kernels. 
+ switch (F->getCallingConv()) { + default: + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); + case CallingConv::AMDGPU_VS: + case CallingConv::AMDGPU_GS: + case CallingConv::AMDGPU_PS: + case CallingConv::AMDGPU_CS: + case CallingConv::AMDGPU_KERNEL: + case CallingConv::SPIR_KERNEL: + break; + } + + unsigned ArgNo = Arg->getArgNo(); + /* On an argument, ReadOnly attribute indicates that the function does + not write through this pointer argument, even though it may write + to the memory that the pointer points to. + On an argument, ReadNone attribute indicates that the function does + not dereference that pointer argument, even though it may read or write + the memory that the pointer points to if accessed through other pointers. + Hence NoAlias is also required below before returning true. */ + if (F->getAttributes().hasAttribute(ArgNo + 1, Attribute::NoAlias) && + (F->getAttributes().hasAttribute(ArgNo + 1, Attribute::ReadNone) || + F->getAttributes().hasAttribute(ArgNo + 1, Attribute::ReadOnly))) { + return true; + } + } + return AAResultBase::pointsToConstantMemory(Loc, OrLocal); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h new file mode 100644 index 000000000000..943b4a68b259 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h @@ -0,0 +1,86 @@ +//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This is the AMDGPU address space based alias analysis pass. 
+//===----------------------------------------------------------------------===// + +#ifndef LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H +#define LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H + +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Pass.h" + +namespace llvm { + +/// A simple AA result that uses TBAA metadata to answer queries. +class AMDGPUAAResult : public AAResultBase { + friend AAResultBase; + + const DataLayout &DL; + +public: + explicit AMDGPUAAResult(const DataLayout &DL) : AAResultBase(), DL(DL) {} + AMDGPUAAResult(AMDGPUAAResult &&Arg) + : AAResultBase(std::move(Arg)), DL(Arg.DL){} + + /// Handle invalidation events from the new pass manager. + /// + /// By definition, this result is stateless and so remains valid. + bool invalidate(Function &, const PreservedAnalyses &) { return false; } + + AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB); + bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal); + +private: + bool Aliases(const MDNode *A, const MDNode *B) const; + bool PathAliases(const MDNode *A, const MDNode *B) const; +}; + +/// Analysis pass providing a never-invalidated alias analysis result. +class AMDGPUAA : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static char PassID; + +public: + typedef AMDGPUAAResult Result; + + AMDGPUAAResult run(Function &F, AnalysisManager &AM) { + return AMDGPUAAResult(F.getParent()->getDataLayout()); + } +}; + +/// Legacy wrapper pass to provide the AMDGPUAAResult object. 
+class AMDGPUAAWrapperPass : public ImmutablePass { + std::unique_ptr Result; + +public: + static char ID; + + AMDGPUAAWrapperPass() : ImmutablePass(ID) { + initializeAMDGPUAAWrapperPassPass(*PassRegistry::getPassRegistry()); + } + + AMDGPUAAResult &getResult() { return *Result; } + const AMDGPUAAResult &getResult() const { return *Result; } + + bool doInitialization(Module &M) override { + Result.reset(new AMDGPUAAResult(M.getDataLayout())); + return false; + } + bool doFinalization(Module &M) override { + Result.reset(); + return false; + } + void getAnalysisUsage(AnalysisUsage &AU) const override; +}; + +} +#endif // LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 48442bcf2f1d..33ba0883e30c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -15,6 +15,7 @@ #include "AMDGPUTargetMachine.h" #include "AMDGPU.h" +#include "AMDGPUAliasAnalysis.h" #include "AMDGPUCallLowering.h" #include "AMDGPUInstructionSelector.h" #include "AMDGPULegalizerInfo.h" @@ -93,6 +94,11 @@ static cl::opt InternalizeSymbols( cl::init(false), cl::Hidden); +// Enable address space based alias analysis (on by default; =0 disables it) +static cl::opt EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden, + cl::desc("Enable AMDGPU Alias Analysis"), + cl::init(true)); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -119,6 +125,7 @@ extern "C" void LLVMInitializeAMDGPUTarget() { initializeSIInsertSkipsPass(*PR); initializeSIDebuggerInsertNopsPass(*PR); initializeSIOptimizeExecMaskingPass(*PR); + initializeAMDGPUAAWrapperPassPass(*PR); } static std::unique_ptr createTLOF(const Triple &TT) { @@ -507,6 +514,15 @@ void AMDGPUPassConfig::addIRPasses() { addPass(createSROAPass()); addStraightLineScalarOptimizationPasses(); + + if (EnableAMDGPUAliasAnalysis) { + 
addPass(createAMDGPUAAWrapperPass()); + addPass(createExternalAAWrapperPass([](Pass &P, Function &, + AAResults &AAR) { + if (auto *WrapperPass = P.getAnalysisIfAvailable()) + AAR.addAAResult(WrapperPass->getResult()); + })); + } } TargetPassConfig::addIRPasses(); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index a1c263c87643..7b550e8f2b5b 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -36,6 +36,7 @@ endif() add_llvm_target(AMDGPUCodeGen AMDILCFGStructurizer.cpp + AMDGPUAliasAnalysis.cpp AMDGPUAlwaysInlinePass.cpp AMDGPUAnnotateKernelFeatures.cpp AMDGPUAnnotateUniformValues.cpp diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll index f76b94cd9a02..f812f4cf7024 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu.private-memory.ll @@ -1,9 +1,9 @@ -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA -; RUN: llc -show-mc-encoding -mattr=+promote-alloca 
-amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA +; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC +; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga 
-mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=HSAOPT -check-prefix=OPT %s ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=NOHSAOPT -check-prefix=OPT %s diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index 1272faab7b9c..b7945842e6e2 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1,6 +1,6 @@ -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx901 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s -; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx901 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s +; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s ; GCN-LABEL: {{^}}s_insertelement_v2i16_0: ; GCN: s_load_dword [[VEC:s[0-9]+]] diff --git a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll index 
c1d691fcff83..313f5ee474e4 100644 --- a/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll +++ b/llvm/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s +; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll index 37ec2b012896..da6c7204cd14 100644 --- a/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt < %s | FileCheck -check-prefix=GCN %s @sPrivateStorage = internal addrspace(3) global [256 x [8 x <4 x i64>]] undef diff --git a/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll b/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll new file mode 100644 index 000000000000..7c254abc642d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vectorize-global-local.ll @@ -0,0 +1,80 @@ +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s +; CHECK-DAG: flat_load_dwordx4 +; CHECK-DAG: flat_load_dwordx4 +; CHECK-DAG: flat_load_dwordx4 +; CHECK-DAG: flat_load_dwordx4 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: 
ds_write2_b32 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: ds_write2_b32 +; CHECK-DAG: ds_write2_b32 + +define void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) { +bb: + %tmp = load i32, i32 addrspace(1)* %arg, align 4 + store i32 %tmp, i32 addrspace(3)* %arg1, align 4 + %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1 + %tmp3 = load i32, i32 addrspace(1)* %tmp2, align 4 + %tmp4 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 1 + store i32 %tmp3, i32 addrspace(3)* %tmp4, align 4 + %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2 + %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4 + %tmp7 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 2 + store i32 %tmp6, i32 addrspace(3)* %tmp7, align 4 + %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3 + %tmp9 = load i32, i32 addrspace(1)* %tmp8, align 4 + %tmp10 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 3 + store i32 %tmp9, i32 addrspace(3)* %tmp10, align 4 + %tmp11 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4 + %tmp12 = load i32, i32 addrspace(1)* %tmp11, align 4 + %tmp13 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 4 + store i32 %tmp12, i32 addrspace(3)* %tmp13, align 4 + %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 5 + %tmp15 = load i32, i32 addrspace(1)* %tmp14, align 4 + %tmp16 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 5 + store i32 %tmp15, i32 addrspace(3)* %tmp16, align 4 + %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 6 + %tmp18 = load i32, i32 addrspace(1)* %tmp17, align 4 + %tmp19 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 6 + store i32 %tmp18, i32 addrspace(3)* %tmp19, align 4 + %tmp20 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 7 + %tmp21 = load i32, i32 addrspace(1)* %tmp20, align 4 + %tmp22 = getelementptr inbounds i32, i32 
addrspace(3)* %arg1, i32 7 + store i32 %tmp21, i32 addrspace(3)* %tmp22, align 4 + %tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8 + %tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4 + %tmp25 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 8 + store i32 %tmp24, i32 addrspace(3)* %tmp25, align 4 + %tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 9 + %tmp27 = load i32, i32 addrspace(1)* %tmp26, align 4 + %tmp28 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 9 + store i32 %tmp27, i32 addrspace(3)* %tmp28, align 4 + %tmp29 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 10 + %tmp30 = load i32, i32 addrspace(1)* %tmp29, align 4 + %tmp31 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 10 + store i32 %tmp30, i32 addrspace(3)* %tmp31, align 4 + %tmp32 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 11 + %tmp33 = load i32, i32 addrspace(1)* %tmp32, align 4 + %tmp34 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 11 + store i32 %tmp33, i32 addrspace(3)* %tmp34, align 4 + %tmp35 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 12 + %tmp36 = load i32, i32 addrspace(1)* %tmp35, align 4 + %tmp37 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 12 + store i32 %tmp36, i32 addrspace(3)* %tmp37, align 4 + %tmp38 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 13 + %tmp39 = load i32, i32 addrspace(1)* %tmp38, align 4 + %tmp40 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 13 + store i32 %tmp39, i32 addrspace(3)* %tmp40, align 4 + %tmp41 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 14 + %tmp42 = load i32, i32 addrspace(1)* %tmp41, align 4 + %tmp43 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 14 + store i32 %tmp42, i32 addrspace(3)* %tmp43, align 4 + %tmp44 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 15 + %tmp45 = load i32, i32 addrspace(1)* %tmp44, align 4 + %tmp46 = getelementptr inbounds i32, i32 
addrspace(3)* %arg1, i32 15 + store i32 %tmp45, i32 addrspace(3)* %tmp46, align 4 + ret void +}