llvm-project/llvm/lib/Target/R600/AMDGPUPromoteAlloca.cpp

//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This pass eliminates allocas by either converting them into vectors or
// by migrating them to local address space.
//
//===----------------------------------------------------------------------===//

#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/Support/Debug.h"

#define DEBUG_TYPE "amdgpu-promote-alloca"

using namespace llvm;

namespace {

/// Function pass that visits every alloca of a function and tries to get rid
/// of it: visitAlloca() first attempts to rewrite the alloca as a vector value
/// and, failing that, moves it into a global in the local address space.
class AMDGPUPromoteAlloca : public FunctionPass,
                       public InstVisitor<AMDGPUPromoteAlloca> {

  static char ID;
  /// Module being processed.  Assigned in doInitialization(); null until then.
  Module *Mod;
  /// Subtarget queried for the amount of local memory.
  const AMDGPUSubtarget &ST;
  /// Bytes of local memory still available for promotion in the function
  /// currently being processed.  Recomputed at the start of runOnFunction().
  int LocalMemAvailable;

public:
  AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID),
                                                   Mod(nullptr), ST(st),
                                                   LocalMemAvailable(0) { }
  virtual bool doInitialization(Module &M);
  virtual bool runOnFunction(Function &F);
  virtual const char *getPassName() const {
    return "AMDGPU Promote Alloca";
  }
  /// InstVisitor hook invoked for every alloca instruction.
  void visitAlloca(AllocaInst &I);
};

} // End anonymous namespace

char AMDGPUPromoteAlloca::ID = 0;

/// Remembers the module so later per-function work can create globals and
/// declarations in it.  The module itself is left untouched here.
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
  const bool ModuleChanged = false;
  this->Mod = &M;
  return ModuleChanged;
}

/// Computes how much local memory is free for \p F and then visits every
/// instruction of the function so that visitAlloca() can promote allocas.
///
/// The budget starts at the subtarget's local memory size, is zeroed when the
/// function takes a pointer argument in the local address space, and is
/// reduced by the size of every local-address-space global that an
/// instruction of \p F uses directly.
///
/// \returns true; see the comment at the return statement.
bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {

  const FunctionType *FTy = F.getFunctionType();

  LocalMemAvailable = ST.getLocalMemorySize();


  // If the function has any arguments in the local address space, then it's
  // possible these arguments require the entire local memory space, so
  // we cannot use local memory in the pass.
  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
    const Type *ParamTy = FTy->getParamType(i);
    if (ParamTy->isPointerTy() &&
        ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
      LocalMemAvailable = 0;
      DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
                      "local memory disabled.\n");
      break;
    }
  }

  if (LocalMemAvailable > 0) {
    // Check how much local memory is being used by global objects
    for (Module::global_iterator I = Mod->global_begin(),
                                 E = Mod->global_end(); I != E; ++I) {
      GlobalVariable *GV = &*I;
      PointerType *GVTy = GV->getType();
      if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
        continue;

      // Walk the users, not the uses: casting a dereferenced use_iterator
      // looks at the value being used (the global itself), which is never an
      // Instruction, so the size would never be subtracted.
      for (User *U : GV->users()) {
        Instruction *UseInst = dyn_cast<Instruction>(U);
        if (!UseInst || UseInst->getParent()->getParent() != &F)
          continue;

        // Charge each global once, no matter how many instructions of F
        // reference it.
        LocalMemAvailable -=
            Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
        break;
      }
    }
  }

  LocalMemAvailable = std::max(0, LocalMemAvailable);
  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");

  visit(F);

  // visitAlloca() does not report whether it rewrote anything, so
  // conservatively tell the pass manager that the function may have changed.
  return true;
}

// Builds the vector type that has the same element type and element count as
// the array type \p ArrayTy.
static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
  Type *EltTy = ArrayTy->getArrayElementType();
  const uint64_t NumElts = ArrayTy->getArrayNumElements();
  return VectorType::get(EltTy, NumElts);
}

// Returns the vector element index addressed by \p Ptr.
//
// \p Ptr must be either the alloca itself, which maps to a constant i32 zero,
// or a GEP, whose index is looked up in \p GEPIdx.  A GEP that has no entry
// in the map yields null.
//
// The map is taken by const reference so that it is neither copied on every
// call nor grown by the lookup.
static Value* calculateVectorIndex(Value *Ptr,
                      const std::map<GetElementPtrInst*, Value*> &GEPIdx) {
  if (isa<AllocaInst>(Ptr))
    return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));

  GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);

  std::map<GetElementPtrInst*, Value*>::const_iterator Entry =
      GEPIdx.find(GEP);
  if (Entry == GEPIdx.end())
    return nullptr;
  return Entry->second;
}

// Extracts the value that indexes into the array from \p GEP.
//
// Only GEPs with exactly two indices whose first index is the constant zero
// are recognised; the second index is returned for those.  Every other GEP
// yields null.
static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
  // FIXME we only support simple cases
  const bool HasTwoIndices = GEP->getNumOperands() == 3;
  if (HasTwoIndices) {
    ConstantInt *FirstIdx = dyn_cast<ConstantInt>(GEP->getOperand(1));
    if (FirstIdx && FirstIdx->isZero())
      return GEP->getOperand(2);
  }

  return NULL;
}

// Returns true when \p Inst has one of the opcodes that the vector promotion
// in tryPromoteAllocaToVector() has a case for: load, store, bitcast or
// addrspacecast.
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
static bool canVectorizeInst(Instruction *Inst) {
  const unsigned Opcode = Inst->getOpcode();
  return Opcode == Instruction::Load ||
         Opcode == Instruction::Store ||
         Opcode == Instruction::BitCast ||
         Opcode == Instruction::AddrSpaceCast;
}

static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
  Type *AllocaTy = Alloca->getAllocatedType();

  DEBUG(dbgs() << "Alloca Candidate for vectorization \n");

  // FIXME: There is no reason why we can't support larger arrays, we
  // are just being conservative for now.
  if (!AllocaTy->isArrayTy() ||
      AllocaTy->getArrayElementType()->isVectorTy() ||
      AllocaTy->getArrayNumElements() > 4) {

    DEBUG(dbgs() << "  Cannot convert type to vector");
    return false;
  }

  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
  std::vector<Value*> WorkList;
  for (User *AllocaUser : Alloca->users()) {
    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
    if (!GEP) {
      if (!canVectorizeInst(cast<Instruction>(AllocaUser)))
        return false;

      WorkList.push_back(AllocaUser);
      continue;
    }

    Value *Index = GEPToVectorIndex(GEP);

    // If we can't compute a vector index from this GEP, then we can't
    // promote this alloca to vector.
    if (!Index) {
      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << '\n');
      return false;
    }

    GEPVectorIdx[GEP] = Index;
    for (User *GEPUser : AllocaUser->users()) {
      if (!canVectorizeInst(cast<Instruction>(GEPUser)))
        return false;

      WorkList.push_back(GEPUser);
    }
  }

  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);

  DEBUG(dbgs() << "  Converting alloca to vector "
        << *AllocaTy << " -> " << *VectorTy << '\n');

  for (std::vector<Value*>::iterator I = WorkList.begin(),
                                     E = WorkList.end(); I != E; ++I) {
    Instruction *Inst = cast<Instruction>(*I);
    IRBuilder<> Builder(Inst);
    switch (Inst->getOpcode()) {
    case Instruction::Load: {
      Value *Ptr = Inst->getOperand(0);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
      Inst->replaceAllUsesWith(ExtractElement);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::Store: {
      Value *Ptr = Inst->getOperand(1);
      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
      Value *VecValue = Builder.CreateLoad(BitCast);
      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
                                                       Inst->getOperand(0),
                                                       Index);
      Builder.CreateStore(NewVecValue, BitCast);
      Inst->eraseFromParent();
      break;
    }
    case Instruction::BitCast:
    case Instruction::AddrSpaceCast:
      break;

    default:
      Inst->dump();
      llvm_unreachable("Inconsistency in instructions promotable to vector");
    }
  }
  return true;
}

// Appends to \p WorkList the users of \p Val that are either calls or produce
// a pointer-typed value.  Pointer-typed non-call users are followed
// recursively; calls are recorded but not followed.  A user already present
// in \p WorkList is skipped.
static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
  for (User *Candidate : Val->users()) {
    const bool AlreadyQueued =
        std::find(WorkList.begin(), WorkList.end(), Candidate) !=
        WorkList.end();
    if (AlreadyQueued)
      continue;

    const bool IsCall = isa<CallInst>(Candidate);
    if (!IsCall && !Candidate->getType()->isPointerTy())
      continue;

    WorkList.push_back(Candidate);
    if (!IsCall)
      collectUsesWithPtrTypes(Candidate, WorkList);
  }
}

// Returns true if every call reachable through the pointer-typed uses of
// \p I is one that visitAlloca() knows how to rewrite: a direct call to a
// non-intrinsic function, or one of the lifetime/memcpy/memset intrinsics.
// This is checked before any IR is modified so promotion can be abandoned
// cleanly.
static bool canRewriteCallUsers(AllocaInst &I) {
  std::vector<Value*> Uses;
  collectUsesWithPtrTypes(&I, Uses);

  for (std::vector<Value*>::iterator UI = Uses.begin(),
                                     UE = Uses.end(); UI != UE; ++UI) {
    CallInst *Call = dyn_cast<CallInst>(*UI);
    if (!Call)
      continue;

    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
    if (!Intr) {
      // Indirect calls have no callee to clone a ".local" declaration from.
      if (!Call->getCalledFunction())
        return false;
      continue;
    }

    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
    case Intrinsic::memcpy:
    case Intrinsic::memset:
      break;
    default:
      return false;
    }
  }
  return true;
}

/// Tries to eliminate the alloca \p I.
///
/// The alloca is first offered to tryPromoteAllocaToVector().  If that fails
/// and enough local memory remains, the alloca is replaced by a slot in a new
/// local-address-space global holding one element per work item, indexed by
/// an id computed from the work-item id and local size intrinsics.  Pointer
/// typed users are retyped to the local address space, non-intrinsic callees
/// are redirected to a "<name>.local" declaration with the new argument
/// types, lifetime markers are dropped and memcpy/memset are re-created.
///
/// NOTE(review): users that combine this pointer with pointers from another
/// address space (e.g. select, phi, icmp, or storing the pointer itself) are
/// not checked for here.
void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
  IRBuilder<> Builder(&I);

  // First try to replace the alloca with a vector
  Type *AllocaTy = I.getAllocatedType();

  DEBUG(dbgs() << "Trying to promote " << I << '\n');

  if (tryPromoteAllocaToVector(&I))
    return;

  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");

  // The size computed below covers a single object of the allocated type, so
  // allocas with an element count operand cannot be handled.
  if (I.isArrayAllocation())
    return;

  // FIXME: This is the maximum work group size.  We should try to get
  // value from the reqd_work_group_size function attribute if it is
  // available.
  const unsigned WorkGroupSize = 256;

  // Keep the size in 64 bits: truncating it to int could make a huge alloca
  // look small enough to fit.
  const uint64_t AllocaSize = static_cast<uint64_t>(WorkGroupSize) *
      Mod->getDataLayout()->getTypeAllocSize(AllocaTy);

  if (LocalMemAvailable < 0 ||
      AllocaSize > static_cast<uint64_t>(LocalMemAvailable)) {
    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
    return;
  }

  // Make sure every call user can be rewritten before touching the IR.
  if (!canRewriteCallUsers(I)) {
    DEBUG(dbgs() << " Cannot rewrite all users of the alloca.\n");
    return;
  }

  DEBUG(dbgs() << "Promoting alloca to local memory\n");
  LocalMemAvailable -= static_cast<int>(AllocaSize);

  GlobalVariable *GV = new GlobalVariable(
      *Mod, ArrayType::get(I.getAllocatedType(), WorkGroupSize), false,
      GlobalValue::ExternalLinkage, 0, I.getName(), 0,
      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);

  FunctionType *FTy = FunctionType::get(
      Type::getInt32Ty(Mod->getContext()), false);
  // AttributeSet is immutable: addAttribute() returns the new set, and
  // readnone is a function attribute, so it goes on the function index.
  AttributeSet AttrSet;
  AttrSet = AttrSet.addAttribute(Mod->getContext(),
                                 AttributeSet::FunctionIndex,
                                 Attribute::ReadNone);

  Value *ReadLocalSizeY = Mod->getOrInsertFunction(
      "llvm.r600.read.local.size.y", FTy, AttrSet);
  Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
      "llvm.r600.read.local.size.z", FTy, AttrSet);
  Value *ReadTIDIGX = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.x", FTy, AttrSet);
  Value *ReadTIDIGY = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.y", FTy, AttrSet);
  Value *ReadTIDIGZ = Mod->getOrInsertFunction(
      "llvm.r600.read.tidig.z", FTy, AttrSet);


  Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
  Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
  Value *TIdX  = Builder.CreateCall(ReadTIDIGX);
  Value *TIdY  = Builder.CreateCall(ReadTIDIGY);
  Value *TIdZ  = Builder.CreateCall(ReadTIDIGZ);

  // TID = TIdX * TCntY * TCntZ + TIdY * TCntZ + TIdZ
  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
  TID = Builder.CreateAdd(TID, TIdZ);

  std::vector<Value*> Indices;
  Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));
  Indices.push_back(TID);

  // Retype the alloca first so that replacing its uses with the
  // local-address-space pointer is type-consistent.
  Value *Offset = Builder.CreateGEP(GV, Indices);
  I.mutateType(Offset->getType());
  I.replaceAllUsesWith(Offset);
  I.eraseFromParent();

  std::vector<Value*> WorkList;

  collectUsesWithPtrTypes(Offset, WorkList);

  for (std::vector<Value*>::iterator i = WorkList.begin(),
                                     e = WorkList.end(); i != e; ++i) {
    Value *V = *i;
    CallInst *Call = dyn_cast<CallInst>(V);
    if (!Call) {
      // Pointer-producing instruction: keep the pointee type, switch the
      // address space.
      Type *EltTy = V->getType()->getPointerElementType();
      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
      V->mutateType(NewTy);
      continue;
    }

    IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
    if (!Intr) {
      // Redirect the call to a "<name>.local" function whose parameter types
      // are the current types of the call's arguments.
      std::vector<Type*> ArgTypes;
      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
                                ArgIdx != ArgEnd; ++ArgIdx) {
        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
      }
      // Non-null: indirect calls were rejected by canRewriteCallUsers().
      Function *F = Call->getCalledFunction();
      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
                                                F->isVarArg());
      Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,
                                             F->getAttributes());
      Function *NewF = cast<Function>(C);
      Call->setCalledFunction(NewF);
      continue;
    }

    Builder.SetInsertPoint(Intr);
    switch (Intr->getIntrinsicID()) {
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // These intrinsics are for address space 0 only
      Intr->eraseFromParent();
      continue;
    case Intrinsic::memcpy: {
      // Re-create the intrinsic call from its (possibly retyped) operands.
      MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
      Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
                           MemCpy->getLength(), MemCpy->getAlignment(),
                           MemCpy->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    case Intrinsic::memset: {
      MemSetInst *MemSet = cast<MemSetInst>(Intr);
      Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
                           MemSet->getLength(), MemSet->getAlignment(),
                           MemSet->isVolatile());
      Intr->eraseFromParent();
      continue;
    }
    default:
      Intr->dump();
      llvm_unreachable("Don't know how to promote alloca intrinsic use.");
    }
  }
}

/// Factory for the alloca promotion pass.  Returns a newly allocated pass
/// instance bound to the subtarget \p ST.
FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
  FunctionPass *PromotePass = new AMDGPUPromoteAlloca(ST);
  return PromotePass;
}
R600: Use LDS and vectors for private memory llvm-svn: 211110 2014-06-18 00:53:14 +08:00			`//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//`
			`//`
			`// The LLVM Compiler Infrastructure`
			`//`
			`// This file is distributed under the University of Illinois Open Source`
			`// License. See LICENSE.TXT for details.`
			`//`
			`//===----------------------------------------------------------------------===//`
			`//`
			`// This pass eliminates allocas by either converting them into vectors or`
			`// by migrating them to local address space.`
			`//`
			`//===----------------------------------------------------------------------===//`

			`#include "AMDGPU.h"`
			`#include "AMDGPUSubtarget.h"`
			`#include "llvm/Analysis/ValueTracking.h"`
			`#include "llvm/IR/IRBuilder.h"`
			`#include "llvm/IR/InstVisitor.h"`
			`#include "llvm/Support/Debug.h"`

			`#define DEBUG_TYPE "amdgpu-promote-alloca"`

			`using namespace llvm;`

			`namespace {`

			`class AMDGPUPromoteAlloca : public FunctionPass,`
			`public InstVisitor<AMDGPUPromoteAlloca> {`

			`static char ID;`
			`Module *Mod;`
			`const AMDGPUSubtarget &ST;`
			`int LocalMemAvailable;`

			`public:`
			`AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),`
			`LocalMemAvailable(0) { }`
			`virtual bool doInitialization(Module &M);`
			`virtual bool runOnFunction(Function &F);`
			`virtual const char *getPassName() const {`
			`return "AMDGPU Promote Alloca";`
			`}`
			`void visitAlloca(AllocaInst &I);`
			`};`

			`} // End anonymous namespace`

			`char AMDGPUPromoteAlloca::ID = 0;`

			`bool AMDGPUPromoteAlloca::doInitialization(Module &M) {`
			`Mod = &M;`
			`return false;`
			`}`

			`bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {`

			`const FunctionType *FTy = F.getFunctionType();`

			`LocalMemAvailable = ST.getLocalMemorySize();`


			`// If the function has any arguments in the local address space, then it's`
			`// possible these arguments require the entire local memory space, so`
			`// we cannot use local memory in the pass.`
			`for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {`
			`const Type *ParamTy = FTy->getParamType(i);`
			`if (ParamTy->isPointerTy() &&`
			`ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {`
			`LocalMemAvailable = 0;`
			`DEBUG(dbgs() << "Function has local memory argument. Promoting to "`
			`"local memory disabled.\n");`
			`break;`
			`}`
			`}`

			`if (LocalMemAvailable > 0) {`
			`// Check how much local memory is being used by global objects`
			`for (Module::global_iterator I = Mod->global_begin(),`
			`E = Mod->global_end(); I != E; ++I) {`
			`GlobalVariable *GV = I;`
			`PointerType *GVTy = GV->getType();`
			`if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)`
			`continue;`
			`for (Value::use_iterator U = GV->use_begin(),`
			`UE = GV->use_end(); U != UE; ++U) {`
			`Instruction Use = dyn_cast<Instruction>(U);`
			`if (!Use)`
			`continue;`
			`if (Use->getParent()->getParent() == &F)`
			`LocalMemAvailable -=`
			`Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());`
			`}`
			`}`
			`}`

			`LocalMemAvailable = std::max(0, LocalMemAvailable);`
			`DEBUG(dbgs() << LocalMemAvailable << "bytes free in local memory.\n");`

			`visit(F);`

			`return false;`
			`}`

			`static VectorType arrayTypeToVecType(const Type ArrayTy) {`
			`return VectorType::get(ArrayTy->getArrayElementType(),`
			`ArrayTy->getArrayNumElements());`
			`}`

			`static Value* calculateVectorIndex(Value *Ptr,`
			`std::map<GetElementPtrInst, Value> GEPIdx) {`
			`if (isa<AllocaInst>(Ptr))`
			`return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));`

			`GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);`

			`return GEPIdx[GEP];`
			`}`

			`static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {`
			`// FIXME we only support simple cases`
			`if (GEP->getNumOperands() != 3)`
			`return NULL;`

			`ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));`
			`if (!I0 \|\| !I0->isZero())`
			`return NULL;`

			`return GEP->getOperand(2);`
			`}`

R600: Don't crash on unhandled instruction in promote alloca llvm-svn: 211906 2014-06-28 00:52:49 +08:00			`// Not an instruction handled below to turn into a vector.`
			`//`
			`// TODO: Check isTriviallyVectorizable for calls and handle other`
			`// instructions.`
			`static bool canVectorizeInst(Instruction *Inst) {`
			`switch (Inst->getOpcode()) {`
			`case Instruction::Load:`
			`case Instruction::Store:`
			`case Instruction::BitCast:`
			`case Instruction::AddrSpaceCast:`
			`return true;`
			`default:`
			`return false;`
			`}`
			`}`

R600: Use LDS and vectors for private memory llvm-svn: 211110 2014-06-18 00:53:14 +08:00			`static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {`
			`Type *AllocaTy = Alloca->getAllocatedType();`

			`DEBUG(dbgs() << "Alloca Candidate for vectorization \n");`

			`// FIXME: There is no reason why we can't support larger arrays, we`
			`// are just being conservative for now.`
			`if (!AllocaTy->isArrayTy() \|\|`
			`AllocaTy->getArrayElementType()->isVectorTy() \|\|`
			`AllocaTy->getArrayNumElements() > 4) {`

			`DEBUG(dbgs() << " Cannot convert type to vector");`
			`return false;`
			`}`

			`std::map<GetElementPtrInst, Value> GEPVectorIdx;`
			`std::vector<Value*> WorkList;`
			`for (User *AllocaUser : Alloca->users()) {`
			`GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);`
			`if (!GEP) {`
R600: Don't crash on unhandled instruction in promote alloca llvm-svn: 211906 2014-06-28 00:52:49 +08:00			`if (!canVectorizeInst(cast<Instruction>(AllocaUser)))`
			`return false;`

R600: Use LDS and vectors for private memory llvm-svn: 211110 2014-06-18 00:53:14 +08:00			`WorkList.push_back(AllocaUser);`
			`continue;`
			`}`

			`Value *Index = GEPToVectorIndex(GEP);`

			`// If we can't compute a vector index from this GEP, then we can't`
			`// promote this alloca to vector.`
			`if (!Index) {`
Fix missing newline and simplify debug printing. llvm-svn: 211850 2014-06-27 10:36:59 +08:00			`DEBUG(dbgs() << " Cannot compute vector index for GEP " << *GEP << '\n');`
R600: Use LDS and vectors for private memory llvm-svn: 211110 2014-06-18 00:53:14 +08:00			`return false;`
			`}`

			`GEPVectorIdx[GEP] = Index;`
			`for (User *GEPUser : AllocaUser->users()) {`
R600: Don't crash on unhandled instruction in promote alloca llvm-svn: 211906 2014-06-28 00:52:49 +08:00			`if (!canVectorizeInst(cast<Instruction>(GEPUser)))`
			`return false;`

R600: Use LDS and vectors for private memory llvm-svn: 211110 2014-06-18 00:53:14 +08:00			`WorkList.push_back(GEPUser);`
			`}`
			`}`

			`VectorType *VectorTy = arrayTypeToVecType(AllocaTy);`

Fix missing newline and simplify debug printing. llvm-svn: 211850 2014-06-27 10:36:59 +08:00			`DEBUG(dbgs() << " Converting alloca to vector "`
			`<< AllocaTy << " -> " << VectorTy << '\n');`
R600: Use LDS and vectors for private memory llvm-svn: 211110 2014-06-18 00:53:14 +08:00
			`for (std::vector<Value*>::iterator I = WorkList.begin(),`
			`E = WorkList.end(); I != E; ++I) {`
			`Instruction Inst = cast<Instruction>(I);`
			`IRBuilder<> Builder(Inst);`
			`switch (Inst->getOpcode()) {`
			`case Instruction::Load: {`
			`Value *Ptr = Inst->getOperand(0);`
			`Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);`
			`Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));`
			`Value *VecValue = Builder.CreateLoad(BitCast);`
			`Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);`
			`Inst->replaceAllUsesWith(ExtractElement);`
			`Inst->eraseFromParent();`
			`break;`
			`}`
			`case Instruction::Store: {`
			`Value *Ptr = Inst->getOperand(1);`
			`Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);`
			`Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));`
			`Value *VecValue = Builder.CreateLoad(BitCast);`
			`Value *NewVecValue = Builder.CreateInsertElement(VecValue,`
			`Inst->getOperand(0),`
			`Index);`
			`Builder.CreateStore(NewVecValue, BitCast);`
			`Inst->eraseFromParent();`
			`break;`
			`}`
			`case Instruction::BitCast:`
R600: Don't crash on unhandled instruction in promote alloca llvm-svn: 211906 2014-06-28 00:52:49 +08:00			`case Instruction::AddrSpaceCast:`
R600: Use LDS and vectors for private memory llvm-svn: 211110 2014-06-18 00:53:14 +08:00			`break;`

			`default:`
			`Inst->dump();`
R600: Don't crash on unhandled instruction in promote alloca llvm-svn: 211906 2014-06-28 00:52:49 +08:00			`llvm_unreachable("Inconsistency in instructions promotable to vector");`
R600: Use LDS and vectors for private memory llvm-svn: 211110 2014-06-18 00:53:14 +08:00			`}`
			`}`
			`return true;`
			`}`

			`static void collectUsesWithPtrTypes(Value Val, std::vector<Value> &WorkList) {`
			`for (User *User : Val->users()) {`
			`if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())`
			`continue;`
			`if (isa<CallInst>(User)) {`
			`WorkList.push_back(User);`
			`continue;`
			`}`
			`if (!User->getType()->isPointerTy())`
			`continue;`
			`WorkList.push_back(User);`
			`collectUsesWithPtrTypes(User, WorkList);`
			`}`
			`}`

			`void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {`
			`IRBuilder<> Builder(&I);`

			`// First try to replace the alloca with a vector`
			`Type *AllocaTy = I.getAllocatedType();`

Fix missing newline and simplify debug printing. llvm-svn: 211850 2014-06-27 10:36:59 +08:00			`DEBUG(dbgs() << "Trying to promote " << I << '\n');`
R600: Use LDS and vectors for private memory llvm-svn: 211110 2014-06-18 00:53:14 +08:00
			`if (tryPromoteAllocaToVector(&I))`
			`return;`

			`DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");`

			`// FIXME: This is the maximum work group size. We should try to get`
			`// value from the reqd_work_group_size function attribute if it is`
			`// available.`
			`unsigned WorkGroupSize = 256;`
			`int AllocaSize = WorkGroupSize *`
			`Mod->getDataLayout()->getTypeAllocSize(AllocaTy);`

			`if (AllocaSize > LocalMemAvailable) {`
			`DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");`
			`return;`
			`}`

			`DEBUG(dbgs() << "Promoting alloca to local memory\n");`
			`LocalMemAvailable -= AllocaSize;`

			`GlobalVariable *GV = new GlobalVariable(`
			`*Mod, ArrayType::get(I.getAllocatedType(), 256), false,`
			`GlobalValue::ExternalLinkage, 0, I.getName(), 0,`
			`GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);`

			`FunctionType *FTy = FunctionType::get(`
			`Type::getInt32Ty(Mod->getContext()), false);`
			`AttributeSet AttrSet;`
			`AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);`

			`Value *ReadLocalSizeY = Mod->getOrInsertFunction(`
			`"llvm.r600.read.local.size.y", FTy, AttrSet);`
			`Value *ReadLocalSizeZ = Mod->getOrInsertFunction(`
			`"llvm.r600.read.local.size.z", FTy, AttrSet);`
			`Value *ReadTIDIGX = Mod->getOrInsertFunction(`
			`"llvm.r600.read.tidig.x", FTy, AttrSet);`
			`Value *ReadTIDIGY = Mod->getOrInsertFunction(`
			`"llvm.r600.read.tidig.y", FTy, AttrSet);`
			`Value *ReadTIDIGZ = Mod->getOrInsertFunction(`
			`"llvm.r600.read.tidig.z", FTy, AttrSet);`


			`Value *TCntY = Builder.CreateCall(ReadLocalSizeY);`
			`Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);`
			`Value *TIdX = Builder.CreateCall(ReadTIDIGX);`
			`Value *TIdY = Builder.CreateCall(ReadTIDIGY);`
			`Value *TIdZ = Builder.CreateCall(ReadTIDIGZ);`

			`Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);`
			`Tmp0 = Builder.CreateMul(Tmp0, TIdX);`
			`Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);`
			`Value *TID = Builder.CreateAdd(Tmp0, Tmp1);`
			`TID = Builder.CreateAdd(TID, TIdZ);`

			`std::vector<Value*> Indices;`
			`Indices.push_back(Constant::getNullValue(Type::getInt32Ty(Mod->getContext())));`
			`Indices.push_back(TID);`

			`Value *Offset = Builder.CreateGEP(GV, Indices);`
			`I.mutateType(Offset->getType());`
			`I.replaceAllUsesWith(Offset);`
			`I.eraseFromParent();`

			`std::vector<Value*> WorkList;`

			`collectUsesWithPtrTypes(Offset, WorkList);`

			`for (std::vector<Value*>::iterator i = WorkList.begin(),`
			`e = WorkList.end(); i != e; ++i) {`
			`Value V = i;`
			`CallInst *Call = dyn_cast<CallInst>(V);`
			`if (!Call) {`
			`Type *EltTy = V->getType()->getPointerElementType();`
			`PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);`
			`V->mutateType(NewTy);`
			`continue;`
			`}`

			`IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);`
			`if (!Intr) {`
			`std::vector<Type*> ArgTypes;`
			`for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();`
			`ArgIdx != ArgEnd; ++ArgIdx) {`
			`ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());`
			`}`
			`Function *F = Call->getCalledFunction();`
			`FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,`
			`F->isVarArg());`
			`Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,`
			`F->getAttributes());`
			`Function *NewF = cast<Function>(C);`
			`Call->setCalledFunction(NewF);`
			`continue;`
			`}`

			`Builder.SetInsertPoint(Intr);`
			`switch (Intr->getIntrinsicID()) {`
			`case Intrinsic::lifetime_start:`
			`case Intrinsic::lifetime_end:`
			`// These intrinsics are for address space 0 only`
			`Intr->eraseFromParent();`
			`continue;`
			`case Intrinsic::memcpy: {`
			`MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);`
			`Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),`
			`MemCpy->getLength(), MemCpy->getAlignment(),`
			`MemCpy->isVolatile());`
			`Intr->eraseFromParent();`
			`continue;`
			`}`
			`case Intrinsic::memset: {`
			`MemSetInst *MemSet = cast<MemSetInst>(Intr);`
			`Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),`
			`MemSet->getLength(), MemSet->getAlignment(),`
			`MemSet->isVolatile());`
			`Intr->eraseFromParent();`
			`continue;`
			`}`
			`default:`
			`Intr->dump();`
			`llvm_unreachable("Don't know how to promote alloca intrinsic use.");`
			`}`
			`}`
			`}`

			`FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {`
			`return new AMDGPUPromoteAlloca(ST);`
			`}`