From f4e3f3e31cacd412bcef0906da3c6a4bd503d0a2 Mon Sep 17 00:00:00 2001 From: Rafael Espindola Date: Wed, 7 Feb 2018 18:09:35 +0000 Subject: [PATCH] Revert "AMDGPU: Add 32-bit constant address space" This reverts commit r324487. It broke clang tests. llvm-svn: 324494 --- llvm/docs/AMDGPUUsage.rst | 1 - llvm/lib/Target/AMDGPU/AMDGPU.h | 3 - .../lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp | 3 +- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 3 +- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 34 +-- .../AMDGPU/AMDGPUInstructionSelector.cpp | 6 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 1 - llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 +- .../Target/AMDGPU/SIMachineFunctionInfo.cpp | 8 +- .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 6 - llvm/lib/Target/AMDGPU/SMInstructions.td | 3 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 6 +- .../AMDGPU/constant-address-space-32bit.ll | 288 ------------------ 14 files changed, 19 insertions(+), 375 deletions(-) delete mode 100644 llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 21e9308e0593..ff22f2c35977 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -285,7 +285,6 @@ LLVM Address Space number is used throughout LLVM (for example, in LLVM IR). 3 Local (group/LDS) Local (group/LDS) Local (group/LDS) Local (group/LDS) 4 Generic (Flat) Region (GDS) Region (GDS) Constant 5 Region (GDS) Private (Scratch) Private (Scratch) Private (Scratch) - 6 Constant 32-bit Constant 32-bit Constant 32-bit Constant 32-bit ================== ================= ================= ================= ================= Current Default diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index 0b590c3c1228..0ddc43ad5033 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -224,9 +224,6 @@ struct AMDGPUAS { GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2) LOCAL_ADDRESS = 3, ///< Address space for local memory. - - CONSTANT_ADDRESS_32BIT = 6, ///< Address space for 32-bit constant memory - /// Address space for direct addressible parameter memory (CONST0) PARAM_D_ADDRESS = 6, /// Address space for indirect addressible parameter memory (VTX1) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp index fa52bbb9def8..392b011e387c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp @@ -115,8 +115,7 @@ bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal) { const Value *Base = GetUnderlyingObject(Loc.Ptr, DL); - if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS || - Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS_32BIT) { + if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) { return true; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index 0c30f0519322..b17b67167666 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -466,8 +466,7 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) { } bool AMDGPUCodeGenPrepare::visitLoadInst(LoadInst &I) { - if ((I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && + if (I.getPointerAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && canWidenScalarExtLoad(I)) { IRBuilder<> Builder(&I); Builder.SetCurrentDebugLocation(I.getDebugLoc()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index 192d4b0f1ef8..440f8b20d48c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -162,7 +162,6 @@ private: bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; - SDValue Expand32BitAddress(SDValue Addr) const; bool SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool &Imm) const; bool SelectSMRDImm(SDValue Addr, SDValue &SBase, SDValue &Offset) const; @@ -637,8 +636,7 @@ bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const { if (!N->readMem()) return false; if (CbId == -1) - return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT; + return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS; return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId; } @@ -1440,45 +1438,19 @@ bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode, return true; } -SDValue AMDGPUDAGToDAGISel::Expand32BitAddress(SDValue Addr) const { - if (Addr.getValueType() != MVT::i32) - return Addr; - - // Zero-extend a 32-bit address. - SDLoc SL(Addr); - - const MachineFunction &MF = CurDAG->getMachineFunction(); - const SIMachineFunctionInfo *Info = MF.getInfo(); - unsigned AddrHiVal = Info->get32BitAddressHighBits(); - SDValue AddrHi = CurDAG->getTargetConstant(AddrHiVal, SL, MVT::i32); - - const SDValue Ops[] = { - CurDAG->getTargetConstant(AMDGPU::SReg_64_XEXECRegClassID, SL, MVT::i32), - Addr, - CurDAG->getTargetConstant(AMDGPU::sub0, SL, MVT::i32), - SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, AddrHi), - 0), - CurDAG->getTargetConstant(AMDGPU::sub1, SL, MVT::i32), - }; - - return SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, SL, MVT::i64, - Ops), 0); -} - bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase, SDValue &Offset, bool &Imm) const { SDLoc SL(Addr); - if (CurDAG->isBaseWithConstantOffset(Addr)) { SDValue N0 = Addr.getOperand(0); SDValue N1 = Addr.getOperand(1); if (SelectSMRDOffset(N1, Offset, Imm)) { - SBase = Expand32BitAddress(N0); + SBase = N0; return true; } } - SBase = Expand32BitAddress(Addr); + SBase = Addr; Offset = CurDAG->getTargetConstant(0, SL, MVT::i32); Imm = true; return true; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 7cb6ef0648a3..b7f65c20507c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -229,9 +229,6 @@ static bool isInstrUniform(const MachineInstr &MI) { isa(Ptr) || isa(Ptr)) return true; - if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) - return true; - const Instruction *I = dyn_cast(Ptr); return I && I->getMetadata("amdgpu.uniform"); } @@ -296,8 +293,7 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I, if (!I.hasOneMemOperand()) return false; - if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS && - (*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT) + if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS) return false; if (!isInstrUniform(I)) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 1af1e10dac9b..b5d43af11f65 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -266,7 +266,7 @@ static StringRef computeDataLayout(const Triple &TT) { // 32-bit private, local, and region pointers. 64-bit global, constant and // flat. - return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32-p6:32:32" + return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32" "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128" "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index 00ff0308ba19..3ad099ca6866 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -237,7 +237,6 @@ unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { AMDGPUAS AS = ST->getAMDGPUAS(); if (AddrSpace == AS.GLOBAL_ADDRESS || AddrSpace == AS.CONSTANT_ADDRESS || - AddrSpace == AS.CONSTANT_ADDRESS_32BIT || AddrSpace == AS.FLAT_ADDRESS) return 128; if (AddrSpace == AS.LOCAL_ADDRESS || diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 6361c2c9ea94..83fe7e377bbf 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -900,8 +900,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, if (AS == AMDGPUASI.GLOBAL_ADDRESS) return isLegalGlobalAddressingMode(AM); - if (AS == AMDGPUASI.CONSTANT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { + if (AS == AMDGPUASI.CONSTANT_ADDRESS) { // If the offset isn't a multiple of 4, it probably isn't going to be // correctly aligned. // FIXME: Can we get the real alignment here? @@ -1024,8 +1023,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT, // If we have an uniform constant load, it still requires using a slow // buffer instruction if unaligned. if (IsFast) { - *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS || - AddrSpace == AMDGPUASI.CONSTANT_ADDRESS_32BIT) ? + *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ? (Align % 4 == 0) : true; } @@ -1068,8 +1066,7 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) { return AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT; + AS == AMDGPUASI.CONSTANT_ADDRESS; } bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS, @@ -4011,15 +4008,13 @@ void SITargetLowering::createDebuggerPrologueStackObjects( bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const { const Triple &TT = getTargetMachine().getTargetTriple(); - return (GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && + return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && AMDGPU::shouldEmitConstantsToTextSection(TT); } bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const { return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && + GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) && !shouldEmitFixup(GV) && !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV); } @@ -4396,8 +4391,7 @@ bool SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { // We can fold offsets for anything that doesn't require a GOT relocation. return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS || - GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && + GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) && !shouldEmitGOTReloc(GA->getGlobal()); } @@ -4450,7 +4444,6 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI, const GlobalValue *GV = GSD->getGlobal(); if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS && - GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS_32BIT && GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS && // FIXME: It isn't correct to rely on the type of the pointer. This should // be removed when address space 0 is 64-bit. @@ -5385,8 +5378,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS; unsigned NumElements = MemVT.getVectorNumElements(); - if (AS == AMDGPUASI.CONSTANT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT) { + if (AS == AMDGPUASI.CONSTANT_ADDRESS) { if (isMemOpUniform(Load)) return SDValue(); // Non-uniform loads will be selected to MUBUF instructions, so they @@ -5394,9 +5386,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // loads. // } - if (AS == AMDGPUASI.CONSTANT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || - AS == AMDGPUASI.GLOBAL_ADDRESS) { + if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) { if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) && !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load)) return SDValue(); @@ -5405,9 +5395,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { // loads. // } - if (AS == AMDGPUASI.CONSTANT_ADDRESS || - AS == AMDGPUASI.CONSTANT_ADDRESS_32BIT || - AS == AMDGPUASI.GLOBAL_ADDRESS || + if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { if (NumElements > 4) return SplitVectorLoad(Op, DAG); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 2534ad02478c..888d8f978aff 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -47,8 +47,7 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDZ(false), ImplicitBufferPtr(false), ImplicitArgPtr(false), - GITPtrHigh(0xffffffff), - HighBitsOf32BitAddress(0) { + GITPtrHigh(0xffffffff) { const SISubtarget &ST = MF.getSubtarget(); const Function &F = MF.getFunction(); FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F); @@ -165,11 +164,6 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) StringRef S = A.getValueAsString(); if (!S.empty()) S.consumeInteger(0, GITPtrHigh); - - A = F.getFnAttribute("amdgpu-32bit-address-high-bits"); - S = A.getValueAsString(); - if (!S.empty()) - S.consumeInteger(0, HighBitsOf32BitAddress); } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 6eed4fcd8ad8..63875c55df03 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -186,8 +186,6 @@ private: // current hardware only allows a 16 bit value. unsigned GITPtrHigh; - unsigned HighBitsOf32BitAddress; - MCPhysReg getNextUserSGPR() const { assert(NumSystemSGPRs == 0 && "System SGPRs must be added after user SGPRs"); return AMDGPU::SGPR0 + NumUserSGPRs; @@ -413,10 +411,6 @@ public: return GITPtrHigh; } - unsigned get32BitAddressHighBits() const { - return HighBitsOf32BitAddress; - } - unsigned getNumUserSGPRs() const { return NumUserSGPRs; } diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td index 7ee0af0877c8..8f347986eb8a 100644 --- a/llvm/lib/Target/AMDGPU/SMInstructions.td +++ b/llvm/lib/Target/AMDGPU/SMInstructions.td @@ -223,8 +223,7 @@ def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime> def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{ auto Ld = cast(N); return Ld->getAlignment() >= 4 && - (((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS || - Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS_32BIT) && + ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS && static_cast(getTargetLowering())->isMemOpUniform(N)) || (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && !Ld->isVolatile() && diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 0367ce724ce4..50311c241f2a 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -447,8 +447,7 @@ bool isGlobalSegment(const GlobalValue *GV) { } bool isReadOnlySegment(const GlobalValue *GV) { - return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS || - GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT; + return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS; } bool shouldEmitConstantsToTextSection(const Triple &TT) { @@ -917,9 +916,6 @@ bool isUniformMMO(const MachineMemOperand *MMO) { isa(Ptr) || isa(Ptr)) return true; - if (MMO->getAddrSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) - return true; - if (const Argument *Arg = dyn_cast(Ptr)) return isArgPassedInSGPR(Arg); diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll deleted file mode 100644 index 61ad224e4d75..000000000000 --- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll +++ /dev/null @@ -1,288 +0,0 @@ -; RUN: llc -march=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefixes=GCN,SICI,SI %s -; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN,SICI %s -; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s -; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,VIGFX9 %s - -; GCN-LABEL: {{^}}load_i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 -define amdgpu_vs float @load_i32(i32 addrspace(6)* inreg %p0, i32 addrspace(6)* inreg %p1) #0 { - %gep1 = getelementptr i32, i32 addrspace(6)* %p1, i64 2 - %r0 = load i32, i32 addrspace(6)* %p0 - %r1 = load i32, i32 addrspace(6)* %gep1 - %r = add i32 %r0, %r1 - %r2 = bitcast i32 %r to float - ret float %r2 -} - -; GCN-LABEL: {{^}}load_v2i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 -define amdgpu_vs <2 x float> @load_v2i32(<2 x i32> addrspace(6)* inreg %p0, <2 x i32> addrspace(6)* inreg %p1) #0 { - %gep1 = getelementptr <2 x i32>, <2 x i32> addrspace(6)* %p1, i64 2 - %r0 = load <2 x i32>, <2 x i32> addrspace(6)* %p0 - %r1 = load <2 x i32>, <2 x i32> addrspace(6)* %gep1 - %r = add <2 x i32> %r0, %r1 - %r2 = bitcast <2 x i32> %r to <2 x float> - ret <2 x float> %r2 -} - -; GCN-LABEL: {{^}}load_v4i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 -define amdgpu_vs <4 x float> @load_v4i32(<4 x i32> addrspace(6)* inreg %p0, <4 x i32> addrspace(6)* inreg %p1) #0 { - %gep1 = getelementptr <4 x i32>, <4 x i32> addrspace(6)* %p1, i64 2 - %r0 = load <4 x i32>, <4 x i32> addrspace(6)* %p0 - %r1 = load <4 x i32>, <4 x i32> addrspace(6)* %gep1 - %r = add <4 x i32> %r0, %r1 - %r2 = bitcast <4 x i32> %r to <4 x float> - ret <4 x float> %r2 -} - -; GCN-LABEL: {{^}}load_v8i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 -define amdgpu_vs <8 x float> @load_v8i32(<8 x i32> addrspace(6)* inreg %p0, <8 x i32> addrspace(6)* inreg %p1) #0 { - %gep1 = getelementptr <8 x i32>, <8 x i32> addrspace(6)* %p1, i64 2 - %r0 = load <8 x i32>, <8 x i32> addrspace(6)* %p0 - %r1 = load <8 x i32>, <8 x i32> addrspace(6)* %gep1 - %r = add <8 x i32> %r0, %r1 - %r2 = bitcast <8 x i32> %r to <8 x float> - ret <8 x float> %r2 -} - -; GCN-LABEL: {{^}}load_v16i32: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 -define amdgpu_vs <16 x float> @load_v16i32(<16 x i32> addrspace(6)* inreg %p0, <16 x i32> addrspace(6)* inreg %p1) #0 { - %gep1 = getelementptr <16 x i32>, <16 x i32> addrspace(6)* %p1, i64 2 - %r0 = load <16 x i32>, <16 x i32> addrspace(6)* %p0 - %r1 = load <16 x i32>, <16 x i32> addrspace(6)* %gep1 - %r = add <16 x i32> %r0, %r1 - %r2 = bitcast <16 x i32> %r to <16 x float> - ret <16 x float> %r2 -} - -; GCN-LABEL: {{^}}load_float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; SICI-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x2 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[0:1], 0x0 -; VIGFX9-DAG: s_load_dword s{{[0-9]}}, s[2:3], 0x8 -define amdgpu_vs float @load_float(float addrspace(6)* inreg %p0, float addrspace(6)* inreg %p1) #0 { - %gep1 = getelementptr float, float addrspace(6)* %p1, i64 2 - %r0 = load float, float addrspace(6)* %p0 - %r1 = load float, float addrspace(6)* %gep1 - %r = fadd float %r0, %r1 - ret float %r -} - -; GCN-LABEL: {{^}}load_v2float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x4 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx2 s[{{.*}}], s[2:3], 0x10 -define amdgpu_vs <2 x float> @load_v2float(<2 x float> addrspace(6)* inreg %p0, <2 x float> addrspace(6)* inreg %p1) #0 { - %gep1 = getelementptr <2 x float>, <2 x float> addrspace(6)* %p1, i64 2 - %r0 = load <2 x float>, <2 x float> addrspace(6)* %p0 - %r1 = load <2 x float>, <2 x float> addrspace(6)* %gep1 - %r = fadd <2 x float> %r0, %r1 - ret <2 x float> %r -} - -; GCN-LABEL: {{^}}load_v4float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x8 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx4 s[{{.*}}], s[2:3], 0x20 -define amdgpu_vs <4 x float> @load_v4float(<4 x float> addrspace(6)* inreg %p0, <4 x float> addrspace(6)* inreg %p1) #0 { - %gep1 = getelementptr <4 x float>, <4 x float> addrspace(6)* %p1, i64 2 - %r0 = load <4 x float>, <4 x float> addrspace(6)* %p0 - %r1 = load <4 x float>, <4 x float> addrspace(6)* %gep1 - %r = fadd <4 x float> %r0, %r1 - ret <4 x float> %r -} - -; GCN-LABEL: {{^}}load_v8float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x10 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx8 s[{{.*}}], s[2:3], 0x40 -define amdgpu_vs <8 x float> @load_v8float(<8 x float> addrspace(6)* inreg %p0, <8 x float> addrspace(6)* inreg %p1) #0 { - %gep1 = getelementptr <8 x float>, <8 x float> addrspace(6)* %p1, i64 2 - %r0 = load <8 x float>, <8 x float> addrspace(6)* %p0 - %r1 = load <8 x float>, <8 x float> addrspace(6)* %gep1 - %r = fadd <8 x float> %r0, %r1 - ret <8 x float> %r -} - -; GCN-LABEL: {{^}}load_v16float: -; GCN-DAG: s_mov_b32 s3, 0 -; GCN-DAG: s_mov_b32 s2, s1 -; GCN-DAG: s_mov_b32 s1, s3 -; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; SICI-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x20 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[0:1], 0x0 -; VIGFX9-DAG: s_load_dwordx16 s[{{.*}}], s[2:3], 0x80 -define amdgpu_vs <16 x float> @load_v16float(<16 x float> addrspace(6)* inreg %p0, <16 x float> addrspace(6)* inreg %p1) #0 { - %gep1 = getelementptr <16 x float>, <16 x float> addrspace(6)* %p1, i64 2 - %r0 = load <16 x float>, <16 x float> addrspace(6)* %p0 - %r1 = load <16 x float>, <16 x float> addrspace(6)* %gep1 - %r = fadd <16 x float> %r0, %r1 - ret <16 x float> %r -} - -; GCN-LABEL: {{^}}load_i32_hi0: -; GCN: s_mov_b32 s1, 0 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 -define amdgpu_vs i32 @load_i32_hi0(i32 addrspace(6)* inreg %p) #1 { - %r0 = load i32, i32 addrspace(6)* %p - ret i32 %r0 -} - -; GCN-LABEL: {{^}}load_i32_hi1: -; GCN: s_mov_b32 s1, 1 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 -define amdgpu_vs i32 @load_i32_hi1(i32 addrspace(6)* inreg %p) #2 { - %r0 = load i32, i32 addrspace(6)* %p - ret i32 %r0 -} - -; GCN-LABEL: {{^}}load_i32_hiffff8000: -; GCN: s_movk_i32 s1, 0x8000 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 -define amdgpu_vs i32 @load_i32_hiffff8000(i32 addrspace(6)* inreg %p) #3 { - %r0 = load i32, i32 addrspace(6)* %p - ret i32 %r0 -} - -; GCN-LABEL: {{^}}load_i32_hifffffff0: -; GCN: s_mov_b32 s1, -16 -; GCN-NEXT: s_load_dword s0, s[0:1], 0x0 -define amdgpu_vs i32 @load_i32_hifffffff0(i32 addrspace(6)* inreg %p) #4 { - %r0 = load i32, i32 addrspace(6)* %p - ret i32 %r0 -} - -; GCN-LABEL: {{^}}load_sampler -; GCN: v_readfirstlane_b32 -; GCN-NEXT: v_readfirstlane_b32 -; SI: s_nop -; GCN-NEXT: s_load_dwordx8 -; GCN-NEXT: s_load_dwordx4 -; GCN: image_sample -define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler([0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 { -main_body: - %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8 - %23 = bitcast float %22 to i32 - %24 = shl i32 %23, 1 - %25 = getelementptr [0 x <8 x i32>], [0 x <8 x i32>] addrspace(6)* %1, i32 0, i32 %24, !amdgpu.uniform !0 - %26 = load <8 x i32>, <8 x i32> addrspace(6)* %25, align 32, !invariant.load !0 - %27 = shl i32 %23, 2 - %28 = or i32 %27, 3 - %29 = bitcast [0 x <8 x i32>] addrspace(6)* %1 to [0 x <4 x i32>] addrspace(6)* - %30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(6)* %29, i32 0, i32 %28, !amdgpu.uniform !0 - %31 = load <4 x i32>, <4 x i32> addrspace(6)* %30, align 16, !invariant.load !0 - %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8 - %33 = extractelement <4 x float> %32, i32 0 - %34 = extractelement <4 x float> %32, i32 1 - %35 = extractelement <4 x float> %32, i32 2 - %36 = extractelement <4 x float> %32, i32 3 - %37 = bitcast float %4 to i32 - %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %37, 4 - %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 5 - %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 6 - %41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 7 - %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %36, 8 - %43 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42, float %20, 19 - ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %43 -} - -; GCN-LABEL: {{^}}load_sampler_nouniform -; GCN: v_readfirstlane_b32 -; GCN-NEXT: v_readfirstlane_b32 -; SI: s_nop -; GCN-NEXT: s_load_dwordx8 -; GCN-NEXT: s_load_dwordx4 -; GCN: image_sample -define amdgpu_ps <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @load_sampler_nouniform([0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <4 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), [0 x <8 x i32>] addrspace(6)* inreg noalias dereferenceable(18446744073709551615), float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #5 { -main_body: - %22 = call nsz float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %5) #8 - %23 = bitcast float %22 to i32 - %24 = shl i32 %23, 1 - %25 = getelementptr [0 x <8 x i32>], [0 x <8 x i32>] addrspace(6)* %1, i32 0, i32 %24 - %26 = load <8 x i32>, <8 x i32> addrspace(6)* %25, align 32, !invariant.load !0 - %27 = shl i32 %23, 2 - %28 = or i32 %27, 3 - %29 = bitcast [0 x <8 x i32>] addrspace(6)* %1 to [0 x <4 x i32>] addrspace(6)* - %30 = getelementptr [0 x <4 x i32>], [0 x <4 x i32>] addrspace(6)* %29, i32 0, i32 %28 - %31 = load <4 x i32>, <4 x i32> addrspace(6)* %30, align 16, !invariant.load !0 - %32 = call nsz <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> zeroinitializer, <8 x i32> %26, <4 x i32> %31, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #8 - %33 = extractelement <4 x float> %32, i32 0 - %34 = extractelement <4 x float> %32, i32 1 - %35 = extractelement <4 x float> %32, i32 2 - %36 = extractelement <4 x float> %32, i32 3 - %37 = bitcast float %4 to i32 - %38 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, i32 %37, 4 - %39 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %38, float %33, 5 - %40 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %39, float %34, 6 - %41 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %40, float %35, 7 - %42 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %41, float %36, 8 - %43 = insertvalue <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %42, float %20, 19 - ret <{ i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %43 -} - -; Function Attrs: nounwind readnone speculatable -declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #6 - -; Function Attrs: nounwind readonly -declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #7 - - -!0 = !{} - -attributes #0 = { nounwind } -attributes #1 = { nounwind "amdgpu-32bit-address-high-bits"="0" } -attributes #2 = { nounwind "amdgpu-32bit-address-high-bits"="1" } -attributes #3 = { nounwind "amdgpu-32bit-address-high-bits"="0xffff8000" } -attributes #4 = { nounwind "amdgpu-32bit-address-high-bits"="0xfffffff0" } -attributes #5 = { "InitialPSInputAddr"="45175" } -attributes #6 = { nounwind readnone speculatable } -attributes #7 = { nounwind readonly } -attributes #8 = { nounwind readnone }