AMDGPU: Don't use MUBUF vaddr if address may overflow
Effectively revert r263964. Before that change, this was not allowed unless vaddr was known to be positive.

llvm-svn: 318240
commit 45b98189bd
parent 45cabacd2f
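In short, SelectMUBUFScratchOffen now folds a constant into the MUBUF immediate offset only when the sign bit of vaddr is known to be zero, because the hardware computes vaddr + soffset + offset and a negative vaddr would make that sum wrap. A minimal IR sketch of the case being avoided, modeled on the neg_vaddr_offset test added below (the function name and constants here are only illustrative):

define amdgpu_kernel void @store_with_possibly_negative_index(i32 %offset) {
entry:
  ; %idx may be negative, so the scratch address cannot be assumed positive and
  ; the 16-byte constant part of the address is materialized with v_add_i32
  ; instead of being folded into the MUBUF offset field.
  %array = alloca [8192 x i32]
  %idx = add i32 %offset, 4
  %ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 %idx
  store i32 0, i32* %ptr
  ret void
}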
@@ -346,6 +346,13 @@ def FeatureMaxPrivateElementSize4 : FeatureMaxPrivateElementSize<4>;
def FeatureMaxPrivateElementSize8 : FeatureMaxPrivateElementSize<8>;
def FeatureMaxPrivateElementSize16 : FeatureMaxPrivateElementSize<16>;

def FeatureEnableHugePrivateBuffer : SubtargetFeature<
  "huge-private-buffer",
  "EnableHugePrivateBuffer",
  "true",
  "Enable private/scratch buffer sizes greater than 128 GB"
>;

def FeatureVGPRSpilling : SubtargetFeature<"vgpr-spilling",
  "EnableVGPRSpilling",
  "true",
@@ -1160,8 +1160,13 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
    SDValue N1 = Addr.getOperand(1);

    // Offsets in vaddr must be positive.
    //
    // The total computation of vaddr + soffset + offset must not overflow.
    // If vaddr is negative, even if offset is 0 the sgpr offset add will end up
    // overflowing.
    ConstantSDNode *C1 = cast<ConstantSDNode>(N1);
    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue())) {
    if (SIInstrInfo::isLegalMUBUFImmOffset(C1->getZExtValue()) &&
        CurDAG->SignBitIsZero(N0)) {
      std::tie(VAddr, SOffset) = foldFrameIndex(N0);
      ImmOffset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i16);
      return true;
@@ -121,6 +121,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
    DebuggerReserveRegs(false),
    DebuggerEmitPrologue(false),

    EnableHugePrivateBuffer(false),
    EnableVGPRSpilling(false),
    EnablePromoteAlloca(false),
    EnableLoadStoreOpt(false),
@@ -130,6 +130,7 @@ protected:
  bool DebuggerEmitPrologue;

  // Used as options.
  bool EnableHugePrivateBuffer;
  bool EnableVGPRSpilling;
  bool EnablePromoteAlloca;
  bool EnableLoadStoreOpt;
@@ -351,6 +352,10 @@ public:
    return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
  }

  bool enableHugePrivateBuffer() const {
    return EnableHugePrivateBuffer;
  }

  bool isPromoteAllocaEnabled() const {
    return EnablePromoteAlloca;
  }
@@ -94,6 +94,12 @@ static cl::opt<bool> EnableVGPRIndexMode(
  cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
  cl::init(false));

static cl::opt<unsigned> AssumeFrameIndexHighZeroBits(
  "amdgpu-frame-index-zero-bits",
  cl::desc("High bits of frame index assumed to be zero"),
  cl::init(5),
  cl::ReallyHidden);

static unsigned findFirstFreeSGPR(CCState &CCInfo) {
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -1600,6 +1606,17 @@ SDValue SITargetLowering::LowerFormalArguments(
    Reg = MF.addLiveIn(Reg, RC);
    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);

    if (Arg.Flags.isSRet() && !getSubtarget()->enableHugePrivateBuffer()) {
      // The return object should be reasonably addressable.

      // FIXME: This helps when the return is a real sret. If it is a
      // automatically inserted sret (i.e. CanLowerReturn returns false), an
      // extra copy is inserted in SelectionDAGBuilder which obscures this.
      unsigned NumBits = 32 - AssumeFrameIndexHighZeroBits;
      Val = DAG.getNode(ISD::AssertZext, DL, VT, Val,
        DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
    }

    // If this is an 8 or 16-bit value, it is really passed promoted
    // to 32 bits. Insert an assert[sz]ext to capture this, then
    // truncate to the right size.
@@ -3216,7 +3233,6 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::FP_ROUND:
    return lowerFP_ROUND(Op, DAG);

  case ISD::TRAP:
  case ISD::DEBUGTRAP:
    return lowerTRAP(Op, DAG);
@@ -6997,3 +7013,21 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {

  TargetLoweringBase::finalizeLowering(MF);
}

void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
                                                     KnownBits &Known,
                                                     const APInt &DemandedElts,
                                                     const SelectionDAG &DAG,
                                                     unsigned Depth) const {
  TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
                                                DAG, Depth);

  if (getSubtarget()->enableHugePrivateBuffer())
    return;

  // Technically it may be possible to have a dispatch with a single workitem
  // that uses the full private memory size, but that's not really useful. We
  // can't use vaddr in MUBUF instructions if we don't know the address
  // calculation won't overflow, so assume the sign bit is never set.
  Known.Zero.setHighBits(AssumeFrameIndexHighZeroBits);
}
@@ -277,6 +277,12 @@ public:
                            SDValue V) const;

  void finalizeLowering(MachineFunction &MF) const override;

  void computeKnownBitsForFrameIndex(const SDValue Op,
                                     KnownBits &Known,
                                     const APInt &DemandedElts,
                                     const SelectionDAG &DAG,
                                     unsigned Depth = 0) const override;
};

} // End namespace llvm
@@ -1,12 +1,12 @@
; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE %s
; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA %s
; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck -enable-var-scope -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC %s

; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=HSAOPT -check-prefix=OPT %s
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=NOHSAOPT -check-prefix=OPT %s
; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -enable-var-scope -check-prefix=HSAOPT -check-prefix=OPT %s
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -enable-var-scope -check-prefix=NOHSAOPT -check-prefix=OPT %s

; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s -check-prefix=R600 -check-prefix=FUNC
@@ -391,7 +391,8 @@ entry:
; FUNC-LABEL: ptrtoint:
; SI-NOT: ds_write
; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ;
; SI: v_add_i32_e32 [[ADD_OFFSET:v[0-9]+]], vcc, 5,
; SI: buffer_load_dword v{{[0-9]+}}, [[ADD_OFFSET:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
  %alloca = alloca [16 x i32]
  %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
@@ -30,14 +30,14 @@ define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias
  %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
  %a_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inA, i32 %tid
  %b_ptr = getelementptr inbounds i32, i32 addrspace(1)* %inB, i32 %tid
  %a = load i32, i32 addrspace(1)* %a_ptr
  %b = load i32, i32 addrspace(1)* %b_ptr
  %a = load i32, i32 addrspace(1)* %a_ptr, !range !0
  %b = load i32, i32 addrspace(1)* %b_ptr, !range !0
  %result = add i32 %a, %b
  %alloca_ptr = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 1, i32 %b
  store i32 %result, i32* %alloca_ptr, align 4
  ; Dummy call
  call void @llvm.amdgcn.s.barrier()
  %reload = load i32, i32* %alloca_ptr, align 4
  %reload = load i32, i32* %alloca_ptr, align 4, !range !0
  %out_ptr = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
  store i32 %reload, i32 addrspace(1)* %out_ptr, align 4
  ret void
@@ -46,3 +46,5 @@ define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias
attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind convergent }

!0 = !{i32 0, i32 65536 }
@@ -101,6 +101,9 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #
  ret void
}

; FIXME: Should be able to see that this can use vaddr, but the
; FrameIndex is hidden behind a CopyFromReg in the second block.

; GCN-LABEL: {{^}}void_func_byval_struct_i8_i32_ptr_nonentry_block:
; GCN: s_sub_u32 [[SUB_OFFSET:s[0-9]+]], s5, s4
; GCN: v_lshr_b32_e64 [[SHIFT:v[0-9]+]], [[SUB_OFFSET]], 6
@@ -108,7 +111,7 @@ define void @void_func_byval_struct_i8_i32_ptr_value({ i8, i32 }* byval %arg0) #
; GCN: s_and_saveexec_b64

; GCN: v_add_i32_e32 v0, vcc, 4, [[ADD]]
; GCN: buffer_load_dword v1, v1, s[0:3], s4 offen offset:4
; GCN: buffer_load_dword v1, v0, s[0:3], s4 offen{{$}}
; GCN: ds_write_b32
define void @void_func_byval_struct_i8_i32_ptr_nonentry_block({ i8, i32 }* byval %arg0, i32 %arg2) #0 {
  %cmp = icmp eq i32 %arg2, 0
@@ -195,4 +198,23 @@ bb5:
  ret void
}

; GCN-LABEL: {{^}}alloca_ptr_nonentry_block:
; GCN: s_and_saveexec_b64
; GCN: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s5 offset:12
define void @alloca_ptr_nonentry_block(i32 %arg0) #0 {
  %alloca0 = alloca { i8, i32 }, align 4
  %cmp = icmp eq i32 %arg0, 0
  br i1 %cmp, label %bb, label %ret

bb:
  %gep0 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %alloca0, i32 0, i32 0
  %gep1 = getelementptr inbounds { i8, i32 }, { i8, i32 }* %alloca0, i32 0, i32 1
  %load1 = load volatile i32, i32* %gep1
  store volatile i32* %gep1, i32* addrspace(3)* undef
  br label %ret

ret:
  ret void
}

attributes #0 = { nounwind }
@@ -385,40 +385,116 @@ define void @void_func_sret_struct_i8_i32({ i8, i32 }* sret %arg0) #0 {
  ret void
}

; FIXME: Should be able to fold offsets in all of these. Call lowering
; introduces an extra CopyToReg/CopyFromReg obscuring the AssertZext
; inserted. Not using it introduces the spills.

; GCN-LABEL: {{^}}v33i32_func_void:
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill

; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_4:v[0-9]+]], vcc, 4, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_4]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_8:v[0-9]+]], vcc, 8, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_8]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_12:v[0-9]+]], vcc, 12, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_12]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_16:v[0-9]+]], vcc, 16, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_16]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_20:v[0-9]+]], vcc, 20, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_20]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_24:v[0-9]+]], vcc, 24, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_24]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_28:v[0-9]+]], vcc, 28, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_28]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_32:v[0-9]+]], vcc, 32, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_32]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_36:v[0-9]+]], vcc, 36, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_36]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_40:v[0-9]+]], vcc, 40, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_40]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_44:v[0-9]+]], vcc, 44, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_44]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_48:v[0-9]+]], vcc, 48, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_48]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_52:v[0-9]+]], vcc, 52, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_52]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_56:v[0-9]+]], vcc, 56, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_56]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_60:v[0-9]+]], vcc, 60, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_60]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_64:v[0-9]+]], vcc, 64, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_64]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_68:v[0-9]+]], vcc, 0x44, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_68]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_72:v[0-9]+]], vcc, 0x48, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_72]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_76:v[0-9]+]], vcc, 0x4c, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_76]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_80:v[0-9]+]], vcc, 0x50, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_80]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_84:v[0-9]+]], vcc, 0x54, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_84]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_88:v[0-9]+]], vcc, 0x58, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_88]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_92:v[0-9]+]], vcc, 0x5c, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_92]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_96:v[0-9]+]], vcc, 0x60, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_96]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_100:v[0-9]+]], vcc, 0x64, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_100]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_104:v[0-9]+]], vcc, 0x68, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_104]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_108:v[0-9]+]], vcc, 0x6c, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_108]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_112:v[0-9]+]], vcc, 0x70, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_112]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_116:v[0-9]+]], vcc, 0x74, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_116]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_120:v[0-9]+]], vcc, 0x78, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_120]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_124:v[0-9]+]], vcc, 0x7c, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_124]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_128:v[0-9]+]], vcc, 0x80, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_128]], s[0:3], s4 offen{{$}}

; GCN: buffer_load_dword v34
; GCN: buffer_load_dword v33
; GCN: buffer_load_dword v32
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define <33 x i32> @v33i32_func_void() #0 {
@@ -428,39 +504,111 @@ define <33 x i32> @v33i32_func_void() #0 {
}

; GCN-LABEL: {{^}}struct_v32i32_i32_func_void:
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:12 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_store_dword v34, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill

; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:8{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:12{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:16{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:20{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:24{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:28{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:32{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:36{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:40{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:44{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:48{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:52{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:56{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:60{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:64{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:68{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:72{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:76{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:80{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:84{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:88{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:92{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:96{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:100{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:104{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:108{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:112{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:116{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:120{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:124{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_4:v[0-9]+]], vcc, 4, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_4]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_8:v[0-9]+]], vcc, 8, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_8]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_12:v[0-9]+]], vcc, 12, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_12]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_16:v[0-9]+]], vcc, 16, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_16]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_20:v[0-9]+]], vcc, 20, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_20]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_24:v[0-9]+]], vcc, 24, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_24]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_28:v[0-9]+]], vcc, 28, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_28]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_32:v[0-9]+]], vcc, 32, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_32]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_36:v[0-9]+]], vcc, 36, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_36]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_40:v[0-9]+]], vcc, 40, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_40]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_44:v[0-9]+]], vcc, 44, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_44]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_48:v[0-9]+]], vcc, 48, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_48]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_52:v[0-9]+]], vcc, 52, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_52]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_56:v[0-9]+]], vcc, 56, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_56]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_60:v[0-9]+]], vcc, 60, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_60]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_64:v[0-9]+]], vcc, 64, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_64]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_68:v[0-9]+]], vcc, 0x44, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_68]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_72:v[0-9]+]], vcc, 0x48, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_72]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_76:v[0-9]+]], vcc, 0x4c, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_76]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_80:v[0-9]+]], vcc, 0x50, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_80]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_84:v[0-9]+]], vcc, 0x54, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_84]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_88:v[0-9]+]], vcc, 0x58, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_88]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_92:v[0-9]+]], vcc, 0x5c, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_92]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_96:v[0-9]+]], vcc, 0x60, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_96]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_100:v[0-9]+]], vcc, 0x64, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_100]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_104:v[0-9]+]], vcc, 0x68, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_104]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_108:v[0-9]+]], vcc, 0x6c, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_108]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_112:v[0-9]+]], vcc, 0x70, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_112]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_116:v[0-9]+]], vcc, 0x74, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_116]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_120:v[0-9]+]], vcc, 0x78, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_120]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_124:v[0-9]+]], vcc, 0x7c, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_124]], s[0:3], s4 offen{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_128:v[0-9]+]], vcc, 0x80, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_128]], s[0:3], s4 offen{{$}}

; GCN: buffer_load_dword v34
; GCN: buffer_load_dword v33
; GCN: buffer_load_dword v32
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
@@ -470,39 +618,20 @@ define { <32 x i32>, i32 } @struct_v32i32_i32_func_void() #0 {
}

; GCN-LABEL: {{^}}struct_i32_v32i32_func_void:
; GCN: buffer_store_dword v32, off, s[0:3], s5 offset:8 ; 4-byte Folded Spill
; GCN: buffer_store_dword v33, off, s[0:3], s5 offset:4 ; 4-byte Folded Spill

; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:128{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:132{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:136{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:140{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:144{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:148{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:152{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:156{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:160{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:164{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:168{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:172{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:176{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:180{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:184{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:188{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:192{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:196{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:200{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:204{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:208{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:212{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:216{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:220{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:224{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:228{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:232{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:236{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:240{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:244{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:248{{$}}
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v0, s[0:3], s4 offen offset:252{{$}}

; GCN-DAG: v_add_i32_e32 [[ADD_128:v[0-9]+]], vcc, 0x80, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_128]], s[0:3], s4 offen{{$}}


; GCN-DAG: v_add_i32_e32 [[ADD_256:v[0-9]+]], vcc, 0xfc, v0
; GCN-DAG: buffer_store_dword v{{[0-9]+}}, [[ADD_256]], s[0:3], s4 offen{{$}}

; GCN: buffer_load_dword v33
; GCN: buffer_load_dword v32
; GCN: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64
define { i32, <32 x i32> } @struct_i32_v32i32_func_void() #0 {
@@ -0,0 +1,31 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_small:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
; GCN-NOT: [[FI]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[FI]]
define amdgpu_kernel void @scratch_buffer_known_high_bit_small() #0 {
  %alloca = alloca i32, align 4
  store volatile i32 0, i32* %alloca
  %toint = ptrtoint i32* %alloca to i32
  %masked = and i32 %toint, 2147483647
  store volatile i32 %masked, i32 addrspace(1)* undef
  ret void
}

; GCN-LABEL: {{^}}scratch_buffer_known_high_bit_huge:
; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4
; GCN-DAG: buffer_store_dword
; GCN-DAG: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x7ffffffc, [[FI]]
; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[MASKED]]
define amdgpu_kernel void @scratch_buffer_known_high_bit_huge() #1 {
  %alloca = alloca i32, align 4
  store volatile i32 0, i32* %alloca
  %toint = ptrtoint i32* %alloca to i32
  %masked = and i32 %toint, 2147483647
  store volatile i32 %masked, i32 addrspace(1)* undef
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind "target-features"="+huge-private-buffer" }
@@ -295,16 +295,16 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:4094{{$}}
; GFX9: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: s_setpc_b64

; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg(i16* %in, i16 %reg) #0 {
; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg(i16* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16* %in, i64 2047
  %gep = getelementptr inbounds i16, i16* %in, i64 2045
  %load = load i16, i16* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 1
@@ -314,16 +314,16 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:4094{{$}}
; GFX9: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: s_setpc_b64

; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg(half* %in, half %reg) #0 {
; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
define void @load_private_hi_v2f16_reglo_vreg(half* byval %in, half %reg) #0 {
entry:
  %gep = getelementptr inbounds half, half* %in, i64 2047
  %gep = getelementptr inbounds half, half* %in, i64 2045
  %load = load half, half* %gep
  %build0 = insertelement <2 x half> undef, half %reg, i32 0
  %build1 = insertelement <2 x half> %build0, half %load, i32 1
@@ -333,14 +333,14 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_nooff:
; GCN: s_waitcnt
; GFX9-NEXT: buffer_load_short_d16_hi v1, off, s[0:3], s4 offset:4094{{$}}
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX9: buffer_load_short_d16_hi v0, off, s[0:3], s4 offset:4094{{$}}
; GFX9: s_waitcnt
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: s_setpc_b64

; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s4 offset:4094{{$}}
define void @load_private_hi_v2i16_reglo_vreg_nooff(i16* %in, i16 %reg) #0 {
define void @load_private_hi_v2i16_reglo_vreg_nooff(i16* byval %in, i16 %reg) #0 {
entry:
  %load = load volatile i16, i16* inttoptr (i32 4094 to i16*)
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -369,16 +369,16 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX9-NEXT: buffer_load_ubyte_d16_hi v1, v0, s[0:3], s4 offen offset:2047{{$}}
; GFX9: buffer_load_ubyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: s_setpc_b64

; VI: buffer_load_ubyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}}
define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8* %in, i16 %reg) #0 {
; VI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_zexti8(i8* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8* %in, i64 2047
  %gep = getelementptr inbounds i8, i8* %in, i64 4091
  %load = load i8, i8* %gep
  %ext = zext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -389,16 +389,16 @@ entry:

; GCN-LABEL: {{^}}load_private_hi_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX9-NEXT: buffer_load_sbyte_d16_hi v1, v0, s[0:3], s4 offen offset:2047{{$}}
; GFX9: buffer_load_sbyte_d16_hi v0, off, s[0:3], s5 offset:4095{{$}}
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: s_setpc_b64

; VI: buffer_load_sbyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}}
define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8* %in, i16 %reg) #0 {
; VI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
define void @load_private_hi_v2i16_reglo_vreg_sexti8(i8* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i8, i8* %in, i64 2047
  %gep = getelementptr inbounds i8, i8* %in, i64 4091
  %load = load i8, i8* %gep
  %ext = sext i8 %load to i16
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 0
@@ -641,13 +641,12 @@ entry:
; FIXME: Is there a cost to using the extload over not?
; GCN-LABEL: {{^}}load_private_v2i16_split:
; GCN: s_waitcnt
; GFX9-NEXT: buffer_load_ushort v1, v0, s[0:3], s4 offen{{$}}
; GFX9: buffer_load_ushort v0, off, s[0:3], s5 offset:4{{$}}
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: buffer_load_short_d16_hi v1, v0, s[0:3], s4 offen offset:2
; GFX9-NEXT: buffer_load_short_d16_hi v0, off, s[0:3], s5 offset:6
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: v_mov_b32_e32 v0, v1
; GFX9-NEXT: s_setpc_b64
define <2 x i16> @load_private_v2i16_split(i16* %in) #0 {
define <2 x i16> @load_private_v2i16_split(i16* byval %in) #0 {
entry:
  %gep = getelementptr inbounds i16, i16* %in, i32 1
  %load0 = load volatile i16, i16* %in
@@ -340,17 +340,17 @@ entry:

; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg:
; GCN: s_waitcnt
; GFX9-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:4094{{$}}
; GFX9: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}}
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: s_setpc_b64

; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
define void @load_private_lo_v2i16_reglo_vreg(i16* %in, i32 %reg) #0 {
; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
define void @load_private_lo_v2i16_reglo_vreg(i16* byval %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i16, i16* %in, i64 2047
  %gep = getelementptr inbounds i16, i16* %in, i64 2045
  %load = load i16, i16* %gep
  %build1 = insertelement <2 x i16> %reg.bc, i16 %load, i32 0
  store <2 x i16> %build1, <2 x i16> addrspace(1)* undef
@@ -359,7 +359,7 @@ entry:

; GCN-LABEL: {{^}}load_private_lo_v2i16_reghi_vreg:
; GCN: s_waitcnt
; GFX9-NEXT: buffer_load_ushort v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s4 offen offset:4094{{$}}
; GFX9: buffer_load_ushort v1, off, s[0:3], s5 offset:4094{{$}}
; GFX9-NEXT: s_waitcnt
; GFX9: v_and_b32
; GFX9: v_lshl_or_b32
@@ -368,10 +368,10 @@ entry:
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: s_setpc_b64

; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
define void @load_private_lo_v2i16_reghi_vreg(i16* %in, i16 %reg) #0 {
; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
define void @load_private_lo_v2i16_reghi_vreg(i16* byval %in, i16 %reg) #0 {
entry:
  %gep = getelementptr inbounds i16, i16* %in, i64 2047
  %gep = getelementptr inbounds i16, i16* %in, i64 2045
  %load = load i16, i16* %gep
  %build0 = insertelement <2 x i16> undef, i16 %reg, i32 1
  %build1 = insertelement <2 x i16> %build0, i16 %load, i32 0
@@ -381,17 +381,17 @@ entry:

; GCN-LABEL: {{^}}load_private_lo_v2f16_reglo_vreg:
; GCN: s_waitcnt
; GFX9-NEXT: buffer_load_short_d16 v1, v0, s[0:3], s4 offen offset:4094{{$}}
; GFX9: buffer_load_short_d16 v0, off, s[0:3], s5 offset:4094{{$}}
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: s_setpc_b64

; VI: buffer_load_ushort v{{[0-9]+}}, v0, s[0:3], s4 offen offset:4094{{$}}
define void @load_private_lo_v2f16_reglo_vreg(half* %in, i32 %reg) #0 {
; VI: buffer_load_ushort v{{[0-9]+}}, off, s[0:3], s5 offset:4094{{$}}
define void @load_private_lo_v2f16_reglo_vreg(half* byval %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x half>
  %gep = getelementptr inbounds half, half* %in, i64 2047
  %gep = getelementptr inbounds half, half* %in, i64 2045
  %load = load half, half* %gep
  %build1 = insertelement <2 x half> %reg.bc, half %load, i32 0
  store <2 x half> %build1, <2 x half> addrspace(1)* undef
@@ -454,17 +454,17 @@ entry:

; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_zexti8:
; GCN: s_waitcnt
; GFX9-NEXT: buffer_load_ubyte_d16 v1, v0, s[0:3], s4 offen offset:2047{{$}}
; GFX9: buffer_load_ubyte_d16 v0, off, s[0:3], s5 offset:4095{{$}}
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: s_setpc_b64

; VI: buffer_load_ubyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}}
define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8* %in, i32 %reg) #0 {
; VI: buffer_load_ubyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
define void @load_private_lo_v2i16_reglo_vreg_zexti8(i8* byval %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i8, i8* %in, i64 2047
  %gep = getelementptr inbounds i8, i8* %in, i64 4091
  %load = load i8, i8* %gep
  %ext = zext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
@@ -474,17 +474,17 @@ entry:

; GCN-LABEL: {{^}}load_private_lo_v2i16_reglo_vreg_sexti8:
; GCN: s_waitcnt
; GFX9-NEXT: buffer_load_sbyte_d16 v1, v0, s[0:3], s4 offen offset:2047{{$}}
; GFX9: buffer_load_sbyte_d16 v0, off, s[0:3], s5 offset:4095{{$}}
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v1
; GFX9-NEXT: global_store_dword v{{\[[0-9]+:[0-9]+\]}}, v0
; GFX9-NEXT: s_waitcnt
; GFX9-NEXT: s_setpc_b64

; VI: buffer_load_sbyte v{{[0-9]+}}, v0, s[0:3], s4 offen offset:2047{{$}}
define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8* %in, i32 %reg) #0 {
; VI: buffer_load_sbyte v{{[0-9]+}}, off, s[0:3], s5 offset:4095{{$}}
define void @load_private_lo_v2i16_reglo_vreg_sexti8(i8* byval %in, i32 %reg) #0 {
entry:
  %reg.bc = bitcast i32 %reg to <2 x i16>
  %gep = getelementptr inbounds i8, i8* %in, i64 2047
  %gep = getelementptr inbounds i8, i8* %in, i64 4091
  %load = load i8, i8* %gep
  %ext = sext i8 %load to i16
  %build1 = insertelement <2 x i16> %reg.bc, i16 %ext, i32 0
@@ -1,5 +1,5 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
; RUN: llc -amdgpu-scalarize-global-loads=false -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; When a frame index offset is more than 12-bits, make sure we don't store
; it in mubuf's offset field.
@@ -86,8 +86,21 @@ done:
  ret void
}

; GCN-LABEL: {{^}}neg_vaddr_offset_inbounds:
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
define amdgpu_kernel void @neg_vaddr_offset_inbounds(i32 %offset) {
entry:
  %array = alloca [8192 x i32]
  %ptr_offset = add i32 %offset, 4
  %ptr = getelementptr inbounds [8192 x i32], [8192 x i32]* %array, i32 0, i32 %ptr_offset
  store i32 0, i32* %ptr
  ret void
}

; GCN-LABEL: {{^}}neg_vaddr_offset:
; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}}
; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 16, v{{[0-9]+}}
; GCN: buffer_store_dword v{{[0-9]+}}, [[ADD]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen{{$}}
define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) {
entry:
  %array = alloca [8192 x i32]
@@ -440,18 +440,18 @@ entry:

; GCN-LABEL: {{^}}store_private_hi_v2i16_max_offset:
; GCN: s_waitcnt
; GFX9-NEXT: buffer_store_short_d16_hi v1, v0, s[0:3], s4 offen offset:4094{{$}}
; GFX9: buffer_store_short_d16_hi v0, off, s[0:3], s5 offset:4094{{$}}

; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; VI-NEXT: buffer_store_short v1, v0, s[0:3], s4 offen offset:4094{{$}}
; VI: v_lshrrev_b32_e32 v0, 16, v0
; VI-NEXT: buffer_store_short v0, off, s[0:3], s5 offset:4094{{$}}

; GCN-NEXT: s_waitcnt
; GCN-NEXT: s_setpc_b64
define void @store_private_hi_v2i16_max_offset(i16* %out, i32 %arg) #0 {
define void @store_private_hi_v2i16_max_offset(i16* byval %out, i32 %arg) #0 {
entry:
  %value = bitcast i32 %arg to <2 x i16>
  %hi = extractelement <2 x i16> %value, i32 1
  %gep = getelementptr inbounds i16, i16* %out, i64 2047
  %gep = getelementptr inbounds i16, i16* %out, i64 2045
  store i16 %hi, i16* %gep
  ret void
}