forked from OSchip/llvm-project
[InstCombine][AMDGPU] Trim components of s_buffer_load
Summary: Add trimming of unused components of s_buffer_load. For s_buffer_load and unformatted buffer_load, also trim unused components at the beginning of the vector and update the offset accordingly. Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D71785
This commit is contained in:
parent
021f531786
commit
dd7148822b
|
@ -1012,13 +1012,64 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
|
|||
if (VWidth == 1)
|
||||
return nullptr;
|
||||
|
||||
ConstantInt *NewDMask = nullptr;
|
||||
IRBuilderBase::InsertPointGuard Guard(Builder);
|
||||
Builder.SetInsertPoint(II);
|
||||
|
||||
// Assume the arguments are unchanged and later override them, if needed.
|
||||
SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());
|
||||
|
||||
if (DMaskIdx < 0) {
|
||||
// Pretend that a prefix of elements is demanded to simplify the code
|
||||
// below.
|
||||
DemandedElts = (1 << DemandedElts.getActiveBits()) - 1;
|
||||
// Buffer case.
|
||||
|
||||
const unsigned ActiveBits = DemandedElts.getActiveBits();
|
||||
const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
|
||||
|
||||
// Start assuming the prefix of elements is demanded, but possibly clear
|
||||
// some other bits if there are trailing zeros (unused components at front)
|
||||
// and update offset.
|
||||
DemandedElts = (1 << ActiveBits) - 1;
|
||||
|
||||
if (UnusedComponentsAtFront > 0) {
|
||||
static const unsigned InvalidOffsetIdx = 0xf;
|
||||
|
||||
unsigned OffsetIdx;
|
||||
switch (II->getIntrinsicID()) {
|
||||
case Intrinsic::amdgcn_raw_buffer_load:
|
||||
OffsetIdx = 1;
|
||||
break;
|
||||
case Intrinsic::amdgcn_s_buffer_load:
|
||||
// If resulting type is vec3, there is no point in trimming the
|
||||
// load with updated offset, as the vec3 would most likely be widened to
|
||||
// vec4 anyway during lowering.
|
||||
if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
|
||||
OffsetIdx = InvalidOffsetIdx;
|
||||
else
|
||||
OffsetIdx = 1;
|
||||
break;
|
||||
case Intrinsic::amdgcn_struct_buffer_load:
|
||||
OffsetIdx = 2;
|
||||
break;
|
||||
default:
|
||||
// TODO: handle tbuffer* intrinsics.
|
||||
OffsetIdx = InvalidOffsetIdx;
|
||||
break;
|
||||
}
|
||||
|
||||
if (OffsetIdx != InvalidOffsetIdx) {
|
||||
// Clear demanded bits and update the offset.
|
||||
DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
|
||||
auto *Offset = II->getArgOperand(OffsetIdx);
|
||||
unsigned SingleComponentSizeInBits =
|
||||
getDataLayout().getTypeSizeInBits(II->getType()->getScalarType());
|
||||
unsigned OffsetAdd =
|
||||
UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
|
||||
auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
|
||||
Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Image case.
|
||||
|
||||
ConstantInt *DMask = cast<ConstantInt>(II->getArgOperand(DMaskIdx));
|
||||
unsigned DMaskVal = DMask->getZExtValue() & 0xf;
|
||||
|
||||
|
@ -1037,7 +1088,7 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
|
|||
}
|
||||
|
||||
if (DMaskVal != NewDMaskVal)
|
||||
NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal);
|
||||
Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
|
||||
}
|
||||
|
||||
unsigned NewNumElts = DemandedElts.countPopulation();
|
||||
|
@ -1045,8 +1096,8 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
|
|||
return UndefValue::get(II->getType());
|
||||
|
||||
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
|
||||
if (NewDMask)
|
||||
II->setArgOperand(DMaskIdx, NewDMask);
|
||||
if (DMaskIdx >= 0)
|
||||
II->setArgOperand(DMaskIdx, Args[DMaskIdx]);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
|
@ -1069,16 +1120,6 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
|
|||
OverloadTys[0] = NewTy;
|
||||
Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys);
|
||||
|
||||
SmallVector<Value *, 16> Args;
|
||||
for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
|
||||
Args.push_back(II->getArgOperand(I));
|
||||
|
||||
if (NewDMask)
|
||||
Args[DMaskIdx] = NewDMask;
|
||||
|
||||
IRBuilderBase::InsertPointGuard Guard(Builder);
|
||||
Builder.SetInsertPoint(II);
|
||||
|
||||
CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
|
||||
NewCall->takeName(II);
|
||||
NewCall->copyMetadata(*II);
|
||||
|
@ -1747,6 +1788,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
|
|||
case Intrinsic::amdgcn_raw_buffer_load:
|
||||
case Intrinsic::amdgcn_raw_buffer_load_format:
|
||||
case Intrinsic::amdgcn_raw_tbuffer_load:
|
||||
case Intrinsic::amdgcn_s_buffer_load:
|
||||
case Intrinsic::amdgcn_struct_buffer_load:
|
||||
case Intrinsic::amdgcn_struct_buffer_load_format:
|
||||
case Intrinsic::amdgcn_struct_tbuffer_load:
|
||||
|
|
|
@ -427,9 +427,9 @@ define amdgpu_ps float @extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32(
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
|
||||
; CHECK-NEXT: ret float %elt1
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <2 x float> %data, i32 1
|
||||
|
@ -446,9 +446,9 @@ define amdgpu_ps float @extract_elt0_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
|
||||
; CHECK-NEXT: ret float %elt1
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <4 x float> %data, i32 1
|
||||
|
@ -456,9 +456,9 @@ define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
|
||||
; CHECK-NEXT: ret float %elt1
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 8
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <4 x float> %data, i32 2
|
||||
|
@ -466,9 +466,9 @@ define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
|
||||
; CHECK-NEXT: ret float %elt1
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 12
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <4 x float> %data, i32 3
|
||||
|
@ -485,9 +485,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(<4 x i32>
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
; CHECK-NEXT: ret <2 x float> %shuf
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x float> %data
|
||||
define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
|
@ -495,9 +495,9 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32>
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
|
||||
; CHECK-NEXT: ret <2 x float> %shuf
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 8
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x float> %data
|
||||
define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
|
||||
|
@ -514,9 +514,9 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(<4 x
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
|
||||
; CHECK-NEXT: ret <3 x float> %shuf
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret <3 x float> %data
|
||||
define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
|
||||
|
@ -543,9 +543,9 @@ define amdgpu_ps float @extract_elt0_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32(
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
|
||||
; CHECK-NEXT: ret float %elt1
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <3 x float> %data, i32 1
|
||||
|
@ -553,9 +553,9 @@ define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32(
|
||||
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
|
||||
; CHECK-NEXT: ret float %elt1
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 8
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <3 x float> %data, i32 2
|
||||
|
@ -572,9 +572,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(<4 x i32>
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32(
|
||||
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
; CHECK-NEXT: ret <2 x float> %shuf
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x float> %data
|
||||
define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
|
@ -620,6 +620,490 @@ declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32)
|
|||
|
||||
declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #1
|
||||
|
||||
; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16(
|
||||
; CHECK: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt0_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt0 = extractelement <2 x half> %data, i32 0
|
||||
ret half %elt0
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 2
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt1_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <2 x half> %data, i32 1
|
||||
ret half %elt1
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 2
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
; Uses only element 1 of a <3 x half> raw buffer load; the CHECK lines for this
; function expect a scalar half load at %ofs + 2.
define amdgpu_ps half @extract_elt1_raw_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <3 x half> %data, i32 1
|
||||
ret half %elt1
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 2
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <4 x half> %data, i32 1
|
||||
ret half %elt1
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 6
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
; Uses only element 3 of a <4 x half> raw buffer load; the CHECK lines for this
; function expect a scalar half load at %ofs + 6.
define amdgpu_ps half @extract_elt3_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt3 = extractelement <4 x half> %data, i32 3
|
||||
ret half %elt3
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16(
|
||||
; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x half>
|
||||
define amdgpu_ps <2 x half> @extract_elt0_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%shuf = shufflevector <4 x half> %data, <4 x half> undef, <2 x i32> <i32 0, i32 1>
|
||||
ret <2 x half> %shuf
|
||||
}
|
||||
|
||||
declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) #1
|
||||
declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) #1
|
||||
declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32>, i32, i32, i32) #1
|
||||
declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #1
|
||||
|
||||
; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8(
|
||||
; CHECK: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt0_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt0 = extractelement <2 x i8> %data, i32 0
|
||||
ret i8 %elt0
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 1
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <2 x i8> %data, i32 1
|
||||
ret i8 %elt1
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 1
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
; Uses only element 1 of a <3 x i8> raw buffer load; the CHECK lines for this
; function expect a scalar i8 load at %ofs + 1.
define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <3 x i8> %data, i32 1
|
||||
ret i8 %elt1
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 1
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <4 x i8> %data, i32 1
|
||||
ret i8 %elt1
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 3
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
; Uses only element 3 of a <4 x i8> raw buffer load; the CHECK lines for this
; function expect a scalar i8 load at %ofs + 3.
define amdgpu_ps i8 @extract_elt3_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt3 = extractelement <4 x i8> %data, i32 3
|
||||
ret i8 %elt3
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8(
|
||||
; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x i8>
|
||||
define amdgpu_ps <2 x i8> @extract_elt0_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
|
||||
%shuf = shufflevector <4 x i8> %data, <4 x i8> undef, <2 x i32> <i32 0, i32 1>
|
||||
ret <2 x i8> %shuf
|
||||
}
|
||||
|
||||
declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #1
|
||||
declare <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32>, i32, i32, i32) #1
|
||||
declare <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32>, i32, i32, i32) #1
|
||||
declare <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32>, i32, i32, i32) #1
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; llvm.amdgcn.s.buffer.load
|
||||
; --------------------------------------------------------------------
|
||||
|
||||
; CHECK-LABEL: @s_buffer_load_f32(
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
ret float %data
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @s_buffer_load_v2f32(
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x float> %data
|
||||
define amdgpu_ps <2 x float> @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
ret <2 x float> %data
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @s_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret <4 x float> %data
|
||||
define amdgpu_ps <4 x float> @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
ret <4 x float> %data
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32(
|
||||
; CHECK: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt0 = extractelement <2 x float> %data, i32 0
|
||||
ret float %elt0
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt1_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt1 = extractelement <2 x float> %data, i32 1
|
||||
ret float %elt1
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt0_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt0 = extractelement <4 x float> %data, i32 0
|
||||
ret float %elt0
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt1 = extractelement <4 x float> %data, i32 1
|
||||
ret float %elt1
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 8
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
; Uses only element 2 of a <4 x float> s_buffer_load; the CHECK lines for this
; function expect a scalar float load at %ofs + 8.
define amdgpu_ps float @extract_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt2 = extractelement <4 x float> %data, i32 2
|
||||
ret float %elt2
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 12
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
; Uses only element 3 of a <4 x float> s_buffer_load; the CHECK lines for this
; function expect a scalar float load at %ofs + 12.
define amdgpu_ps float @extract_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt3 = extractelement <4 x float> %data, i32 3
|
||||
ret float %elt3
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x float>
|
||||
define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
|
||||
ret <2 x float> %shuf
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret <2 x float> %data
|
||||
define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
ret <2 x float> %shuf
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 8
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret <2 x float> %data
|
||||
define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
|
||||
ret <2 x float> %shuf
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret <3 x float> %data
|
||||
define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
|
||||
ret <3 x float> %shuf
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
|
||||
; CHECK-NEXT: ret <3 x float> %shuf
|
||||
define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
|
||||
ret <3 x float> %shuf
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32(
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt0_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt0 = extractelement <3 x float> %data, i32 0
|
||||
ret float %elt0
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt1 = extractelement <3 x float> %data, i32 1
|
||||
ret float %elt1
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 8
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
; Uses only element 2 of a <3 x float> s_buffer_load; the CHECK lines for this
; function expect a scalar float load at %ofs + 8.
define amdgpu_ps float @extract_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt2 = extractelement <3 x float> %data, i32 2
|
||||
ret float %elt2
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32(
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x float>
|
||||
define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
|
||||
ret <2 x float> %shuf
|
||||
}
|
||||
|
||||
; Elements 1 and 2 of the <3 x float> load are used: the expected output is a
; <2 x float> load at %ofs + 4 whose result is returned without a shuffle.
; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret <2 x float> %data
|
||||
define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
ret <2 x float> %shuf
|
||||
}
|
||||
|
||||
; Do not trim to vec3 s_buffer_load in instcombine, as the load will most likely be widened
|
||||
; to vec4 anyway during lowering.
; Elements 1, 2 and 3 of the <4 x float> load are used; the expected output is
; identical to the input: the full <4 x float> load at the original offset and
; the original shufflevector.
|
||||
; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
|
||||
; CHECK-NEXT: ret <3 x float> %shuf
|
||||
define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
|
||||
ret <3 x float> %shuf
|
||||
}
|
||||
|
||||
; Element 0 is read through a <4 x float> -> <4 x i32> bitcast of the load: the
; expected output is a scalar float load at the unchanged offset followed by a
; scalar float -> i32 bitcast.
; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
|
||||
; CHECK-NEXT: ret i32 %tmp2
|
||||
define i32 @extract0_bitcast_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%tmp = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%tmp1 = bitcast <4 x float> %tmp to <4 x i32>
|
||||
%tmp2 = extractelement <4 x i32> %tmp1, i32 0
|
||||
ret i32 %tmp2
|
||||
}
|
||||
|
||||
; Element 0 is read through a <4 x i32> -> <4 x float> bitcast of the load: the
; expected output is a scalar i32 load at the unchanged offset followed by a
; scalar i32 -> float bitcast.
; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32(
|
||||
; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float
|
||||
; CHECK-NEXT: ret float %tmp2
|
||||
define float @extract0_bitcast_s_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%tmp = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%tmp1 = bitcast <4 x i32> %tmp to <4 x float>
|
||||
%tmp2 = extractelement <4 x float> %tmp1, i32 0
|
||||
ret float %tmp2
|
||||
}
|
||||
|
||||
; Only element 0 of the <2 x float> load is used: the expected output is a
; scalar float load at the unchanged offset that still carries the !fpmath
; metadata attached to the original call.
; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32(
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @preserve_metadata_extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0
|
||||
%elt0 = extractelement <2 x float> %data, i32 0
|
||||
ret float %elt0
|
||||
}
|
||||
|
||||
; Declarations of the llvm.amdgcn.s.buffer.load overloads returning float,
; <2 x float>, <3 x float>, <4 x float> and <4 x i32>. Each takes
; (<4 x i32>, i32, i32).
declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1
|
||||
declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32) #1
|
||||
declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32) #1
|
||||
declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) #1
|
||||
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) #1
|
||||
|
||||
; Only element 0 of the <2 x half> load is used: the expected output is a
; scalar half load at the unchanged offset. The call is matched on the line
; directly after the label, so an unexpected extra instruction (for example a
; stray offset add) makes the test fail instead of being skipped over.
; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16(
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt0 = extractelement <2 x half> %data, i32 0
|
||||
ret half %elt0
|
||||
}
|
||||
|
||||
; Only element 1 of the <2 x half> load is used: the expected output is a
; scalar half load whose offset is %ofs + 2 (one unused leading half skipped).
; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 2
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt1 = extractelement <2 x half> %data, i32 1
|
||||
ret half %elt1
|
||||
}
|
||||
|
||||
; Only element 1 of the <3 x half> load is used: the expected output is a
; scalar half load whose offset is %ofs + 2 (one unused leading half skipped).
; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 2
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt1 = extractelement <3 x half> %data, i32 1
|
||||
ret half %elt1
|
||||
}
|
||||
|
||||
; Only element 1 of the <4 x half> load is used: the expected output is a
; scalar half load whose offset is %ofs + 2 (one unused leading half skipped).
; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 2
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt1 = extractelement <4 x half> %data, i32 1
|
||||
ret half %elt1
|
||||
}
|
||||
|
||||
|
||||
; Only element 3 of the <4 x half> load is used: the expected output is a
; scalar half load whose offset is %ofs + 6 (three unused leading halves
; skipped). The extracted value is named after the index it reads (%elt3); the
; name does not appear in the expected output because the extract is folded away.
; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 6
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt3 = extractelement <4 x half> %data, i32 3
|
||||
|
||||
ret half %elt3
|
||||
}
|
||||
|
||||
; Elements 0 and 1 of the <4 x half> load are used: the expected output is a
; <2 x half> load at the unchanged offset, immediately followed by the return.
; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16(
|
||||
; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x half>
|
||||
define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%shuf = shufflevector <4 x half> %data, <4 x half> undef, <2 x i32> <i32 0, i32 1>
|
||||
ret <2 x half> %shuf
|
||||
}
|
||||
|
||||
; Declarations of the llvm.amdgcn.s.buffer.load overloads returning half,
; <2 x half>, <3 x half> and <4 x half>. Each takes (<4 x i32>, i32, i32).
declare half @llvm.amdgcn.s.buffer.load.f16(<4 x i32>, i32, i32) #1
|
||||
declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32>, i32, i32) #1
|
||||
declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32>, i32, i32) #1
|
||||
declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32>, i32, i32) #1
|
||||
|
||||
; Only element 0 of the <2 x i8> load is used: the expected output is a scalar
; i8 load at the unchanged offset. The call is matched on the line directly
; after the label, so an unexpected extra instruction (for example a stray
; offset add) makes the test fail instead of being skipped over.
; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8(
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt0 = extractelement <2 x i8> %data, i32 0
|
||||
ret i8 %elt0
|
||||
}
|
||||
|
||||
; Only element 1 of the <2 x i8> load is used: the expected output is a scalar
; i8 load whose offset is %ofs + 1 (one unused leading i8 skipped).
; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 1
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt1 = extractelement <2 x i8> %data, i32 1
|
||||
ret i8 %elt1
|
||||
}
|
||||
|
||||
; Only element 1 of the <3 x i8> load is used: the expected output is a scalar
; i8 load whose offset is %ofs + 1 (one unused leading i8 skipped).
; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 1
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt1 = extractelement <3 x i8> %data, i32 1
|
||||
ret i8 %elt1
|
||||
}
|
||||
|
||||
; Only element 1 of the <4 x i8> load is used: the expected output is a scalar
; i8 load whose offset is %ofs + 1 (one unused leading i8 skipped).
; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 1
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt1 = extractelement <4 x i8> %data, i32 1
|
||||
ret i8 %elt1
|
||||
}
|
||||
|
||||
; Only element 3 of the <4 x i8> load is used: the expected output is a scalar
; i8 load whose offset is %ofs + 3 (three unused leading i8 skipped). The
; extracted value is named after the index it reads (%elt3); the name does not
; appear in the expected output because the extract is folded away.
; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 3
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%elt3 = extractelement <4 x i8> %data, i32 3
|
||||
|
||||
ret i8 %elt3
|
||||
}
|
||||
|
||||
; Elements 0 and 1 of the <4 x i8> load are used: the expected output is a
; <2 x i8> load at the unchanged offset, immediately followed by the return.
; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8(
|
||||
; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x i8>
|
||||
define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
|
||||
%data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
|
||||
%shuf = shufflevector <4 x i8> %data, <4 x i8> undef, <2 x i32> <i32 0, i32 1>
|
||||
ret <2 x i8> %shuf
|
||||
}
|
||||
|
||||
; Declarations of the llvm.amdgcn.s.buffer.load overloads returning i8,
; <2 x i8>, <3 x i8> and <4 x i8>. Each takes (<4 x i32>, i32, i32).
declare i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32>, i32, i32) #1
|
||||
declare <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32>, i32, i32) #1
|
||||
declare <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32>, i32, i32) #1
|
||||
declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32>, i32, i32) #1
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; llvm.amdgcn.raw.buffer.load.format
|
||||
; --------------------------------------------------------------------
|
||||
|
@ -901,9 +1385,9 @@ define amdgpu_ps float @extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %r
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32(
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
|
||||
; CHECK-NEXT: ret float %elt1
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <2 x float> %data, i32 1
|
||||
|
@ -920,9 +1404,9 @@ define amdgpu_ps float @extract_elt0_struct_buffer_load_v4f32(<4 x i32> inreg %r
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
|
||||
; CHECK-NEXT: ret float %elt1
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <4 x float> %data, i32 1
|
||||
|
@ -930,9 +1414,9 @@ define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %r
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
|
||||
; CHECK-NEXT: ret float %elt1
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 8
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <4 x float> %data, i32 2
|
||||
|
@ -940,9 +1424,9 @@ define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %r
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
|
||||
; CHECK-NEXT: ret float %elt1
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 12
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <4 x float> %data, i32 3
|
||||
|
@ -959,9 +1443,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(<4 x i3
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
; CHECK-NEXT: ret <2 x float> %shuf
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x float> %data
|
||||
define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
|
@ -969,9 +1453,9 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i3
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
|
||||
; CHECK-NEXT: ret <2 x float> %shuf
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 8
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x float> %data
|
||||
define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
|
||||
|
@ -988,9 +1472,9 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(<4
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(
|
||||
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
|
||||
; CHECK-NEXT: ret <3 x float> %shuf
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret <3 x float> %data
|
||||
define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
|
||||
|
@ -1017,9 +1501,9 @@ define amdgpu_ps float @extract_elt0_struct_buffer_load_v3f32(<4 x i32> inreg %r
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32(
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
|
||||
; CHECK-NEXT: ret float %elt1
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <3 x float> %data, i32 1
|
||||
|
@ -1027,9 +1511,9 @@ define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %r
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32(
|
||||
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
|
||||
; CHECK-NEXT: ret float %elt1
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 8
|
||||
; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret float %data
|
||||
define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <3 x float> %data, i32 2
|
||||
|
@ -1046,9 +1530,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(<4 x i3
|
|||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32(
|
||||
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
; CHECK-NEXT: ret <2 x float> %shuf
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 4
|
||||
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x float> %data
|
||||
define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
|
||||
|
@ -1094,6 +1578,132 @@ declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i
|
|||
|
||||
declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1
|
||||
|
||||
; Only element 0 of the <2 x half> load is used: the expected output is a
; scalar half load with %idx, %ofs and %sofs unchanged. The call is matched on
; the line directly after the label, so an unexpected extra instruction (for
; example a stray offset add) makes the test fail instead of being skipped over.
; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16(
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt0_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt0 = extractelement <2 x half> %data, i32 0
|
||||
ret half %elt0
|
||||
}
|
||||
|
||||
; Only element 1 of the <2 x half> load is used: the expected output is a
; scalar half load whose %ofs operand is advanced by 2; %idx and %sofs are
; passed through unchanged.
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 2
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt1_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <2 x half> %data, i32 1
|
||||
ret half %elt1
|
||||
}
|
||||
|
||||
; Only element 1 of the <3 x half> load is used: the expected output is a
; scalar half load whose %ofs operand is advanced by 2; %idx and %sofs are
; passed through unchanged.
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 2
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt1_struct_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <3 x half> %data, i32 1
|
||||
ret half %elt1
|
||||
}
|
||||
|
||||
; Only element 1 of the <4 x half> load is used: the expected output is a
; scalar half load whose %ofs operand is advanced by 2; %idx and %sofs are
; passed through unchanged.
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 2
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <4 x half> %data, i32 1
|
||||
ret half %elt1
|
||||
}
|
||||
|
||||
; Only element 3 of the <4 x half> load is used: the expected output is a
; scalar half load whose %ofs operand is advanced by 6; %idx and %sofs are
; passed through unchanged. The extracted value is named after the index it
; reads (%elt3); the name does not appear in the expected output because the
; extract is folded away.
; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 6
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt3_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt3 = extractelement <4 x half> %data, i32 3
|
||||
|
||||
ret half %elt3
|
||||
}
|
||||
|
||||
; Elements 0 and 1 of the <4 x half> load are used: the expected output is a
; <2 x half> load with %idx, %ofs and %sofs unchanged, immediately followed by
; the return.
; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16(
|
||||
; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x half>
|
||||
define amdgpu_ps <2 x half> @extract_elt0_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%shuf = shufflevector <4 x half> %data, <4 x half> undef, <2 x i32> <i32 0, i32 1>
|
||||
ret <2 x half> %shuf
|
||||
}
|
||||
|
||||
; Declarations of the llvm.amdgcn.struct.buffer.load overloads returning half,
; <2 x half>, <3 x half> and <4 x half>. Each takes
; (<4 x i32>, i32, i32, i32, i32).
declare half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32>, i32, i32, i32, i32) #1
|
||||
declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #1
|
||||
declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1
|
||||
declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1
|
||||
|
||||
; Only element 0 of the <2 x i8> load is used: the expected output is a scalar
; i8 load with %idx, %ofs and %sofs unchanged. The call is matched on the line
; directly after the label, so an unexpected extra instruction (for example a
; stray offset add) makes the test fail instead of being skipped over.
; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8(
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt0_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt0 = extractelement <2 x i8> %data, i32 0
|
||||
ret i8 %elt0
|
||||
}
|
||||
|
||||
; Only element 1 of the <2 x i8> load is used: the expected output is a scalar
; i8 load whose %ofs operand is advanced by 1; %idx and %sofs are passed
; through unchanged.
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 1
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <2 x i8> %data, i32 1
|
||||
ret i8 %elt1
|
||||
}
|
||||
|
||||
; Only element 1 of the <3 x i8> load is used: the expected output is a scalar
; i8 load whose %ofs operand is advanced by 1; %idx and %sofs are passed
; through unchanged.
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 1
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <3 x i8> %data, i32 1
|
||||
ret i8 %elt1
|
||||
}
|
||||
|
||||
; Only element 1 of the <4 x i8> load is used: the expected output is a scalar
; i8 load whose %ofs operand is advanced by 1; %idx and %sofs are passed
; through unchanged.
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 1
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt1 = extractelement <4 x i8> %data, i32 1
|
||||
ret i8 %elt1
|
||||
}
|
||||
|
||||
; Only element 3 of the <4 x i8> load is used: the expected output is a scalar
; i8 load whose %ofs operand is advanced by 3; %idx and %sofs are passed
; through unchanged. The extracted value is named after the index it reads
; (%elt3); the name does not appear in the expected output because the extract
; is folded away.
; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8(
|
||||
; CHECK-NEXT: %1 = add i32 %ofs, 3
|
||||
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret i8 %data
|
||||
define amdgpu_ps i8 @extract_elt3_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%elt3 = extractelement <4 x i8> %data, i32 3
|
||||
|
||||
ret i8 %elt3
|
||||
}
|
||||
|
||||
; Elements 0 and 1 of the <4 x i8> load are used: the expected output is a
; <2 x i8> load with %idx, %ofs and %sofs unchanged, immediately followed by
; the return.
; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8(
|
||||
; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
; CHECK-NEXT: ret <2 x i8>
|
||||
define amdgpu_ps <2 x i8> @extract_elt0_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
|
||||
%data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
|
||||
%shuf = shufflevector <4 x i8> %data, <4 x i8> undef, <2 x i32> <i32 0, i32 1>
|
||||
ret <2 x i8> %shuf
|
||||
}
|
||||
|
||||
; Declarations of the llvm.amdgcn.struct.buffer.load overloads returning i8,
; <2 x i8>, <3 x i8> and <4 x i8>. Each takes (<4 x i32>, i32, i32, i32, i32).
declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #1
|
||||
declare <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32>, i32, i32, i32, i32) #1
|
||||
declare <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32>, i32, i32, i32, i32) #1
|
||||
declare <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32>, i32, i32, i32, i32) #1
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; llvm.amdgcn.struct.buffer.load.format
|
||||
; --------------------------------------------------------------------
|
||||
|
|
Loading…
Reference in New Issue