[InstCombine][AMDGPU] Trim components of s_buffer_load

Summary:
Add trimming of unused components of s_buffer_load.

For s_buffer_load and unformatted buffer_load also trim unused
components at the beginning of vector and update offset accordingly.

Subscribers: kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71785
This commit is contained in:
Piotr Sobczak 2019-12-20 21:06:19 +01:00
parent 021f531786
commit dd7148822b
2 changed files with 729 additions and 77 deletions

View File

@ -1012,13 +1012,64 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
if (VWidth == 1)
return nullptr;
ConstantInt *NewDMask = nullptr;
IRBuilderBase::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(II);
// Assume the arguments are unchanged and later override them, if needed.
SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());
if (DMaskIdx < 0) {
// Pretend that a prefix of elements is demanded to simplify the code
// below.
DemandedElts = (1 << DemandedElts.getActiveBits()) - 1;
// Buffer case.
const unsigned ActiveBits = DemandedElts.getActiveBits();
const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
// Start assuming the prefix of elements is demanded, but possibly clear
// some other bits if there are trailing zeros (unused components at front)
// and update offset.
DemandedElts = (1 << ActiveBits) - 1;
if (UnusedComponentsAtFront > 0) {
static const unsigned InvalidOffsetIdx = 0xf;
unsigned OffsetIdx;
switch (II->getIntrinsicID()) {
case Intrinsic::amdgcn_raw_buffer_load:
OffsetIdx = 1;
break;
case Intrinsic::amdgcn_s_buffer_load:
// If resulting type is vec3, there is no point in trimming the
// load with updated offset, as the vec3 would most likely be widened to
// vec4 anyway during lowering.
if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
OffsetIdx = InvalidOffsetIdx;
else
OffsetIdx = 1;
break;
case Intrinsic::amdgcn_struct_buffer_load:
OffsetIdx = 2;
break;
default:
// TODO: handle tbuffer* intrinsics.
OffsetIdx = InvalidOffsetIdx;
break;
}
if (OffsetIdx != InvalidOffsetIdx) {
// Clear demanded bits and update the offset.
DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
auto *Offset = II->getArgOperand(OffsetIdx);
unsigned SingleComponentSizeInBits =
getDataLayout().getTypeSizeInBits(II->getType()->getScalarType());
unsigned OffsetAdd =
UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal);
}
}
} else {
// Image case.
ConstantInt *DMask = cast<ConstantInt>(II->getArgOperand(DMaskIdx));
unsigned DMaskVal = DMask->getZExtValue() & 0xf;
@ -1037,7 +1088,7 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
}
if (DMaskVal != NewDMaskVal)
NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal);
Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
}
unsigned NewNumElts = DemandedElts.countPopulation();
@ -1045,8 +1096,8 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
return UndefValue::get(II->getType());
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
if (NewDMask)
II->setArgOperand(DMaskIdx, NewDMask);
if (DMaskIdx >= 0)
II->setArgOperand(DMaskIdx, Args[DMaskIdx]);
return nullptr;
}
@ -1069,16 +1120,6 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
OverloadTys[0] = NewTy;
Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys);
SmallVector<Value *, 16> Args;
for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
Args.push_back(II->getArgOperand(I));
if (NewDMask)
Args[DMaskIdx] = NewDMask;
IRBuilderBase::InsertPointGuard Guard(Builder);
Builder.SetInsertPoint(II);
CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
NewCall->takeName(II);
NewCall->copyMetadata(*II);
@ -1747,6 +1788,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
case Intrinsic::amdgcn_raw_tbuffer_load:
case Intrinsic::amdgcn_s_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format:
case Intrinsic::amdgcn_struct_tbuffer_load:

View File

@ -427,9 +427,9 @@ define amdgpu_ps float @extract_elt0_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc
}
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f32(
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
; CHECK-NEXT: ret float %elt1
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt1_raw_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <2 x float> %data, i32 1
@ -446,9 +446,9 @@ define amdgpu_ps float @extract_elt0_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc
}
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f32(
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
; CHECK-NEXT: ret float %elt1
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x float> %data, i32 1
@ -456,9 +456,9 @@ define amdgpu_ps float @extract_elt1_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc
}
; CHECK-LABEL: @extract_elt2_raw_buffer_load_v4f32(
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
; CHECK-NEXT: ret float %elt1
; CHECK-NEXT: %1 = add i32 %ofs, 8
; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x float> %data, i32 2
@ -466,9 +466,9 @@ define amdgpu_ps float @extract_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc
}
; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f32(
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
; CHECK-NEXT: ret float %elt1
; CHECK-NEXT: %1 = add i32 %ofs, 12
; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x float> %data, i32 3
@ -485,9 +485,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v4f32(<4 x i32>
}
; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v4f32(
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: ret <2 x float> %shuf
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret <2 x float> %data
define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
@ -495,9 +495,9 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v4f32(<4 x i32>
}
; CHECK-LABEL: @extract_elt2_elt3_raw_buffer_load_v4f32(
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: ret <2 x float> %shuf
; CHECK-NEXT: %1 = add i32 %ofs, 8
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret <2 x float> %data
define amdgpu_ps <2 x float> @extract_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
@ -514,9 +514,9 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_raw_buffer_load_v4f32(<4 x
}
; CHECK-LABEL: @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <3 x float> %shuf
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret <3 x float> %data
define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_raw_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
@ -543,9 +543,9 @@ define amdgpu_ps float @extract_elt0_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc
}
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f32(
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
; CHECK-NEXT: ret float %elt1
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <3 x float> %data, i32 1
@ -553,9 +553,9 @@ define amdgpu_ps float @extract_elt1_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc
}
; CHECK-LABEL: @extract_elt2_raw_buffer_load_v3f32(
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
; CHECK-NEXT: ret float %elt1
; CHECK-NEXT: %1 = add i32 %ofs, 8
; CHECK-NEXT: %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <3 x float> %data, i32 2
@ -572,9 +572,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_raw_buffer_load_v3f32(<4 x i32>
}
; CHECK-LABEL: @extract_elt1_elt2_raw_buffer_load_v3f32(
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: ret <2 x float> %shuf
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.raw.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret <2 x float> %data
define amdgpu_ps <2 x float> @extract_elt1_elt2_raw_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <3 x float> @llvm.amdgcn.raw.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
@ -620,6 +620,490 @@ declare <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32>, i32, i32, i32)
declare <4 x i32> @llvm.amdgcn.raw.buffer.load.v4i32(<4 x i32>, i32, i32, i32) #1
; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2f16(
; CHECK: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt0_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt0 = extractelement <2 x half> %data, i32 0
ret half %elt0
}
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2f16(
; CHECK-NEXT: %1 = add i32 %ofs, 2
; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt1_raw_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <2 x half> %data, i32 1
ret half %elt1
}
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3f16(
; CHECK-NEXT: %1 = add i32 %ofs, 2
; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt1_raw_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt0 = extractelement <3 x half> %data, i32 1
ret half %elt0
}
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4f16(
; CHECK-NEXT: %1 = add i32 %ofs, 2
; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x half> %data, i32 1
ret half %elt1
}
; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4f16(
; CHECK-NEXT: %1 = add i32 %ofs, 6
; CHECK-NEXT: %data = call half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt3_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x half> %data, i32 3
ret half %elt1
}
; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4f16(
; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: ret <2 x half>
define amdgpu_ps <2 x half> @extract_elt0_elt1_raw_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%shuf = shufflevector <4 x half> %data, <4 x half> undef, <2 x i32> <i32 0, i32 1>
ret <2 x half> %shuf
}
declare half @llvm.amdgcn.raw.buffer.load.f16(<4 x i32>, i32, i32, i32) #1
declare <2 x half> @llvm.amdgcn.raw.buffer.load.v2f16(<4 x i32>, i32, i32, i32) #1
declare <3 x half> @llvm.amdgcn.raw.buffer.load.v3f16(<4 x i32>, i32, i32, i32) #1
declare <4 x half> @llvm.amdgcn.raw.buffer.load.v4f16(<4 x i32>, i32, i32, i32) #1
; CHECK-LABEL: @extract_elt0_raw_buffer_load_v2i8(
; CHECK: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt0_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt0 = extractelement <2 x i8> %data, i32 0
ret i8 %elt0
}
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v2i8(
; CHECK-NEXT: %1 = add i32 %ofs, 1
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <2 x i8> %data, i32 1
ret i8 %elt1
}
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v3i8(
; CHECK-NEXT: %1 = add i32 %ofs, 1
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt0 = extractelement <3 x i8> %data, i32 1
ret i8 %elt0
}
; CHECK-LABEL: @extract_elt1_raw_buffer_load_v4i8(
; CHECK-NEXT: %1 = add i32 %ofs, 1
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x i8> %data, i32 1
ret i8 %elt1
}
; CHECK-LABEL: @extract_elt3_raw_buffer_load_v4i8(
; CHECK-NEXT: %1 = add i32 %ofs, 3
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt3_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x i8> %data, i32 3
ret i8 %elt1
}
; CHECK-LABEL: @extract_elt0_elt1_raw_buffer_load_v4i8(
; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: ret <2 x i8>
define amdgpu_ps <2 x i8> @extract_elt0_elt1_raw_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 %sofs, i32 0)
%shuf = shufflevector <4 x i8> %data, <4 x i8> undef, <2 x i32> <i32 0, i32 1>
ret <2 x i8> %shuf
}
declare i8 @llvm.amdgcn.raw.buffer.load.i8(<4 x i32>, i32, i32, i32) #1
declare <2 x i8> @llvm.amdgcn.raw.buffer.load.v2i8(<4 x i32>, i32, i32, i32) #1
declare <3 x i8> @llvm.amdgcn.raw.buffer.load.v3i8(<4 x i32>, i32, i32, i32) #1
declare <4 x i8> @llvm.amdgcn.raw.buffer.load.v4i8(<4 x i32>, i32, i32, i32) #1
; --------------------------------------------------------------------
; llvm.amdgcn.s.buffer.load
; --------------------------------------------------------------------
; CHECK-LABEL: @s_buffer_load_f32(
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @s_buffer_load_f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
ret float %data
}
; CHECK-LABEL: @s_buffer_load_v2f32(
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret <2 x float> %data
define amdgpu_ps <2 x float> @s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
ret <2 x float> %data
}
; CHECK-LABEL: @s_buffer_load_v4f32(
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret <4 x float> %data
define amdgpu_ps <4 x float> @s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
ret <4 x float> %data
}
; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f32(
; CHECK: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt0 = extractelement <2 x float> %data, i32 0
ret float %elt0
}
; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f32(
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt1_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <2 x float> %data, i32 1
ret float %elt1
}
; CHECK-LABEL: @extract_elt0_s_buffer_load_v4f32(
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt0_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt0 = extractelement <4 x float> %data, i32 0
ret float %elt0
}
; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f32(
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <4 x float> %data, i32 1
ret float %elt1
}
; CHECK-LABEL: @extract_elt2_s_buffer_load_v4f32(
; CHECK-NEXT: %1 = add i32 %ofs, 8
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <4 x float> %data, i32 2
ret float %elt1
}
; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f32(
; CHECK-NEXT: %1 = add i32 %ofs, 12
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <4 x float> %data, i32 3
ret float %elt1
}
; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f32(
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret <2 x float>
define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
ret <2 x float> %shuf
}
; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v4f32(
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret <2 x float> %data
define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
ret <2 x float> %shuf
}
; CHECK-LABEL: @extract_elt2_elt3_s_buffer_load_v4f32(
; CHECK-NEXT: %1 = add i32 %ofs, 8
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret <2 x float> %data
define amdgpu_ps <2 x float> @extract_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
ret <2 x float> %shuf
}
; CHECK-LABEL: @extract_elt0_elt1_elt2_s_buffer_load_v4f32(
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret <3 x float> %data
define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
ret <3 x float> %shuf
}
; CHECK-LABEL: @extract_elt0_elt2_elt3_s_buffer_load_v4f32(
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
; CHECK-NEXT: ret <3 x float> %shuf
define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
ret <3 x float> %shuf
}
; CHECK-LABEL: @extract_elt0_s_buffer_load_v3f32(
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt0_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt0 = extractelement <3 x float> %data, i32 0
ret float %elt0
}
; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f32(
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <3 x float> %data, i32 1
ret float %elt1
}
; CHECK-LABEL: @extract_elt2_s_buffer_load_v3f32(
; CHECK-NEXT: %1 = add i32 %ofs, 8
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <3 x float> %data, i32 2
ret float %elt1
}
; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v3f32(
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret <2 x float>
define amdgpu_ps <2 x float> @extract_elt0_elt1_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
ret <2 x float> %shuf
}
; CHECK-LABEL: @extract_elt1_elt2_s_buffer_load_v3f32(
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret <2 x float> %data
define amdgpu_ps <2 x float> @extract_elt1_elt2_s_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
ret <2 x float> %shuf
}
; Do not trim to vec3 s_buffer_load in instcombine, as the load will most likely be widened
; to vec4 anyway during lowering.
; CHECK-LABEL: @extract_elt1_elt2_elt3_s_buffer_load_v4f32(
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <3 x float> %shuf
define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
ret <3 x float> %shuf
}
; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4f32(
; CHECK-NEXT: %tmp = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
; CHECK-NEXT: ret i32 %tmp2
define i32 @extract0_bitcast_s_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%tmp = call <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%tmp1 = bitcast <4 x float> %tmp to <4 x i32>
%tmp2 = extractelement <4 x i32> %tmp1, i32 0
ret i32 %tmp2
}
; CHECK-LABEL: @extract0_bitcast_s_buffer_load_v4i32(
; CHECK-NEXT: %tmp = call i32 @llvm.amdgcn.s.buffer.load.i32(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: %tmp2 = bitcast i32 %tmp to float
; CHECK-NEXT: ret float %tmp2
define float @extract0_bitcast_s_buffer_load_v4i32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%tmp = call <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32> %rsrc, i32 %ofs, i32 0)
%tmp1 = bitcast <4 x i32> %tmp to <4 x float>
%tmp2 = extractelement <4 x float> %tmp1, i32 0
ret float %tmp2
}
; CHECK-LABEL: @preserve_metadata_extract_elt0_s_buffer_load_v2f32(
; CHECK-NEXT: %data = call float @llvm.amdgcn.s.buffer.load.f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0
; CHECK-NEXT: ret float %data
define amdgpu_ps float @preserve_metadata_extract_elt0_s_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32> %rsrc, i32 %ofs, i32 0), !fpmath !0
%elt0 = extractelement <2 x float> %data, i32 0
ret float %elt0
}
declare float @llvm.amdgcn.s.buffer.load.f32(<4 x i32>, i32, i32) #1
declare <2 x float> @llvm.amdgcn.s.buffer.load.v2f32(<4 x i32>, i32, i32) #1
declare <3 x float> @llvm.amdgcn.s.buffer.load.v3f32(<4 x i32>, i32, i32) #1
declare <4 x float> @llvm.amdgcn.s.buffer.load.v4f32(<4 x i32>, i32, i32) #1
declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) #1
; CHECK-LABEL: @extract_elt0_s_buffer_load_v2f16(
; CHECK: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt0_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt0 = extractelement <2 x half> %data, i32 0
ret half %elt0
}
; CHECK-LABEL: @extract_elt1_s_buffer_load_v2f16(
; CHECK-NEXT: %1 = add i32 %ofs, 2
; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt1_s_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <2 x half> %data, i32 1
ret half %elt1
}
; CHECK-LABEL: @extract_elt1_s_buffer_load_v3f16(
; CHECK-NEXT: %1 = add i32 %ofs, 2
; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt1_s_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <3 x half> %data, i32 1
ret half %elt1
}
; CHECK-LABEL: @extract_elt1_s_buffer_load_v4f16(
; CHECK-NEXT: %1 = add i32 %ofs, 2
; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <4 x half> %data, i32 1
ret half %elt1
}
; CHECK-LABEL: @extract_elt3_s_buffer_load_v4f16(
; CHECK-NEXT: %1 = add i32 %ofs, 6
; CHECK-NEXT: %data = call half @llvm.amdgcn.s.buffer.load.f16(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt3_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <4 x half> %data, i32 3
ret half %elt1
}
; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4f16(
; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret <2 x half>
define amdgpu_ps <2 x half> @extract_elt0_elt1_s_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32> %rsrc, i32 %ofs, i32 0)
%shuf = shufflevector <4 x half> %data, <4 x half> undef, <2 x i32> <i32 0, i32 1>
ret <2 x half> %shuf
}
declare half @llvm.amdgcn.s.buffer.load.f16(<4 x i32>, i32, i32) #1
declare <2 x half> @llvm.amdgcn.s.buffer.load.v2f16(<4 x i32>, i32, i32) #1
declare <3 x half> @llvm.amdgcn.s.buffer.load.v3f16(<4 x i32>, i32, i32) #1
declare <4 x half> @llvm.amdgcn.s.buffer.load.v4f16(<4 x i32>, i32, i32) #1
; CHECK-LABEL: @extract_elt0_s_buffer_load_v2i8(
; CHECK: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt0_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt0 = extractelement <2 x i8> %data, i32 0
ret i8 %elt0
}
; CHECK-LABEL: @extract_elt1_s_buffer_load_v2i8(
; CHECK-NEXT: %1 = add i32 %ofs, 1
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt1_s_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <2 x i8> %data, i32 1
ret i8 %elt1
}
; CHECK-LABEL: @extract_elt1_s_buffer_load_v3i8(
; CHECK-NEXT: %1 = add i32 %ofs, 1
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt1_s_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <3 x i8> %data, i32 1
ret i8 %elt1
}
; CHECK-LABEL: @extract_elt1_s_buffer_load_v4i8(
; CHECK-NEXT: %1 = add i32 %ofs, 1
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <4 x i8> %data, i32 1
ret i8 %elt1
}
; CHECK-LABEL: @extract_elt3_s_buffer_load_v4i8(
; CHECK-NEXT: %1 = add i32 %ofs, 3
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32> %rsrc, i32 %1, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt3_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
%elt1 = extractelement <4 x i8> %data, i32 3
ret i8 %elt1
}
; CHECK-LABEL: @extract_elt0_elt1_s_buffer_load_v4i8(
; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
; CHECK-NEXT: ret <2 x i8>
define amdgpu_ps <2 x i8> @extract_elt0_elt1_s_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %ofs) #0 {
%data = call <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32> %rsrc, i32 %ofs, i32 0)
%shuf = shufflevector <4 x i8> %data, <4 x i8> undef, <2 x i32> <i32 0, i32 1>
ret <2 x i8> %shuf
}
declare i8 @llvm.amdgcn.s.buffer.load.i8(<4 x i32>, i32, i32) #1
declare <2 x i8> @llvm.amdgcn.s.buffer.load.v2i8(<4 x i32>, i32, i32) #1
declare <3 x i8> @llvm.amdgcn.s.buffer.load.v3i8(<4 x i32>, i32, i32) #1
declare <4 x i8> @llvm.amdgcn.s.buffer.load.v4i8(<4 x i32>, i32, i32) #1
; --------------------------------------------------------------------
; llvm.amdgcn.raw.buffer.load.format
; --------------------------------------------------------------------
@ -901,9 +1385,9 @@ define amdgpu_ps float @extract_elt0_struct_buffer_load_v2f32(<4 x i32> inreg %r
}
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f32(
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
; CHECK-NEXT: ret float %elt1
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt1_struct_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <2 x float> %data, i32 1
@ -920,9 +1404,9 @@ define amdgpu_ps float @extract_elt0_struct_buffer_load_v4f32(<4 x i32> inreg %r
}
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f32(
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
; CHECK-NEXT: ret float %elt1
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x float> %data, i32 1
@ -930,9 +1414,9 @@ define amdgpu_ps float @extract_elt1_struct_buffer_load_v4f32(<4 x i32> inreg %r
}
; CHECK-LABEL: @extract_elt2_struct_buffer_load_v4f32(
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
; CHECK-NEXT: ret float %elt1
; CHECK-NEXT: %1 = add i32 %ofs, 8
; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x float> %data, i32 2
@ -940,9 +1424,9 @@ define amdgpu_ps float @extract_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %r
}
; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f32(
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 3
; CHECK-NEXT: ret float %elt1
; CHECK-NEXT: %1 = add i32 %ofs, 12
; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x float> %data, i32 3
@ -959,9 +1443,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v4f32(<4 x i3
}
; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v4f32(
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: ret <2 x float> %shuf
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret <2 x float> %data
define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
@ -969,9 +1453,9 @@ define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v4f32(<4 x i3
}
; CHECK-LABEL: @extract_elt2_elt3_struct_buffer_load_v4f32(
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
; CHECK-NEXT: ret <2 x float> %shuf
; CHECK-NEXT: %1 = add i32 %ofs, 8
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret <2 x float> %data
define amdgpu_ps <2 x float> @extract_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
@ -988,9 +1472,9 @@ define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_struct_buffer_load_v4f32(<4
}
; CHECK-LABEL: @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(
; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
; CHECK-NEXT: ret <3 x float> %shuf
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret <3 x float> %data
define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_struct_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
@ -1017,9 +1501,9 @@ define amdgpu_ps float @extract_elt0_struct_buffer_load_v3f32(<4 x i32> inreg %r
}
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f32(
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
; CHECK-NEXT: ret float %elt1
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <3 x float> %data, i32 1
@ -1027,9 +1511,9 @@ define amdgpu_ps float @extract_elt1_struct_buffer_load_v3f32(<4 x i32> inreg %r
}
; CHECK-LABEL: @extract_elt2_struct_buffer_load_v3f32(
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 2
; CHECK-NEXT: ret float %elt1
; CHECK-NEXT: %1 = add i32 %ofs, 8
; CHECK-NEXT: %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret float %data
define amdgpu_ps float @extract_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <3 x float> %data, i32 2
@ -1046,9 +1530,9 @@ define amdgpu_ps <2 x float> @extract_elt0_elt1_struct_buffer_load_v3f32(<4 x i3
}
; CHECK-LABEL: @extract_elt1_elt2_struct_buffer_load_v3f32(
; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
; CHECK-NEXT: ret <2 x float> %shuf
; CHECK-NEXT: %1 = add i32 %ofs, 4
; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.struct.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret <2 x float> %data
define amdgpu_ps <2 x float> @extract_elt1_elt2_struct_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <3 x float> @llvm.amdgcn.struct.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
@ -1094,6 +1578,132 @@ declare <4 x float> @llvm.amdgcn.struct.buffer.load.v4f32(<4 x i32>, i32, i32, i
declare <4 x i32> @llvm.amdgcn.struct.buffer.load.v4i32(<4 x i32>, i32, i32, i32, i32) #1
; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2f16(
; CHECK: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt0_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt0 = extractelement <2 x half> %data, i32 0
ret half %elt0
}
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2f16(
; CHECK-NEXT: %1 = add i32 %ofs, 2
; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt1_struct_buffer_load_v2f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <2 x half> %data, i32 1
ret half %elt1
}
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3f16(
; CHECK-NEXT: %1 = add i32 %ofs, 2
; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt1_struct_buffer_load_v3f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <3 x half> %data, i32 1
ret half %elt1
}
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4f16(
; CHECK-NEXT: %1 = add i32 %ofs, 2
; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x half> %data, i32 1
ret half %elt1
}
; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4f16(
; CHECK-NEXT: %1 = add i32 %ofs, 6
; CHECK-NEXT: %data = call half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret half %data
define amdgpu_ps half @extract_elt3_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x half> %data, i32 3
ret half %elt1
}
; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4f16(
; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: ret <2 x half>
define amdgpu_ps <2 x half> @extract_elt0_elt1_struct_buffer_load_v4f16(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%shuf = shufflevector <4 x half> %data, <4 x half> undef, <2 x i32> <i32 0, i32 1>
ret <2 x half> %shuf
}
declare half @llvm.amdgcn.struct.buffer.load.f16(<4 x i32>, i32, i32, i32, i32) #1
declare <2 x half> @llvm.amdgcn.struct.buffer.load.v2f16(<4 x i32>, i32, i32, i32, i32) #1
declare <3 x half> @llvm.amdgcn.struct.buffer.load.v3f16(<4 x i32>, i32, i32, i32, i32) #1
declare <4 x half> @llvm.amdgcn.struct.buffer.load.v4f16(<4 x i32>, i32, i32, i32, i32) #1
; CHECK-LABEL: @extract_elt0_struct_buffer_load_v2i8(
; CHECK: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt0_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt0 = extractelement <2 x i8> %data, i32 0
ret i8 %elt0
}
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v2i8(
; CHECK-NEXT: %1 = add i32 %ofs, 1
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v2i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <2 x i8> %data, i32 1
ret i8 %elt1
}
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v3i8(
; CHECK-NEXT: %1 = add i32 %ofs, 1
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v3i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <3 x i8> %data, i32 1
ret i8 %elt1
}
; CHECK-LABEL: @extract_elt1_struct_buffer_load_v4i8(
; CHECK-NEXT: %1 = add i32 %ofs, 1
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x i8> %data, i32 1
ret i8 %elt1
}
; CHECK-LABEL: @extract_elt3_struct_buffer_load_v4i8(
; CHECK-NEXT: %1 = add i32 %ofs, 3
; CHECK-NEXT: %data = call i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32> %rsrc, i32 %idx, i32 %1, i32 %sofs, i32 0)
; CHECK-NEXT: ret i8 %data
define amdgpu_ps i8 @extract_elt3_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%elt1 = extractelement <4 x i8> %data, i32 3
ret i8 %elt1
}
; CHECK-LABEL: @extract_elt0_elt1_struct_buffer_load_v4i8(
; CHECK-NEXT: %data = call <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
; CHECK-NEXT: ret <2 x i8>
define amdgpu_ps <2 x i8> @extract_elt0_elt1_struct_buffer_load_v4i8(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs, i32 %sofs) #0 {
%data = call <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i32 %sofs, i32 0)
%shuf = shufflevector <4 x i8> %data, <4 x i8> undef, <2 x i32> <i32 0, i32 1>
ret <2 x i8> %shuf
}
declare i8 @llvm.amdgcn.struct.buffer.load.i8(<4 x i32>, i32, i32, i32, i32) #1
declare <2 x i8> @llvm.amdgcn.struct.buffer.load.v2i8(<4 x i32>, i32, i32, i32, i32) #1
declare <3 x i8> @llvm.amdgcn.struct.buffer.load.v3i8(<4 x i32>, i32, i32, i32, i32) #1
declare <4 x i8> @llvm.amdgcn.struct.buffer.load.v4i8(<4 x i32>, i32, i32, i32, i32) #1
; --------------------------------------------------------------------
; llvm.amdgcn.struct.buffer.load.format
; --------------------------------------------------------------------