forked from OSchip/llvm-project
[AMDGPU] Don't combine memory intrs to v3i16
v3i16 and v3f16 currently cannot be legalized and lowered so they should not be emitted by inst combining. Moved the check down to still allow extracting 1 or 2 elements via the dmask. Fixes image intrinsics being combined to return v3x16. Differential Revision: https://reviews.llvm.org/D84223
This commit is contained in:
parent
cf9fa2aa01
commit
2c659082bd
|
@ -1030,12 +1030,6 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
|
|||
APInt DemandedElts,
|
||||
int DMaskIdx) {
|
||||
|
||||
// FIXME: Allow v3i16/v3f16 in buffer intrinsics when the types are fully supported.
|
||||
if (DMaskIdx < 0 &&
|
||||
II->getType()->getScalarSizeInBits() != 32 &&
|
||||
DemandedElts.getActiveBits() == 3)
|
||||
return nullptr;
|
||||
|
||||
auto *IIVTy = cast<VectorType>(II->getType());
|
||||
unsigned VWidth = IIVTy->getNumElements();
|
||||
if (VWidth == 1)
|
||||
|
@ -1124,6 +1118,11 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
|
|||
if (!NewNumElts)
|
||||
return UndefValue::get(II->getType());
|
||||
|
||||
// FIXME: Allow v3i16/v3f16 in buffer and image intrinsics when the types are
|
||||
// fully supported.
|
||||
if (II->getType()->getScalarSizeInBits() == 16 && NewNumElts == 3)
|
||||
return nullptr;
|
||||
|
||||
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
|
||||
if (DMaskIdx >= 0)
|
||||
II->setArgOperand(DMaskIdx, Args[DMaskIdx]);
|
||||
|
|
|
@ -2965,6 +2965,64 @@ declare <4 x float> @llvm.amdgcn.image.sample.cd.1d.v4f32.f32.f32(i32, float, fl
|
|||
; llvm.amdgcn.image.sample.cd.cl
|
||||
; --------------------------------------------------------------------
|
||||
|
||||
; CHECK-LABEL: @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 8, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
%elt0 = extractelement <4 x half> %data, i32 3
|
||||
ret half %elt0
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 4, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
%elt0 = extractelement <4 x half> %data, i32 2
|
||||
ret half %elt0
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 2, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
define amdgpu_ps half @extract_elt1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
%elt0 = extractelement <4 x half> %data, i32 1
|
||||
ret half %elt0
|
||||
}
|
||||
|
||||
; FIXME: Enable load shortening when full support for v3f16 has been added (should expect call <3 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v3f16.f32.f32).
|
||||
; CHECK-LABEL: @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(
|
||||
; CHECK-NEXT: %data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
; CHECK-NEXT: %res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
|
||||
; CHECK-NEXT: ret <4 x half> %res
|
||||
define amdgpu_ps <4 x half> @extract_elt_to3_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
%res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
|
||||
ret <4 x half> %res
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(
|
||||
; CHECK-NEXT: %data = call <2 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v2f16.f32.f32(i32 3, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
; CHECK-NEXT: %res = shufflevector <2 x half> %data, <2 x half> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
|
||||
; CHECK-NEXT: ret <4 x half> %res
|
||||
define amdgpu_ps <4 x half> @extract_elt_to2_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
%res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
|
||||
ret <4 x half> %res
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
; CHECK-NEXT: %res = insertelement <4 x half> undef, half %data, i64 0
|
||||
; CHECK-NEXT: ret <4 x half> %res
|
||||
define amdgpu_ps <4 x half> @extract_elt_to1_image_sample_cd_cl_1d_v4f16_f32_f32(float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> inreg %sampler, <4 x i32> inreg %rsrc) #0 {
|
||||
%data = call <4 x half> @llvm.amdgcn.image.sample.cd.cl.1d.v4f16.f32.f32(i32 15, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
%res = shufflevector <4 x half> %data, <4 x half> undef, <4 x i32> <i32 0, i32 4, i32 5, i32 6>
|
||||
ret <4 x half> %res
|
||||
}
|
||||
|
||||
; CHECK-LABEL: @extract_elt0_image_sample_cd_cl_1d_v4f16_f32_f32(
|
||||
; CHECK-NEXT: %data = call half @llvm.amdgcn.image.sample.cd.cl.1d.f16.f32.f32(i32 1, float %dsdh, float %dsdv, float %s, float %clamp, <8 x i32> %sampler, <4 x i32> %rsrc, i1 false, i32 0, i32 0)
|
||||
; CHECK-NEXT: ret half %data
|
||||
|
|
Loading…
Reference in New Issue