forked from OSchip/llvm-project
[AMDGPU] Only match correct type for a16
Addresses are floats when a sampler is present and unsigned integers when no sampler is present. Therefore, for integer addresses only zext instructions, not sext instructions, should match. Also match integer constants that can be truncated to 16 bits without losing information. Differential Revision: https://reviews.llvm.org/D118043
This commit is contained in:
parent
7cc3e141d7
commit
4ed7c6eec9
|
@ -58,24 +58,37 @@ static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
|
|||
|
||||
// Check if a value can be converted to a 16-bit value without losing
|
||||
// precision.
|
||||
static bool canSafelyConvertTo16Bit(Value &V) {
|
||||
// The value is expected to be either a float (IsFloat = true) or an unsigned
|
||||
// integer (IsFloat = false).
|
||||
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
|
||||
Type *VTy = V.getType();
|
||||
if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
|
||||
// The value is already 16-bit, so we don't want to convert to 16-bit again!
|
||||
return false;
|
||||
}
|
||||
if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
|
||||
// We need to check that if we cast the index down to a half, we do not lose
|
||||
// precision.
|
||||
APFloat FloatValue(ConstFloat->getValueAPF());
|
||||
bool LosesInfo = true;
|
||||
FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
|
||||
return !LosesInfo;
|
||||
if (IsFloat) {
|
||||
if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
|
||||
// We need to check that if we cast the index down to a half, we do not
|
||||
// lose precision.
|
||||
APFloat FloatValue(ConstFloat->getValueAPF());
|
||||
bool LosesInfo = true;
|
||||
FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
|
||||
&LosesInfo);
|
||||
return !LosesInfo;
|
||||
}
|
||||
} else {
|
||||
if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
|
||||
// We need to check that if we cast the index down to an i16, we do not
|
||||
// lose precision.
|
||||
APInt IntValue(ConstInt->getValue());
|
||||
return IntValue.getActiveBits() <= 16;
|
||||
}
|
||||
}
|
||||
|
||||
Value *CastSrc;
|
||||
if (match(&V, m_FPExt(PatternMatch::m_Value(CastSrc))) ||
|
||||
match(&V, m_SExt(PatternMatch::m_Value(CastSrc))) ||
|
||||
match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)))) {
|
||||
bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
|
||||
: match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
|
||||
if (IsExt) {
|
||||
Type *CastSrcTy = CastSrc->getType();
|
||||
if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
|
||||
return true;
|
||||
|
@ -203,6 +216,10 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
|
|||
if (!ST->hasA16() && !ST->hasG16())
|
||||
return None;
|
||||
|
||||
// Address is interpreted as float if the instruction has a sampler or as
|
||||
// unsigned int if there is no sampler.
|
||||
bool HasSampler =
|
||||
AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
|
||||
bool FloatCoord = false;
|
||||
// true means derivatives can be converted to 16 bit, coordinates not
|
||||
bool OnlyDerivatives = false;
|
||||
|
@ -211,7 +228,7 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
|
|||
OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
|
||||
Value *Coord = II.getOperand(OperandIndex);
|
||||
// If the values are not derived from 16-bit values, we cannot optimize.
|
||||
if (!canSafelyConvertTo16Bit(*Coord)) {
|
||||
if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
|
||||
if (OperandIndex < ImageDimIntr->CoordStart ||
|
||||
ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
|
||||
return None;
|
||||
|
@ -232,7 +249,9 @@ simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
|
|||
// Check if there is a bias parameter and if it can be converted to f16
|
||||
if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
|
||||
Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
|
||||
if (!canSafelyConvertTo16Bit(*Bias))
|
||||
assert(HasSampler &&
|
||||
"Only image instructions with a sampler can have a bias");
|
||||
if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
|
||||
OnlyDerivatives = true;
|
||||
}
|
||||
|
||||
|
|
|
@ -3667,6 +3667,105 @@ define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_V2(<2 x float> addrspa
|
|||
ret void
|
||||
}
|
||||
|
||||
; Sample (sampler present) with a floating-point constant coordinate: the
; derivative and coordinate operands are fpext'd from half, except for one
; coordinate that is the constant 0.25. The expected output is the .f16.f16
; variant of the intrinsic, with the constant emitted as half 0xH3400 and the
; fpext instructions gone.
define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const(<2 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %slice) {
|
||||
; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_const(
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f16(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], half [[S:%.*]], half 0xH3400, half [[SLICE:%.*]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
|
||||
; CHECK-NEXT: store <2 x float> [[RES]], <2 x float> addrspace(1)* [[OUT:%.*]], align 8
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%dsdh32 = fpext half %dsdh to float
|
||||
%dtdh32 = fpext half %dtdh to float
|
||||
%dsdv32 = fpext half %dsdv to float
|
||||
%dtdv32 = fpext half %dtdv to float
|
||||
%s32 = fpext half %s to float
|
||||
%slice32 = fpext half %slice to float
|
||||
%res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float 0.25, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
|
||||
store <2 x float> %res, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Negative counterpart of the constant-coordinate sample test: one coordinate
; is the constant 1.0e+10. The expected output converts only the derivatives
; to half (the .f16.f32 variant); the coordinates stay float, so the fpext
; instructions for %s and %slice and the float constant remain.
define amdgpu_kernel void @image_sample_a16_c_d_o_2darray_const_noopt(<2 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, <4 x i32> inreg %samp, i32 %offset, float %zcompare, half %dsdh, half %dtdh, half %dsdv, half %dtdv, half %s, half %slice) {
|
||||
; CHECK-LABEL: @image_sample_a16_c_d_o_2darray_const_noopt(
|
||||
; CHECK-NEXT: [[S32:%.*]] = fpext half [[S:%.*]] to float
|
||||
; CHECK-NEXT: [[SLICE32:%.*]] = fpext half [[SLICE:%.*]] to float
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f16.f32(i32 6, i32 [[OFFSET:%.*]], float [[ZCOMPARE:%.*]], half [[DSDH:%.*]], half [[DTDH:%.*]], half [[DSDV:%.*]], half [[DTDV:%.*]], float [[S32]], float 1.000000e+10, float [[SLICE32]], <8 x i32> [[RSRC:%.*]], <4 x i32> [[SAMP:%.*]], i1 false, i32 0, i32 0)
|
||||
; CHECK-NEXT: store <2 x float> [[RES]], <2 x float> addrspace(1)* [[OUT:%.*]], align 8
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%dsdh32 = fpext half %dsdh to float
|
||||
%dtdh32 = fpext half %dtdh to float
|
||||
%dsdv32 = fpext half %dsdv to float
|
||||
%dtdv32 = fpext half %dtdv to float
|
||||
%s32 = fpext half %s to float
|
||||
%slice32 = fpext half %slice to float
|
||||
%res = call <2 x float> @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh32, float %dtdh32, float %dsdv32, float %dtdv32, float %s32, float 1.0e+10, float %slice32, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 0, i32 0)
|
||||
store <2 x float> %res, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Load (no sampler) whose coordinate is a zext from i16. The expected output
; is a non-mip image.load.1d call in its .i16 variant that takes the original
; i16 value directly, with the zext removed.
define amdgpu_kernel void @image_load_a16_mip_1d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
|
||||
; CHECK-LABEL: @image_load_a16_mip_1d(
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i16(i32 15, i16 [[S:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
|
||||
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%s32 = zext i16 %s to i32
|
||||
%res = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s32, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
store <4 x float> %res, <4 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Negative counterpart of @image_load_a16_mip_1d: the coordinate is a sext
; (not a zext) from i16. The expected output keeps the sext and the i32
; coordinate type; only the mip form is replaced by a plain image.load.1d.
define amdgpu_kernel void @image_load_a16_mip_1d_noopt(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
|
||||
; CHECK-LABEL: @image_load_a16_mip_1d_noopt(
|
||||
; CHECK-NEXT: [[S32:%.*]] = sext i16 [[S:%.*]] to i32
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.1d.v4f32.i32(i32 15, i32 [[S32]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
|
||||
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%s32 = sext i16 %s to i32
|
||||
%res = call <4 x float> @llvm.amdgcn.image.load.mip.1d.v4f32.i32(i32 15, i32 %s32, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
store <4 x float> %res, <4 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Two-coordinate load (no sampler) where both coordinates are zext from i16.
; The expected output is the .i16 variant of image.load.2d taking both i16
; values directly, with both zext instructions removed.
define amdgpu_kernel void @image_load_a16_mip_2d(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s, i16 %t) {
|
||||
; CHECK-LABEL: @image_load_a16_mip_2d(
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S:%.*]], i16 [[T:%.*]], <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
|
||||
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%s32 = zext i16 %s to i32
|
||||
%t32 = zext i16 %t to i32
|
||||
%res = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s32, i32 %t32, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
store <4 x float> %res, <4 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Load (no sampler) with an integer constant coordinate: one coordinate is a
; zext from i16 and the other is the constant 65535. The expected output is
; the .i16 variant of image.load.2d, with the constant printed as i16 -1.
define amdgpu_kernel void @image_load_a16_mip_2d_const(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
|
||||
; CHECK-LABEL: @image_load_a16_mip_2d_const(
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i16(i32 15, i16 [[S:%.*]], i16 -1, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
|
||||
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%s32 = zext i16 %s to i32
|
||||
%res = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s32, i32 65535, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
store <4 x float> %res, <4 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; Negative counterpart of @image_load_a16_mip_2d_const: the constant
; coordinate is 65536. The expected output keeps the zext and the i32
; coordinates; only the mip form is replaced by a plain image.load.2d.
define amdgpu_kernel void @image_load_a16_mip_2d_const_noopt(<4 x float> addrspace(1)* %out, <8 x i32> inreg %rsrc, i16 %s) {
|
||||
; CHECK-LABEL: @image_load_a16_mip_2d_const_noopt(
|
||||
; CHECK-NEXT: [[S32:%.*]] = zext i16 [[S:%.*]] to i32
|
||||
; CHECK-NEXT: [[RES:%.*]] = call <4 x float> @llvm.amdgcn.image.load.2d.v4f32.i32(i32 15, i32 [[S32]], i32 65536, <8 x i32> [[RSRC:%.*]], i32 0, i32 0)
|
||||
; CHECK-NEXT: store <4 x float> [[RES]], <4 x float> addrspace(1)* [[OUT:%.*]], align 16
|
||||
; CHECK-NEXT: ret void
|
||||
;
|
||||
%s32 = zext i16 %s to i32
|
||||
%res = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %s32, i32 65536, i32 0, <8 x i32> %rsrc, i32 0, i32 0)
|
||||
store <4 x float> %res, <4 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; --------------------------------------------------------------------
|
||||
; llvm.amdgcn.image.sample g16
|
||||
; --------------------------------------------------------------------
|
||||
|
|
Loading…
Reference in New Issue