[AMDGPU][GlobalISel] Legalize G_MUL for non-standard types

Legalizing G_MUL for non-standard types (like i33) generated an error. Fix this
by using minScalar and maxScalar instead of clampScalar. Also add a new rule
that, instead of widening to the next power of 2, widens to the next multiple
of the passed argument (32 in this case); for example, i65 is now widened to
i96 instead of i128.

Patch by: Mateja Marjanovic

Differential Revision: https://reviews.llvm.org/D109228
This commit is contained in:
Mirko Brkusanin 2021-09-07 16:25:04 +02:00
parent 5263bf583a
commit 6c4b634da6
6 changed files with 149 additions and 21 deletions

View File

@ -301,6 +301,10 @@ LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size);
/// type that's wider than the given size.
LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size);
/// True iff the specified type index is a scalar whose size in bits is not a
/// multiple of \p Size. Non-scalar types never satisfy this predicate.
LegalityPredicate sizeNotMultipleOf(unsigned TypeIdx, unsigned Size);
/// True iff the specified type index is a scalar whose size is not a power of
/// 2.
LegalityPredicate sizeNotPow2(unsigned TypeIdx);
@ -356,6 +360,11 @@ LegalizeMutation changeElementSizeTo(unsigned TypeIdx, unsigned FromTypeIdx);
/// next power of 2.
LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min = 0);
/// Widen the scalar type or vector element type for the given type index to
/// the next multiple of \p Size. For example, with \p Size == 32 an s65 is
/// widened to s96 rather than to s128.
LegalizeMutation widenScalarOrEltToNextMultipleOf(unsigned TypeIdx,
unsigned Size);
/// Add more elements to the type for the given type index to the next power of
/// 2.
LegalizeMutation moreElementsToNextPow2(unsigned TypeIdx, unsigned Min = 0);
@ -836,6 +845,16 @@ public:
LegalizeMutations::widenScalarOrEltToNextPow2(TypeIdx, MinSize));
}
/// Add a WidenScalar rule that rounds the scalar at \p TypeIdx up to the next
/// multiple of \p Size. The rule does not fire for non-scalar types or for
/// scalars whose size is already a multiple of \p Size.
LegalizeRuleSet &widenScalarToNextMultipleOf(unsigned TypeIdx,
                                             unsigned Size) {
  // Fires only for scalars whose width is not a multiple of Size.
  LegalityPredicate NotMultiple =
      LegalityPredicates::sizeNotMultipleOf(typeIdx(TypeIdx), Size);
  // Produces the widened replacement type for the same type index.
  LegalizeMutation RoundUp =
      LegalizeMutations::widenScalarOrEltToNextMultipleOf(TypeIdx, Size);
  return actionIf(LegalizeAction::WidenScalar, NotMultiple, RoundUp);
}
/// Widen the scalar or vector element type to the next power of two that is
/// at least MinSize. No effect if the scalar size is a power of two.
LegalizeRuleSet &widenScalarOrEltToNextPow2(unsigned TypeIdx,

View File

@ -153,6 +153,14 @@ LegalityPredicate LegalityPredicates::scalarOrEltSizeNotPow2(unsigned TypeIdx) {
};
}
/// Build a predicate that holds when the type at \p TypeIdx is a scalar whose
/// size in bits leaves a non-zero remainder when divided by \p Size.
LegalityPredicate LegalityPredicates::sizeNotMultipleOf(unsigned TypeIdx,
                                                        unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT Ty = Query.Types[TypeIdx];
    // Only scalars are considered; everything else is rejected up front.
    if (!Ty.isScalar())
      return false;
    return (Ty.getSizeInBits() % Size) != 0;
  };
}
LegalityPredicate LegalityPredicates::sizeNotPow2(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];

View File

@ -63,6 +63,16 @@ LegalizeMutation LegalizeMutations::widenScalarOrEltToNextPow2(unsigned TypeIdx,
};
}
/// Build a mutation that replaces the type at \p TypeIdx with one whose
/// scalar/element size is aligned up to \p Size bits.
LegalizeMutation
LegalizeMutations::widenScalarOrEltToNextMultipleOf(unsigned TypeIdx,
                                                    unsigned Size) {
  return [=](const LegalityQuery &Query) {
    const LLT OrigTy = Query.Types[TypeIdx];
    // Align the scalar (or element) width to Size bits.
    unsigned RoundedBits = alignTo(OrigTy.getScalarSizeInBits(), Size);
    const LLT WidenedTy = OrigTy.changeElementSize(RoundedBits);
    return std::make_pair(TypeIdx, WidenedTy);
  };
}
LegalizeMutation LegalizeMutations::moreElementsToNextPow2(unsigned TypeIdx,
unsigned Min) {
return [=](const LegalityQuery &Query) {

View File

@ -532,10 +532,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Full set of gfx9 features.
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16, V2S16})
.clampScalar(0, S16, S32)
.minScalar(0, S16)
.clampMaxNumElements(0, S16, 2)
.scalarize(0)
.widenScalarToNextPow2(0, 32);
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32)
.scalarize(0);
getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
.legalFor({S32, S16, V2S16}) // Clamp modifier
@ -547,9 +548,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
} else if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16})
.clampScalar(0, S16, S32)
.scalarize(0)
.widenScalarToNextPow2(0, 32); // FIXME: min should be 16
.minScalar(0, S16)
.widenScalarToNextMultipleOf(0, 32)
.maxScalar(0, S32)
.scalarize(0);
// Technically the saturating operations require clamp bit support, but this
// was introduced at the same time as 16-bit operations.
@ -569,6 +571,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32})
.widenScalarToNextMultipleOf(0, 32)
.clampScalar(0, S32, S32)
.scalarize(0);

View File

@ -500,21 +500,58 @@ body: |
$vgpr0 = COPY %5
...
# FIXME:
# ---
# name: test_mul_s33
# body: |
# bb.0:
# liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
# %0:_(s64) = COPY $vgpr0_vgpr1
# %1:_(s64) = COPY $vgpr2_vgpr3
# %2:_(s33) = G_TRUNC %0
# %3:_(s33) = G_TRUNC %1
# %4:_(s33) = G_MUL %2, %3
# %5:_(s64) = G_ANYEXT %4
# $vgpr0_vgpr1 = COPY %5
# ...
---
name: test_mul_s33
body: |
bb.0:
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3
; GFX6-LABEL: name: test_mul_s33
; GFX6: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; GFX6: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
; GFX6: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; GFX6: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
; GFX6: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]]
; GFX6: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]]
; GFX6: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
; GFX6: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]]
; GFX6: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]]
; GFX6: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]]
; GFX6: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32)
; GFX6: $vgpr0_vgpr1 = COPY [[MV]](s64)
; GFX8-LABEL: name: test_mul_s33
; GFX8: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; GFX8: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
; GFX8: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; GFX8: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
; GFX8: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]]
; GFX8: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]]
; GFX8: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
; GFX8: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]]
; GFX8: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]]
; GFX8: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]]
; GFX8: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32)
; GFX8: $vgpr0_vgpr1 = COPY [[MV]](s64)
; GFX9-LABEL: name: test_mul_s33
; GFX9: [[COPY:%[0-9]+]]:_(s64) = COPY $vgpr0_vgpr1
; GFX9: [[COPY1:%[0-9]+]]:_(s64) = COPY $vgpr2_vgpr3
; GFX9: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY]](s64)
; GFX9: [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[COPY1]](s64)
; GFX9: [[MUL:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV2]]
; GFX9: [[MUL1:%[0-9]+]]:_(s32) = G_MUL [[UV1]], [[UV2]]
; GFX9: [[MUL2:%[0-9]+]]:_(s32) = G_MUL [[UV]], [[UV3]]
; GFX9: [[UMULH:%[0-9]+]]:_(s32) = G_UMULH [[UV]], [[UV2]]
; GFX9: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[MUL1]], [[MUL2]]
; GFX9: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[ADD]], [[UMULH]]
; GFX9: [[MV:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[MUL]](s32), [[ADD1]](s32)
; GFX9: $vgpr0_vgpr1 = COPY [[MV]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s33) = G_TRUNC %0
%3:_(s33) = G_TRUNC %1
%4:_(s33) = G_MUL %2, %3
%5:_(s64) = G_ANYEXT %4
$vgpr0_vgpr1 = COPY %5
...
---
name: test_mul_s96

View File

@ -276,6 +276,57 @@ define <2 x i32> @v_mul_v2i32(<2 x i32> %num, <2 x i32> %den) {
ret <2 x i32> %result
}
define amdgpu_cs i33 @s_mul_i33(i33 inreg %num, i33 inreg %den) {
; GFX7-LABEL: s_mul_i33:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mov_b32_e32 v0, s2
; GFX7-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX7-NEXT: s_mul_i32 s4, s0, s2
; GFX7-NEXT: s_mul_i32 s1, s1, s2
; GFX7-NEXT: s_mul_i32 s0, s0, s3
; GFX7-NEXT: s_add_i32 s1, s1, s0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s1, v0
; GFX7-NEXT: v_readfirstlane_b32 s1, v0
; GFX7-NEXT: s_mov_b32 s0, s4
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_mul_i33:
; GFX8: ; %bb.0:
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
; GFX8-NEXT: s_mul_i32 s4, s0, s2
; GFX8-NEXT: s_mul_i32 s1, s1, s2
; GFX8-NEXT: s_mul_i32 s0, s0, s3
; GFX8-NEXT: s_add_i32 s1, s1, s0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s1, v0
; GFX8-NEXT: v_readfirstlane_b32 s1, v0
; GFX8-NEXT: s_mov_b32 s0, s4
; GFX8-NEXT: ; return to shader part epilog
;
; GFX9-LABEL: s_mul_i33:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_mul_i32 s1, s1, s2
; GFX9-NEXT: s_mul_i32 s3, s0, s3
; GFX9-NEXT: s_mul_i32 s4, s0, s2
; GFX9-NEXT: s_mul_hi_u32 s0, s0, s2
; GFX9-NEXT: s_add_i32 s1, s1, s3
; GFX9-NEXT: s_add_i32 s1, s1, s0
; GFX9-NEXT: s_mov_b32 s0, s4
; GFX9-NEXT: ; return to shader part epilog
;
; GFX10-LABEL: s_mul_i33:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_mul_i32 s1, s1, s2
; GFX10-NEXT: s_mul_i32 s3, s0, s3
; GFX10-NEXT: s_mul_hi_u32 s4, s0, s2
; GFX10-NEXT: s_add_i32 s1, s1, s3
; GFX10-NEXT: s_mul_i32 s0, s0, s2
; GFX10-NEXT: s_add_i32 s1, s1, s4
; GFX10-NEXT: ; return to shader part epilog
%result = mul i33 %num, %den
ret i33 %result
}
define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
; GFX7-LABEL: s_mul_i64:
; GFX7: ; %bb.0: