[AMDGPU] Prefer v_fmac over v_fma only when no source modifiers are used

v_fmac with source modifiers forces VOP3 encoding. In that case it is
strictly better to use the VOP3-only v_fma instead: $dst and $src2 are
not tied, which gives the register allocator more freedom and avoids a
copy in some cases.

This is the same strategy we already use for v_mad vs v_mac and
v_fma_legacy vs v_fmac_legacy.

Differential Revision: https://reviews.llvm.org/D110070
This commit is contained in:
Jay Foad 2021-09-20 14:20:28 +01:00
parent e83629280f
commit 86dcb59206
14 changed files with 194 additions and 202 deletions

View File

@ -2305,31 +2305,37 @@ let SubtargetPredicate = NotHasMinMaxDenormModes in {
let OtherPredicates = [HasDLInsts] in {
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
def : GCNPat <
(fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
(f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
(fma (f32 (VOP3NoMods f32:$src0)),
(f32 (VOP3NoMods f32:$src1)),
(f32 (VOP3NoMods f32:$src2))),
(V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
(V_FMAC_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
} // End OtherPredicates = [HasDLInsts]
let SubtargetPredicate = isGFX10Plus in
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
def : GCNPat <
(fma (f16 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
(f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
(fma (f16 (VOP3NoMods f32:$src0)),
(f16 (VOP3NoMods f32:$src1)),
(f16 (VOP3NoMods f32:$src2))),
(V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
let SubtargetPredicate = isGFX90APlus in
// Don't allow source modifiers. If there are any source modifiers then it's
// better to select fma instead of fmac.
def : GCNPat <
(fma (f64 (VOP3Mods0 f64:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
(f64 (VOP3Mods f64:$src1, i32:$src1_modifiers)),
(fma (f64 (VOP3NoMods f64:$src0)),
(f64 (VOP3NoMods f64:$src1)),
(f64 (VOP3NoMods f64:$src2))),
(V_FMAC_F64_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
SRCMODS.NONE, $src2, $clamp, $omod)
(V_FMAC_F64_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2)
>;
// COPY is a workaround for a tablegen bug with multiple outputs

View File

@ -90,10 +90,10 @@ define float @v_fdiv_f32(float %a, float %b) {
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3
; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5
; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4
; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@ -108,11 +108,11 @@ define float @v_fdiv_f32(float %a, float %b) {
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4
; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5
; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX10-FLUSH-NEXT: s_denorm_mode 0
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v4, v3, v5
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv float %a, %b
@ -194,10 +194,10 @@ define float @v_fdiv_f32_ulp25(float %a, float %b) {
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3
; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5
; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4
; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@ -295,10 +295,10 @@ define float @v_rcp_f32(float %x) {
; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2
; GFX10-IEEE-NEXT: v_fma_f32 v5, v3, -v1, v4
; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v3, v4
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v4, -v1, v3
; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v4, v2, v3
; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v3, v4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v3
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@ -313,11 +313,11 @@ define float @v_rcp_f32(float %x) {
; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX10-FLUSH-NEXT: v_fma_f32 v5, v4, -v1, v3
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v1, v4
; GFX10-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX10-FLUSH-NEXT: s_denorm_mode 0
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v3, v2, v4
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv float 1.0, %x
@ -403,10 +403,10 @@ define float @v_rcp_f32_arcp(float %x) {
; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2
; GFX10-IEEE-NEXT: v_fma_f32 v5, v3, -v1, v4
; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v3, v4
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v4, -v1, v3
; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v4, v2, v3
; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v3, v4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v3
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@ -421,11 +421,11 @@ define float @v_rcp_f32_arcp(float %x) {
; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
; GFX10-FLUSH-NEXT: v_fma_f32 v5, v4, -v1, v3
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v1, v4
; GFX10-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX10-FLUSH-NEXT: s_denorm_mode 0
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v3, v2, v4
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp float 1.0, %x
@ -566,10 +566,10 @@ define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3
; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5
; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4
; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@ -722,15 +722,15 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10
; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10
; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
@ -746,24 +746,24 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v5
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
; GFX10-FLUSH-NEXT: v_fma_f32 v8, v7, -v4, v6
; GFX10-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v7, v8, v5
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v6, -v4, v7
; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
; GFX10-FLUSH-NEXT: s_denorm_mode 0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v3, v3, v1
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v5, v6, v5, v7
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v4
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v5, v2, v0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v6, s4, v3, v3, v1
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v6
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1
; GFX10-FLUSH-NEXT: s_denorm_mode 3
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v6, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v6, v5, v6
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v6
; GFX10-FLUSH-NEXT: v_fma_f32 v7, v5, -v4, v2
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v6
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v2, -v4, v5
; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v6, v5, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v4, v5
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v5
; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v6, v4, v2
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v7, v5
; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v6, v4, v2
; GFX10-FLUSH-NEXT: s_denorm_mode 0
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v6, v5
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v5, v4
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x float> %a, %b
@ -884,15 +884,15 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10
; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10
; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
@ -1054,15 +1054,15 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5
; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5
; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8
; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6
; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8
; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7
; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8
; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9
; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
@ -1078,24 +1078,24 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4
; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5
; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX10-FLUSH-NEXT: s_denorm_mode 0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, 1.0
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v3, v4, v3, v5
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, 1.0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0
; GFX10-FLUSH-NEXT: s_denorm_mode 3
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v3
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v2, v5
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3
; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2
; GFX10-FLUSH-NEXT: s_denorm_mode 0
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v5
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x float> <float 1.0, float 1.0>, %x
@ -1236,15 +1236,15 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5
; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5
; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8
; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6
; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8
; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7
; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8
; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9
; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
@ -1260,24 +1260,24 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4
; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5
; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX10-FLUSH-NEXT: s_denorm_mode 0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, 1.0
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v3, v4, v3, v5
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, 1.0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0
; GFX10-FLUSH-NEXT: s_denorm_mode 3
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v3
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v2, v5
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3
; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2
; GFX10-FLUSH-NEXT: s_denorm_mode 0
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v5
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
@ -1475,15 +1475,15 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10
; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10
; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]

View File

@ -498,7 +498,7 @@ define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_fma_f32 v0, v1, |v0|, v2
; GFX10-NEXT: v_fma_f32 v0, |v0|, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fabs.x = call float @llvm.fabs.f32(float %x)
%fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z)
@ -528,7 +528,7 @@ define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_fma_f32 v0, |v1|, v0, v2
; GFX10-NEXT: v_fma_f32 v0, v0, |v1|, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fabs.y = call float @llvm.fabs.f32(float %y)
%fma = call float @llvm.fma.f32(float %x, float %fabs.y, float %z)
@ -558,7 +558,7 @@ define float @v_fma_f32_fabs_lhs_rhs(float %x, float %y, float %z) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_fma_f32 v0, |v1|, |v0|, v2
; GFX10-NEXT: v_fma_f32 v0, |v0|, |v1|, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%fabs.x = call float @llvm.fabs.f32(float %x)
%fabs.y = call float @llvm.fabs.f32(float %y)
@ -668,7 +668,7 @@ define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_fma_f32 v0, v1, -v0, v2
; GFX10-NEXT: v_fma_f32 v0, -v0, v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg float %x
%fma = call float @llvm.fma.f32(float %neg.x, float %y, float %z)
@ -698,7 +698,7 @@ define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) {
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_fma_f32 v0, -v1, v0, v2
; GFX10-NEXT: v_fma_f32 v0, v0, -v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.y = fneg float %y
%fma = call float @llvm.fma.f32(float %x, float %neg.y, float %z)

View File

@ -59,13 +59,13 @@ body: |
; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-DL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX9-DL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX9-DL-NEXT: S_ENDPGM 0, implicit %4
; GFX10-LABEL: name: fma_f32_fneg_src0
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit %4
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1
@ -96,13 +96,13 @@ body: |
; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9-DL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX9-DL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX9-DL-NEXT: S_ENDPGM 0, implicit %4
; GFX10-LABEL: name: fma_f32_fneg_src1
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
; GFX10-NEXT: S_ENDPGM 0, implicit %4
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = COPY $vgpr1

View File

@ -60,13 +60,13 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
; GCN-NEXT: v_mac_f32_e32 v10, v7, v6
; GCN-NEXT: v_mul_f32_e32 v1, v8, v6
; GCN-NEXT: v_mul_f32_e32 v7, v6, v3
; GCN-NEXT: v_fmac_f32_e64 v9, -v6, v3
; GCN-NEXT: v_fma_f32 v3, -v6, v3, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_f32_e32 v3, v4, v10
; GCN-NEXT: v_add_f32_e32 v4, v4, v10
; GCN-NEXT: v_fma_f32 v0, v2, s26, -v1
; GCN-NEXT: v_fmac_f32_e32 v7, v3, v6
; GCN-NEXT: v_mul_f32_e32 v3, v4, v6
; GCN-NEXT: v_fma_f32 v4, v5, s0, 0x3ca3d70a
; GCN-NEXT: v_fmac_f32_e32 v7, v9, v6
; GCN-NEXT: v_mul_f32_e32 v3, v3, v6
; GCN-NEXT: v_fmac_f32_e32 v1, v0, v6
; GCN-NEXT: v_mul_f32_e32 v0, v2, v6
; GCN-NEXT: v_mul_f32_e32 v2, v7, v4

View File

@ -59,9 +59,9 @@ entry:
; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]]
; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]]
; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GFX10-NOT: s_denorm_mode
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
@ -331,9 +331,9 @@ entry:
; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]]
; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]]
; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
; GFX10-NOT: s_denorm_mode
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]

View File

@ -1,6 +1,6 @@
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A -check-prefix=FUNC %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SI %s
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX90A %s
declare double @llvm.fma.f64(double, double, double) nounwind readnone
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
@ -55,8 +55,7 @@ define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x doubl
}
; FUNC-LABEL: {{^}}fma_f64_abs_src0:
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\]}}
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fma_f64_abs_src0(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1
@ -69,8 +68,7 @@ define amdgpu_kernel void @fma_f64_abs_src0(double addrspace(1)* %out, double ad
}
; FUNC-LABEL: {{^}}fma_f64_abs_src1:
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}}
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fma_f64_abs_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1
@ -97,8 +95,7 @@ define amdgpu_kernel void @fma_f64_abs_src2(double addrspace(1)* %out, double ad
}
; FUNC-LABEL: {{^}}fma_f64_neg_src0:
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fma_f64_neg_src0(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1
@ -111,8 +108,7 @@ define amdgpu_kernel void @fma_f64_neg_src0(double addrspace(1)* %out, double ad
}
; FUNC-LABEL: {{^}}fma_f64_neg_src1:
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fma_f64_neg_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1
@ -139,8 +135,7 @@ define amdgpu_kernel void @fma_f64_neg_src2(double addrspace(1)* %out, double ad
}
; FUNC-LABEL: {{^}}fma_f64_abs_neg_src0:
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fma_f64_abs_neg_src0(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1
@ -154,8 +149,7 @@ define amdgpu_kernel void @fma_f64_abs_neg_src0(double addrspace(1)* %out, doubl
}
; FUNC-LABEL: {{^}}fma_f64_abs_neg_src1:
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}}
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @fma_f64_abs_neg_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
double addrspace(1)* %in2, double addrspace(1)* %in3) {
%r0 = load double, double addrspace(1)* %in1

View File

@ -71,7 +71,7 @@ define float @unsafe_fmul_fsub_distribute_fast_f32(float %arg0, float %arg1) #0
; FMAGFX10: ; %bb.0:
; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0
; FMAGFX10-NEXT: v_fmac_f32_e64 v0, -v1, v0
; FMAGFX10-NEXT: v_fma_f32 v0, -v1, v0, v0
; FMAGFX10-NEXT: s_setpc_b64 s[30:31]
;
; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
@ -84,7 +84,7 @@ define float @unsafe_fmul_fsub_distribute_fast_f32(float %arg0, float %arg1) #0
; FMADGFX10: ; %bb.0:
; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0
; FMADGFX10-NEXT: v_fmac_f32_e64 v0, -v1, v0
; FMADGFX10-NEXT: v_fma_f32 v0, -v1, v0, v0
; FMADGFX10-NEXT: s_setpc_b64 s[30:31]
%add = fsub fast float 1.0, %arg1
%tmp1 = fmul fast float %arg0, %add
@ -156,8 +156,8 @@ define <2 x float> @unsafe_fmul_fsub_distribute_fast_v2f32(<2 x float> %arg0, <2
; FMAGFX10: ; %bb.0:
; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0
; FMAGFX10-NEXT: v_fmac_f32_e64 v0, -v2, v0
; FMAGFX10-NEXT: v_fmac_f32_e64 v1, -v3, v1
; FMAGFX10-NEXT: v_fma_f32 v0, -v2, v0, v0
; FMAGFX10-NEXT: v_fma_f32 v1, -v3, v1, v1
; FMAGFX10-NEXT: s_setpc_b64 s[30:31]
;
; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
@ -171,8 +171,8 @@ define <2 x float> @unsafe_fmul_fsub_distribute_fast_v2f32(<2 x float> %arg0, <2
; FMADGFX10: ; %bb.0:
; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0
; FMADGFX10-NEXT: v_fmac_f32_e64 v0, -v2, v0
; FMADGFX10-NEXT: v_fmac_f32_e64 v1, -v3, v1
; FMADGFX10-NEXT: v_fma_f32 v0, -v2, v0, v0
; FMADGFX10-NEXT: v_fma_f32 v1, -v3, v1, v1
; FMADGFX10-NEXT: s_setpc_b64 s[30:31]
%add = fsub fast <2 x float> <float 1.0, float 1.0>, %arg1
%tmp1 = fmul fast <2 x float> %arg0, %add
@ -236,7 +236,7 @@ define <2 x float> @unsafe_fast_fmul_fsub_ditribute_post_legalize(float %arg0, <
; FMAGFX10: ; %bb.0:
; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0
; FMAGFX10-NEXT: v_fma_f32 v0, v1, -v0, v1
; FMAGFX10-NEXT: v_fma_f32 v0, -v0, v1, v1
; FMAGFX10-NEXT: s_setpc_b64 s[30:31]
;
; FMAD-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:

View File

@ -360,8 +360,7 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]]
; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
@ -370,9 +369,7 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out
; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]]
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
%tid.ext = sext i32 %tid to i64

View File

@ -156,7 +156,7 @@ define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)*
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
half addrspace(1)* %in2) #0 {
@ -280,7 +280,7 @@ define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace
; GFX10-NEXT: v_rcp_f16_e32 v3, v2
; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
half addrspace(1)* %in2) #0 {
@ -404,7 +404,7 @@ define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspa
; GFX10-NEXT: v_rcp_f16_e32 v3, v2
; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
half addrspace(1)* %in2) #1 {
@ -575,7 +575,7 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)
; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6
; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2
; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
float addrspace(1)* %in2) #0 {
@ -691,7 +691,7 @@ define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspa
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2
; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
float addrspace(1)* %in2) #0 {
@ -807,7 +807,7 @@ define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrs
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2
; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
float addrspace(1)* %in2) #1 {
@ -1534,22 +1534,21 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX10-NEXT: v_rcp_f32_e32 v4, v4
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4
; GFX10-NEXT: v_mov_b32_e32 v4, v1
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fmac_f16_e64 v4, -v3, v2
; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX10-NEXT: v_rcp_f32_e32 v5, v5
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
; GFX10-NEXT: v_pack_b32_f16 v1, v4, v1
; GFX10-NEXT: v_mul_f32_e32 v4, v4, v5
; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1
; GFX10-NEXT: v_trunc_f16_e32 v4, v4
; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1
; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
; GFX10-NEXT: s_endpgm
<2 x half> addrspace(1)* %in2) #0 {
@ -1899,42 +1898,40 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3
; GFX10-NEXT: v_rcp_f32_e32 v6, v6
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
; GFX10-NEXT: v_mov_b32_e32 v6, v1
; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v3
; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1
; GFX10-NEXT: v_rcp_f32_e32 v7, v7
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7
; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
; GFX10-NEXT: v_fmac_f16_e64 v1, -v5, v3
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX10-NEXT: v_mul_f32_e32 v6, v6, v7
; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6
; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1
; GFX10-NEXT: v_trunc_f16_e32 v6, v6
; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1
; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX10-NEXT: v_rcp_f32_e32 v5, v5
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-NEXT: v_mov_b32_e32 v5, v0
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fmac_f16_e64 v5, -v3, v2
; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX10-NEXT: v_rcp_f32_e32 v6, v6
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
; GFX10-NEXT: v_fmac_f16_e64 v0, -v3, v2
; GFX10-NEXT: v_pack_b32_f16 v0, v5, v0
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0
; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
<4 x half> addrspace(1)* %in2) #0 {
@ -2173,7 +2170,7 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8
; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1
; GFX10-NEXT: v_trunc_f32_e32 v5, v5
; GFX10-NEXT: v_fma_f32 v1, v3, -v5, v1
; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1
; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0
; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0
; GFX10-NEXT: v_rcp_f32_e32 v6, v5
@ -2188,7 +2185,7 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7
; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
; GFX10-NEXT: v_fmac_f32_e64 v0, -v3, v2
; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
<2 x float> addrspace(1)* %in2) #0 {
@ -2547,7 +2544,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12
; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3
; GFX10-NEXT: v_trunc_f32_e32 v9, v9
; GFX10-NEXT: v_fma_f32 v3, v7, -v9, v3
; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3
; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2
; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2
; GFX10-NEXT: v_rcp_f32_e32 v10, v9
@ -2562,7 +2559,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11
; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2
; GFX10-NEXT: v_trunc_f32_e32 v7, v7
; GFX10-NEXT: v_fma_f32 v2, v6, -v7, v2
; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2
; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1
; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1
; GFX10-NEXT: v_rcp_f32_e32 v9, v7
@ -2577,7 +2574,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10
; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1
; GFX10-NEXT: v_trunc_f32_e32 v6, v6
; GFX10-NEXT: v_fma_f32 v1, v5, -v6, v1
; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1
; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0
; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0
; GFX10-NEXT: v_rcp_f32_e32 v7, v6
@ -2592,7 +2589,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9
; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0
; GFX10-NEXT: v_trunc_f32_e32 v5, v5
; GFX10-NEXT: v_fmac_f32_e64 v0, -v5, v4
; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
; GFX10-NEXT: s_endpgm
<4 x float> addrspace(1)* %in2) #0 {

View File

@ -356,7 +356,7 @@ define float @no_mix_simple(float %src0, float %src1, float %src2) #0 {
; GCN: s_waitcnt
; CIVI-NEXT: v_mad_f32 v0, |v0|, v1, v2
; GFX900-NEXT: v_mad_f32 v0, |v0|, v1, v2
; GFX906-NEXT: v_fma_f32 v0, v1, |v0|, v2
; GFX906-NEXT: v_fma_f32 v0, |v0|, v1, v2
; GCN-NEXT: s_setpc_b64
define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
%src0.fabs = call float @llvm.fabs.f32(float %src0)

View File

@ -176,8 +176,7 @@ define half @v_constained_fma_f16_fpexcept_strict_fneg_fneg(half %x, half %y, ha
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_fmac_f16_e64 v2, -v0, -v1
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: v_fma_f16 v0, -v0, -v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg half %x
%neg.y = fneg half %y
@ -196,8 +195,7 @@ define half @v_constained_fma_f16_fpexcept_strict_fabs_fabs(half %x, half %y, ha
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_fmac_f16_e64 v2, |v0|, |v1|
; GFX10-NEXT: v_mov_b32_e32 v0, v2
; GFX10-NEXT: v_fma_f16 v0, |v0|, |v1|, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = call half @llvm.fabs.f16(half %x)
%neg.y = call half @llvm.fabs.f16(half %y)

View File

@ -111,7 +111,7 @@ define float @v_constained_fma_f32_fpexcept_strict_fneg_fneg(float %x, float %y,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_fma_f32 v0, -v1, -v0, v2
; GFX10-NEXT: v_fma_f32 v0, -v0, -v1, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg float %x
%neg.y = fneg float %y
@ -130,7 +130,7 @@ define float @v_constained_fma_f32_fpexcept_strict_fabs_fabs(float %x, float %y,
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_fma_f32 v0, |v1|, |v0|, v2
; GFX10-NEXT: v_fma_f32 v0, |v0|, |v1|, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = call float @llvm.fabs.f32(float %x)
%neg.y = call float @llvm.fabs.f32(float %y)
@ -150,8 +150,8 @@ define <2 x float> @v_constained_fma_v2f32_fpexcept_strict_fneg_fneg(<2 x float>
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: v_fma_f32 v0, -v2, -v0, v4
; GFX10-NEXT: v_fma_f32 v1, -v3, -v1, v5
; GFX10-NEXT: v_fma_f32 v0, -v0, -v2, v4
; GFX10-NEXT: v_fma_f32 v1, -v1, -v3, v5
; GFX10-NEXT: s_setpc_b64 s[30:31]
%neg.x = fneg <2 x float> %x
%neg.y = fneg <2 x float> %y

View File

@ -187,7 +187,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; GCN-LABEL: {{^}}fdiv_test_denormals
; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
; GFX1030: v_fmac_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
; GFX1030: v_fma_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
bb:
%tmp = load i8, i8 addrspace(1)* null, align 1