forked from OSchip/llvm-project
[AMDGPU] Prefer v_fmac over v_fma only when no source modifiers are used
Selecting v_fmac with source modifiers forces the VOP3 encoding. In that case it is strictly better to select the VOP3-only v_fma instead: its $dst and $src2 operands are not tied, which gives the register allocator more freedom and avoids a copy in some cases. This is the same strategy we already use for v_mad vs v_mac and v_fma_legacy vs v_fmac_legacy. Differential Revision: https://reviews.llvm.org/D110070
This commit is contained in:
parent
e83629280f
commit
86dcb59206
|
@ -2305,31 +2305,37 @@ let SubtargetPredicate = NotHasMinMaxDenormModes in {
|
|||
|
||||
|
||||
let OtherPredicates = [HasDLInsts] in {
|
||||
// Don't allow source modifiers. If there are any source modifiers then it's
|
||||
// better to select fma instead of fmac.
|
||||
def : GCNPat <
|
||||
(fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
|
||||
(f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
|
||||
(fma (f32 (VOP3NoMods f32:$src0)),
|
||||
(f32 (VOP3NoMods f32:$src1)),
|
||||
(f32 (VOP3NoMods f32:$src2))),
|
||||
(V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
|
||||
(V_FMAC_F32_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
|
||||
SRCMODS.NONE, $src2)
|
||||
>;
|
||||
} // End OtherPredicates = [HasDLInsts]
|
||||
|
||||
let SubtargetPredicate = isGFX10Plus in
|
||||
// Don't allow source modifiers. If there are any source modifiers then it's
|
||||
// better to select fma instead of fmac.
|
||||
def : GCNPat <
|
||||
(fma (f16 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
|
||||
(f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
|
||||
(fma (f16 (VOP3NoMods f32:$src0)),
|
||||
(f16 (VOP3NoMods f32:$src1)),
|
||||
(f16 (VOP3NoMods f32:$src2))),
|
||||
(V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
|
||||
(V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
|
||||
SRCMODS.NONE, $src2)
|
||||
>;
|
||||
|
||||
let SubtargetPredicate = isGFX90APlus in
|
||||
// Don't allow source modifiers. If there are any source modifiers then it's
|
||||
// better to select fma instead of fmac.
|
||||
def : GCNPat <
|
||||
(fma (f64 (VOP3Mods0 f64:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
|
||||
(f64 (VOP3Mods f64:$src1, i32:$src1_modifiers)),
|
||||
(fma (f64 (VOP3NoMods f64:$src0)),
|
||||
(f64 (VOP3NoMods f64:$src1)),
|
||||
(f64 (VOP3NoMods f64:$src2))),
|
||||
(V_FMAC_F64_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
|
||||
SRCMODS.NONE, $src2, $clamp, $omod)
|
||||
(V_FMAC_F64_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
|
||||
SRCMODS.NONE, $src2)
|
||||
>;
|
||||
|
||||
// COPY is workaround tablegen bug from multiple outputs
|
||||
|
|
|
@ -90,10 +90,10 @@ define float @v_fdiv_f32(float %a, float %b) {
|
|||
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
|
||||
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -108,11 +108,11 @@ define float @v_fdiv_f32(float %a, float %b) {
|
|||
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
|
||||
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
|
||||
; GFX10-FLUSH-NEXT: s_denorm_mode 0
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v4, v3, v5
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
|
||||
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
|
||||
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
|
||||
%fdiv = fdiv float %a, %b
|
||||
|
@ -194,10 +194,10 @@ define float @v_fdiv_f32_ulp25(float %a, float %b) {
|
|||
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
|
||||
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -295,10 +295,10 @@ define float @v_rcp_f32(float %x) {
|
|||
; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v5, v3, -v1, v4
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v3, v4
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v4, -v1, v3
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v4, v2, v3
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v3, v4
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v3
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
|
||||
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -313,11 +313,11 @@ define float @v_rcp_f32(float %x) {
|
|||
; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
|
||||
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v5, v4, -v1, v3
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v1, v4
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
|
||||
; GFX10-FLUSH-NEXT: s_denorm_mode 0
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v3, v2, v4
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
|
||||
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
|
||||
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
|
||||
%fdiv = fdiv float 1.0, %x
|
||||
|
@ -403,10 +403,10 @@ define float @v_rcp_f32_arcp(float %x) {
|
|||
; GFX10-IEEE-NEXT: v_fma_f32 v3, -v1, v2, 1.0
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v2, v3, v2
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v2
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v5, v3, -v1, v4
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v5, -v1, v3, v4
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v2
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v4, -v1, v3
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v4, v2, v3
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v1, -v1, v3, v4
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v3
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
|
||||
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -421,11 +421,11 @@ define float @v_rcp_f32_arcp(float %x) {
|
|||
; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v2, v4, v2
|
||||
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v5, v4, -v1, v3
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v2
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v1, v4
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
|
||||
; GFX10-FLUSH-NEXT: s_denorm_mode 0
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v3, v2, v4
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
|
||||
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
|
||||
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
|
||||
%fdiv = fdiv arcp float 1.0, %x
|
||||
|
@ -566,10 +566,10 @@ define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
|
|||
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v2, v3, 1.0
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v4, v3
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v5, v3
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v6, v4, -v2, v5
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v4, v5
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v6, v3
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v5, -v2, v4
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v5, v3, v4
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v4, v5
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v4
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
|
||||
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
|
@ -722,15 +722,15 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
|
|||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9
|
||||
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
|
||||
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -746,24 +746,24 @@ define <2 x float> @v_fdiv_v2f32(<2 x float> %a, <2 x float> %b) {
|
|||
; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v5
|
||||
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v8, v7, -v4, v6
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v7, v8, v5
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v6, -v4, v7
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
|
||||
; GFX10-FLUSH-NEXT: s_denorm_mode 0
|
||||
; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v3, v3, v1
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v5, v6, v5, v7
|
||||
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v6, v4
|
||||
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v5, v2, v0
|
||||
; GFX10-FLUSH-NEXT: v_div_scale_f32 v6, s4, v3, v3, v1
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
|
||||
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v6
|
||||
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
|
||||
; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, v1, v3, v1
|
||||
; GFX10-FLUSH-NEXT: s_denorm_mode 3
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v6, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v6, v5, v6
|
||||
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v6
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v7, v5, -v4, v2
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v7, v6
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v2, -v4, v5
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v4, -v6, v5, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v4, v5
|
||||
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v5
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v7, -v6, v4, v2
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v7, v5
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v6, v4, v2
|
||||
; GFX10-FLUSH-NEXT: s_denorm_mode 0
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v6, v5
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v5, v4
|
||||
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
|
||||
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
|
||||
%fdiv = fdiv <2 x float> %a, %b
|
||||
|
@ -884,15 +884,15 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
|
|||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9
|
||||
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
|
||||
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -1054,15 +1054,15 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
|
|||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7
|
||||
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
|
||||
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -1078,24 +1078,24 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
|
|||
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
|
||||
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
|
||||
; GFX10-FLUSH-NEXT: s_denorm_mode 0
|
||||
; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v3, v4, v3, v5
|
||||
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
|
||||
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
|
||||
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4
|
||||
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0
|
||||
; GFX10-FLUSH-NEXT: s_denorm_mode 3
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4
|
||||
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v3
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v2, v5
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
|
||||
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2
|
||||
; GFX10-FLUSH-NEXT: s_denorm_mode 0
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v5
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
|
||||
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
|
||||
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
|
||||
%fdiv = fdiv <2 x float> <float 1.0, float 1.0>, %x
|
||||
|
@ -1236,15 +1236,15 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
|
|||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v7, v5
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v8, v4
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v6, v5
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v10, v7, -v2, v8
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v11, v9, -v3, v6
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v10, -v2, v7, v8
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v6
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v10, v4
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v11, v5
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v2, v7
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v6, -v3, v9
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v8, v4, v7
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v8
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v6
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v7
|
||||
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v6, v5, v9
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
|
||||
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
|
||||
|
@ -1260,24 +1260,24 @@ define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
|
|||
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
|
||||
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v4
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v4, -v2, v5
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
|
||||
; GFX10-FLUSH-NEXT: s_denorm_mode 0
|
||||
; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v3, v4, v3, v5
|
||||
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
|
||||
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_div_scale_f32 v3, vcc_lo, 1.0, v1, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, s4, v1, v1, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
|
||||
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v4
|
||||
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, vcc_lo, 1.0, v1, 1.0
|
||||
; GFX10-FLUSH-NEXT: s_denorm_mode 3
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v4, v5, v4
|
||||
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v6, v5, -v2, v3
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v4
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e64 v3, -v2, v5
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v4, v3, 1.0
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
|
||||
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v3
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v4, v5, v2
|
||||
; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
|
||||
; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v4, v5, v2
|
||||
; GFX10-FLUSH-NEXT: s_denorm_mode 0
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v4, v5
|
||||
; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
|
||||
; GFX10-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
|
||||
; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
|
||||
%fdiv = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
|
||||
|
@ -1475,15 +1475,15 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
|
|||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v7, v9, v7
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v10, v6
|
||||
; GFX10-IEEE-NEXT: v_mul_f32_e32 v11, v8, v7
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v12, v9, -v4, v10
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v13, v11, -v5, v8
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v12, -v4, v9, v10
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v8
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v9, v12, v6
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e32 v11, v13, v7
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v10, -v4, v9
|
||||
; GFX10-IEEE-NEXT: v_fmac_f32_e64 v8, -v5, v11
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v10, v6, v9
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v4, -v4, v9, v10
|
||||
; GFX10-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v8
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v4, v4, v6, v9
|
||||
; GFX10-IEEE-NEXT: s_mov_b32 vcc_lo, s4
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v8, v7, v11
|
||||
; GFX10-IEEE-NEXT: v_div_fmas_f32 v5, v5, v7, v11
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
|
||||
; GFX10-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
|
||||
; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
|
||||
|
|
|
@ -498,7 +498,7 @@ define float @v_fma_f32_fabs_lhs(float %x, float %y, float %z) {
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_fma_f32 v0, v1, |v0|, v2
|
||||
; GFX10-NEXT: v_fma_f32 v0, |v0|, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%fabs.x = call float @llvm.fabs.f32(float %x)
|
||||
%fma = call float @llvm.fma.f32(float %fabs.x, float %y, float %z)
|
||||
|
@ -528,7 +528,7 @@ define float @v_fma_f32_fabs_rhs(float %x, float %y, float %z) {
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_fma_f32 v0, |v1|, v0, v2
|
||||
; GFX10-NEXT: v_fma_f32 v0, v0, |v1|, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%fabs.y = call float @llvm.fabs.f32(float %y)
|
||||
%fma = call float @llvm.fma.f32(float %x, float %fabs.y, float %z)
|
||||
|
@ -558,7 +558,7 @@ define float @v_fma_f32_fabs_lhs_rhs(float %x, float %y, float %z) {
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_fma_f32 v0, |v1|, |v0|, v2
|
||||
; GFX10-NEXT: v_fma_f32 v0, |v0|, |v1|, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%fabs.x = call float @llvm.fabs.f32(float %x)
|
||||
%fabs.y = call float @llvm.fabs.f32(float %y)
|
||||
|
@ -668,7 +668,7 @@ define float @v_fma_f32_fneg_lhs(float %x, float %y, float %z) {
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_fma_f32 v0, v1, -v0, v2
|
||||
; GFX10-NEXT: v_fma_f32 v0, -v0, v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.x = fneg float %x
|
||||
%fma = call float @llvm.fma.f32(float %neg.x, float %y, float %z)
|
||||
|
@ -698,7 +698,7 @@ define float @v_fma_f32_fneg_rhs(float %x, float %y, float %z) {
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_fma_f32 v0, -v1, v0, v2
|
||||
; GFX10-NEXT: v_fma_f32 v0, v0, -v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.y = fneg float %y
|
||||
%fma = call float @llvm.fma.f32(float %x, float %neg.y, float %z)
|
||||
|
|
|
@ -59,13 +59,13 @@ body: |
|
|||
; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX9-DL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX9-DL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GFX9-DL-NEXT: S_ENDPGM 0, implicit %4
|
||||
; GFX10-LABEL: name: fma_f32_fneg_src0
|
||||
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 1, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit %4
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
|
@ -96,13 +96,13 @@ body: |
|
|||
; GFX9-DL: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX9-DL-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX9-DL-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GFX9-DL-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GFX9-DL-NEXT: S_ENDPGM 0, implicit %4
|
||||
; GFX10-LABEL: name: fma_f32_fneg_src1
|
||||
; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
|
||||
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
|
||||
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
|
||||
; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMAC_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GFX10-NEXT: %4:vgpr_32 = nofpexcept V_FMA_F32_e64 0, [[COPY]], 1, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $mode, implicit $exec
|
||||
; GFX10-NEXT: S_ENDPGM 0, implicit %4
|
||||
%0:vgpr(s32) = COPY $vgpr0
|
||||
%1:vgpr(s32) = COPY $vgpr1
|
||||
|
|
|
@ -60,13 +60,13 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
|
|||
; GCN-NEXT: v_mac_f32_e32 v10, v7, v6
|
||||
; GCN-NEXT: v_mul_f32_e32 v1, v8, v6
|
||||
; GCN-NEXT: v_mul_f32_e32 v7, v6, v3
|
||||
; GCN-NEXT: v_fmac_f32_e64 v9, -v6, v3
|
||||
; GCN-NEXT: v_fma_f32 v3, -v6, v3, v9
|
||||
; GCN-NEXT: s_waitcnt vmcnt(0)
|
||||
; GCN-NEXT: v_add_f32_e32 v3, v4, v10
|
||||
; GCN-NEXT: v_add_f32_e32 v4, v4, v10
|
||||
; GCN-NEXT: v_fma_f32 v0, v2, s26, -v1
|
||||
; GCN-NEXT: v_fmac_f32_e32 v7, v3, v6
|
||||
; GCN-NEXT: v_mul_f32_e32 v3, v4, v6
|
||||
; GCN-NEXT: v_fma_f32 v4, v5, s0, 0x3ca3d70a
|
||||
; GCN-NEXT: v_fmac_f32_e32 v7, v9, v6
|
||||
; GCN-NEXT: v_mul_f32_e32 v3, v3, v6
|
||||
; GCN-NEXT: v_fmac_f32_e32 v1, v0, v6
|
||||
; GCN-NEXT: v_mul_f32_e32 v0, v2, v6
|
||||
; GCN-NEXT: v_mul_f32_e32 v2, v7, v4
|
||||
|
|
|
@ -59,9 +59,9 @@ entry:
|
|||
; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
|
||||
; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
|
||||
; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
|
||||
; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]]
|
||||
; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
|
||||
; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
|
||||
; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]]
|
||||
; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
|
||||
; GFX10-NOT: s_denorm_mode
|
||||
|
||||
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
|
||||
|
@ -331,9 +331,9 @@ entry:
|
|||
; GFX10: v_fmac_f32_e32 [[B:v[0-9]+]], [[A]], [[NUM_RCP]]
|
||||
; GFX10: v_div_scale_f32 [[DEN_SCALE:v[0-9]+]]
|
||||
; GFX10: v_mul_f32_e32 [[C:v[0-9]+]], [[DEN_SCALE]], [[B]]
|
||||
; GFX10: v_fma_f32 [[D:v[0-9]+]], [[C]], -[[NUM_SCALE]], [[DEN_SCALE]]
|
||||
; GFX10: v_fma_f32 [[D:v[0-9]+]], -[[NUM_SCALE]], [[C]], [[DEN_SCALE]]
|
||||
; GFX10: v_fmac_f32_e32 [[E:v[0-9]+]], [[D]], [[B]]
|
||||
; GFX10: v_fmac_f32_e64 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]]
|
||||
; GFX10: v_fma_f32 [[F:v[0-9]+]], -[[NUM_SCALE]], [[E]], [[DEN_SCALE]]
|
||||
; GFX10-NOT: s_denorm_mode
|
||||
|
||||
; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A -check-prefix=FUNC %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SI %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,SI %s
|
||||
; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -mcpu=gfx90a -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=FUNC,GCN,GFX90A %s
|
||||
|
||||
declare double @llvm.fma.f64(double, double, double) nounwind readnone
|
||||
declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
|
||||
|
@ -55,8 +55,7 @@ define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x doubl
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fma_f64_abs_src0:
|
||||
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
|
||||
; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\]}}
|
||||
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @fma_f64_abs_src0(double addrspace(1)* %out, double addrspace(1)* %in1,
|
||||
double addrspace(1)* %in2, double addrspace(1)* %in3) {
|
||||
%r0 = load double, double addrspace(1)* %in1
|
||||
|
@ -69,8 +68,7 @@ define amdgpu_kernel void @fma_f64_abs_src0(double addrspace(1)* %out, double ad
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fma_f64_abs_src1:
|
||||
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
|
||||
; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|}}
|
||||
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], \|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @fma_f64_abs_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
|
||||
double addrspace(1)* %in2, double addrspace(1)* %in3) {
|
||||
%r0 = load double, double addrspace(1)* %in1
|
||||
|
@ -97,8 +95,7 @@ define amdgpu_kernel void @fma_f64_abs_src2(double addrspace(1)* %out, double ad
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fma_f64_neg_src0:
|
||||
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
|
||||
; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
|
||||
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @fma_f64_neg_src0(double addrspace(1)* %out, double addrspace(1)* %in1,
|
||||
double addrspace(1)* %in2, double addrspace(1)* %in3) {
|
||||
%r0 = load double, double addrspace(1)* %in1
|
||||
|
@ -111,8 +108,7 @@ define amdgpu_kernel void @fma_f64_neg_src0(double addrspace(1)* %out, double ad
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fma_f64_neg_src1:
|
||||
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
|
||||
; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
|
||||
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @fma_f64_neg_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
|
||||
double addrspace(1)* %in2, double addrspace(1)* %in3) {
|
||||
%r0 = load double, double addrspace(1)* %in1
|
||||
|
@ -139,8 +135,7 @@ define amdgpu_kernel void @fma_f64_neg_src2(double addrspace(1)* %out, double ad
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fma_f64_abs_neg_src0:
|
||||
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
|
||||
; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
|
||||
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @fma_f64_abs_neg_src0(double addrspace(1)* %out, double addrspace(1)* %in1,
|
||||
double addrspace(1)* %in2, double addrspace(1)* %in3) {
|
||||
%r0 = load double, double addrspace(1)* %in1
|
||||
|
@ -154,8 +149,7 @@ define amdgpu_kernel void @fma_f64_abs_neg_src0(double addrspace(1)* %out, doubl
|
|||
}
|
||||
|
||||
; FUNC-LABEL: {{^}}fma_f64_abs_neg_src1:
|
||||
; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
|
||||
; GFX90A: v_fmac_f64_e64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}}
|
||||
; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|, v\[[0-9]+:[0-9]+\]}}
|
||||
define amdgpu_kernel void @fma_f64_abs_neg_src1(double addrspace(1)* %out, double addrspace(1)* %in1,
|
||||
double addrspace(1)* %in2, double addrspace(1)* %in3) {
|
||||
%r0 = load double, double addrspace(1)* %in1
|
||||
|
|
|
@ -71,7 +71,7 @@ define float @unsafe_fmul_fsub_distribute_fast_f32(float %arg0, float %arg1) #0
|
|||
; FMAGFX10: ; %bb.0:
|
||||
; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; FMAGFX10-NEXT: v_fmac_f32_e64 v0, -v1, v0
|
||||
; FMAGFX10-NEXT: v_fma_f32 v0, -v1, v0, v0
|
||||
; FMAGFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_f32:
|
||||
|
@ -84,7 +84,7 @@ define float @unsafe_fmul_fsub_distribute_fast_f32(float %arg0, float %arg1) #0
|
|||
; FMADGFX10: ; %bb.0:
|
||||
; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; FMADGFX10-NEXT: v_fmac_f32_e64 v0, -v1, v0
|
||||
; FMADGFX10-NEXT: v_fma_f32 v0, -v1, v0, v0
|
||||
; FMADGFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%add = fsub fast float 1.0, %arg1
|
||||
%tmp1 = fmul fast float %arg0, %add
|
||||
|
@ -156,8 +156,8 @@ define <2 x float> @unsafe_fmul_fsub_distribute_fast_v2f32(<2 x float> %arg0, <2
|
|||
; FMAGFX10: ; %bb.0:
|
||||
; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; FMAGFX10-NEXT: v_fmac_f32_e64 v0, -v2, v0
|
||||
; FMAGFX10-NEXT: v_fmac_f32_e64 v1, -v3, v1
|
||||
; FMAGFX10-NEXT: v_fma_f32 v0, -v2, v0, v0
|
||||
; FMAGFX10-NEXT: v_fma_f32 v1, -v3, v1, v1
|
||||
; FMAGFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; FMAD-LABEL: unsafe_fmul_fsub_distribute_fast_v2f32:
|
||||
|
@ -171,8 +171,8 @@ define <2 x float> @unsafe_fmul_fsub_distribute_fast_v2f32(<2 x float> %arg0, <2
|
|||
; FMADGFX10: ; %bb.0:
|
||||
; FMADGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; FMADGFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; FMADGFX10-NEXT: v_fmac_f32_e64 v0, -v2, v0
|
||||
; FMADGFX10-NEXT: v_fmac_f32_e64 v1, -v3, v1
|
||||
; FMADGFX10-NEXT: v_fma_f32 v0, -v2, v0, v0
|
||||
; FMADGFX10-NEXT: v_fma_f32 v1, -v3, v1, v1
|
||||
; FMADGFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%add = fsub fast <2 x float> <float 1.0, float 1.0>, %arg1
|
||||
%tmp1 = fmul fast <2 x float> %arg0, %add
|
||||
|
@ -236,7 +236,7 @@ define <2 x float> @unsafe_fast_fmul_fsub_ditribute_post_legalize(float %arg0, <
|
|||
; FMAGFX10: ; %bb.0:
|
||||
; FMAGFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; FMAGFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; FMAGFX10-NEXT: v_fma_f32 v0, v1, -v0, v1
|
||||
; FMAGFX10-NEXT: v_fma_f32 v0, -v0, v1, v1
|
||||
; FMAGFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
;
|
||||
; FMAD-LABEL: unsafe_fast_fmul_fsub_ditribute_post_legalize:
|
||||
|
|
|
@ -360,8 +360,7 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out
|
|||
; GCN: {{buffer|flat|global}}_load_ushort [[REGC:v[0-9]+]]
|
||||
; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
|
||||
|
||||
; VI-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
|
||||
; GFX10-DENORM-CONTRACT: v_fmac_f16_e64 [[REGC]], -[[REGA]], [[REGB]]
|
||||
; GCN-DENORM-CONTRACT: v_fma_f16 [[RESULT:v[0-9]+]], -[[REGA]], [[REGB]], [[REGC]]
|
||||
|
||||
; GCN-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
|
||||
; GCN-DENORM-STRICT: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
|
||||
|
@ -370,9 +369,7 @@ define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out
|
|||
|
||||
; GFX10-FLUSH: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGA]], [[REGB]]
|
||||
; GFX10-FLUSH: v_sub_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
|
||||
; GFX10-FLUSH: global_store_short v{{[0-9]+}}, [[RESULT]]
|
||||
; GFX10-DENORM-STRICT: global_store_short v{{[0-9]+}}, [[RESULT]]
|
||||
; GFX10-DENORM-CONTRACT: global_store_short v{{[0-9]+}}, [[REGC]]
|
||||
; GFX10: global_store_short v{{[0-9]+}}, [[RESULT]]
|
||||
define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
|
||||
%tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
|
||||
%tid.ext = sext i32 %tid to i64
|
||||
|
|
|
@ -156,7 +156,7 @@ define amdgpu_kernel void @frem_f16(half addrspace(1)* %out, half addrspace(1)*
|
|||
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
|
||||
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
|
||||
; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
|
||||
; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
|
||||
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
half addrspace(1)* %in2) #0 {
|
||||
|
@ -280,7 +280,7 @@ define amdgpu_kernel void @fast_frem_f16(half addrspace(1)* %out, half addrspace
|
|||
; GFX10-NEXT: v_rcp_f16_e32 v3, v2
|
||||
; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
|
||||
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
|
||||
; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
|
||||
; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
|
||||
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
half addrspace(1)* %in2) #0 {
|
||||
|
@ -404,7 +404,7 @@ define amdgpu_kernel void @unsafe_frem_f16(half addrspace(1)* %out, half addrspa
|
|||
; GFX10-NEXT: v_rcp_f16_e32 v3, v2
|
||||
; GFX10-NEXT: v_mul_f16_e32 v3, v1, v3
|
||||
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
|
||||
; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
|
||||
; GFX10-NEXT: v_fma_f16 v1, -v3, v2, v1
|
||||
; GFX10-NEXT: global_store_short v0, v1, s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
half addrspace(1)* %in2) #1 {
|
||||
|
@ -575,7 +575,7 @@ define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)
|
|||
; GFX10-NEXT: v_div_fmas_f32 v3, v3, v5, v6
|
||||
; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v1
|
||||
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
|
||||
; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2
|
||||
; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
float addrspace(1)* %in2) #0 {
|
||||
|
@ -691,7 +691,7 @@ define amdgpu_kernel void @fast_frem_f32(float addrspace(1)* %out, float addrspa
|
|||
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
|
||||
; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
|
||||
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
|
||||
; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2
|
||||
; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
float addrspace(1)* %in2) #0 {
|
||||
|
@ -807,7 +807,7 @@ define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrs
|
|||
; GFX10-NEXT: v_rcp_f32_e32 v3, v2
|
||||
; GFX10-NEXT: v_mul_f32_e32 v3, v1, v3
|
||||
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
|
||||
; GFX10-NEXT: v_fmac_f32_e64 v1, -v3, v2
|
||||
; GFX10-NEXT: v_fma_f32 v1, -v3, v2, v1
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
float addrspace(1)* %in2) #1 {
|
||||
|
@ -1534,22 +1534,21 @@ define amdgpu_kernel void @frem_v2f16(<2 x half> addrspace(1)* %out, <2 x half>
|
|||
; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v2
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v4, v4
|
||||
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v4
|
||||
; GFX10-NEXT: v_mov_b32_e32 v4, v1
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
|
||||
; GFX10-NEXT: v_fmac_f16_e64 v4, -v3, v2
|
||||
; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v4, v1
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v5, v5
|
||||
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v1
|
||||
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
|
||||
; GFX10-NEXT: v_fmac_f16_e64 v1, -v3, v2
|
||||
; GFX10-NEXT: v_pack_b32_f16 v1, v4, v1
|
||||
; GFX10-NEXT: v_mul_f32_e32 v4, v4, v5
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v4, v4
|
||||
; GFX10-NEXT: v_div_fixup_f16 v4, v4, v2, v1
|
||||
; GFX10-NEXT: v_trunc_f16_e32 v4, v4
|
||||
; GFX10-NEXT: v_fma_f16 v1, -v4, v2, v1
|
||||
; GFX10-NEXT: v_pack_b32_f16 v1, v3, v1
|
||||
; GFX10-NEXT: global_store_dword v0, v1, s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
<2 x half> addrspace(1)* %in2) #0 {
|
||||
|
@ -1899,42 +1898,40 @@ define amdgpu_kernel void @frem_v4f16(<4 x half> addrspace(1)* %out, <4 x half>
|
|||
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v3
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v6, v6
|
||||
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
|
||||
; GFX10-NEXT: v_mov_b32_e32 v6, v1
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
|
||||
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
|
||||
; GFX10-NEXT: v_fmac_f16_e64 v6, -v5, v3
|
||||
; GFX10-NEXT: v_fma_f16 v5, -v5, v3, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v3
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v1
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v1
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v7, v3
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v1
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v7, v7
|
||||
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v7
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
|
||||
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v3, v1
|
||||
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
|
||||
; GFX10-NEXT: v_fmac_f16_e64 v1, -v5, v3
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
|
||||
; GFX10-NEXT: v_mul_f32_e32 v6, v6, v7
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v6, v6
|
||||
; GFX10-NEXT: v_div_fixup_f16 v6, v6, v3, v1
|
||||
; GFX10-NEXT: v_trunc_f16_e32 v6, v6
|
||||
; GFX10-NEXT: v_fma_f16 v1, -v6, v3, v1
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
|
||||
; GFX10-NEXT: v_pack_b32_f16 v1, v6, v1
|
||||
; GFX10-NEXT: v_pack_b32_f16 v1, v5, v1
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v2
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v5, v5
|
||||
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v5
|
||||
; GFX10-NEXT: v_mov_b32_e32 v5, v0
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
|
||||
; GFX10-NEXT: v_fmac_f16_e64 v5, -v3, v2
|
||||
; GFX10-NEXT: v_fma_f16 v3, -v3, v2, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v2
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v3, v0
|
||||
; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v0
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v6, v2
|
||||
; GFX10-NEXT: v_cvt_f32_f16_e32 v5, v0
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v6, v6
|
||||
; GFX10-NEXT: v_mul_f32_e32 v3, v3, v6
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v3, v3
|
||||
; GFX10-NEXT: v_div_fixup_f16 v3, v3, v2, v0
|
||||
; GFX10-NEXT: v_trunc_f16_e32 v3, v3
|
||||
; GFX10-NEXT: v_fmac_f16_e64 v0, -v3, v2
|
||||
; GFX10-NEXT: v_pack_b32_f16 v0, v5, v0
|
||||
; GFX10-NEXT: v_mul_f32_e32 v5, v5, v6
|
||||
; GFX10-NEXT: v_cvt_f16_f32_e32 v5, v5
|
||||
; GFX10-NEXT: v_div_fixup_f16 v5, v5, v2, v0
|
||||
; GFX10-NEXT: v_trunc_f16_e32 v5, v5
|
||||
; GFX10-NEXT: v_fma_f16 v0, -v5, v2, v0
|
||||
; GFX10-NEXT: v_pack_b32_f16 v0, v3, v0
|
||||
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
<4 x half> addrspace(1)* %in2) #0 {
|
||||
|
@ -2173,7 +2170,7 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
|
|||
; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v8
|
||||
; GFX10-NEXT: v_div_fixup_f32 v5, v5, v3, v1
|
||||
; GFX10-NEXT: v_trunc_f32_e32 v5, v5
|
||||
; GFX10-NEXT: v_fma_f32 v1, v3, -v5, v1
|
||||
; GFX10-NEXT: v_fma_f32 v1, -v5, v3, v1
|
||||
; GFX10-NEXT: v_div_scale_f32 v5, s0, v2, v2, v0
|
||||
; GFX10-NEXT: v_div_scale_f32 v3, vcc_lo, v0, v2, v0
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v6, v5
|
||||
|
@ -2188,7 +2185,7 @@ define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float
|
|||
; GFX10-NEXT: v_div_fmas_f32 v3, v3, v6, v7
|
||||
; GFX10-NEXT: v_div_fixup_f32 v3, v3, v2, v0
|
||||
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
|
||||
; GFX10-NEXT: v_fmac_f32_e64 v0, -v3, v2
|
||||
; GFX10-NEXT: v_fma_f32 v0, -v3, v2, v0
|
||||
; GFX10-NEXT: global_store_dwordx2 v4, v[0:1], s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
<2 x float> addrspace(1)* %in2) #0 {
|
||||
|
@ -2547,7 +2544,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
|
|||
; GFX10-NEXT: v_div_fmas_f32 v9, v9, v11, v12
|
||||
; GFX10-NEXT: v_div_fixup_f32 v9, v9, v7, v3
|
||||
; GFX10-NEXT: v_trunc_f32_e32 v9, v9
|
||||
; GFX10-NEXT: v_fma_f32 v3, v7, -v9, v3
|
||||
; GFX10-NEXT: v_fma_f32 v3, -v9, v7, v3
|
||||
; GFX10-NEXT: v_div_scale_f32 v9, s0, v6, v6, v2
|
||||
; GFX10-NEXT: v_div_scale_f32 v7, vcc_lo, v2, v6, v2
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v10, v9
|
||||
|
@ -2562,7 +2559,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
|
|||
; GFX10-NEXT: v_div_fmas_f32 v7, v7, v10, v11
|
||||
; GFX10-NEXT: v_div_fixup_f32 v7, v7, v6, v2
|
||||
; GFX10-NEXT: v_trunc_f32_e32 v7, v7
|
||||
; GFX10-NEXT: v_fma_f32 v2, v6, -v7, v2
|
||||
; GFX10-NEXT: v_fma_f32 v2, -v7, v6, v2
|
||||
; GFX10-NEXT: v_div_scale_f32 v7, s0, v5, v5, v1
|
||||
; GFX10-NEXT: v_div_scale_f32 v6, vcc_lo, v1, v5, v1
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v9, v7
|
||||
|
@ -2577,7 +2574,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
|
|||
; GFX10-NEXT: v_div_fmas_f32 v6, v6, v9, v10
|
||||
; GFX10-NEXT: v_div_fixup_f32 v6, v6, v5, v1
|
||||
; GFX10-NEXT: v_trunc_f32_e32 v6, v6
|
||||
; GFX10-NEXT: v_fma_f32 v1, v5, -v6, v1
|
||||
; GFX10-NEXT: v_fma_f32 v1, -v6, v5, v1
|
||||
; GFX10-NEXT: v_div_scale_f32 v6, s0, v4, v4, v0
|
||||
; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v4, v0
|
||||
; GFX10-NEXT: v_rcp_f32_e32 v7, v6
|
||||
|
@ -2592,7 +2589,7 @@ define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float
|
|||
; GFX10-NEXT: v_div_fmas_f32 v5, v5, v7, v9
|
||||
; GFX10-NEXT: v_div_fixup_f32 v5, v5, v4, v0
|
||||
; GFX10-NEXT: v_trunc_f32_e32 v5, v5
|
||||
; GFX10-NEXT: v_fmac_f32_e64 v0, -v5, v4
|
||||
; GFX10-NEXT: v_fma_f32 v0, -v5, v4, v0
|
||||
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[4:5]
|
||||
; GFX10-NEXT: s_endpgm
|
||||
<4 x float> addrspace(1)* %in2) #0 {
|
||||
|
|
|
@ -356,7 +356,7 @@ define float @no_mix_simple(float %src0, float %src1, float %src2) #0 {
|
|||
; GCN: s_waitcnt
|
||||
; CIVI-NEXT: v_mad_f32 v0, |v0|, v1, v2
|
||||
; GFX900-NEXT: v_mad_f32 v0, |v0|, v1, v2
|
||||
; GFX906-NEXT: v_fma_f32 v0, v1, |v0|, v2
|
||||
; GFX906-NEXT: v_fma_f32 v0, |v0|, v1, v2
|
||||
; GCN-NEXT: s_setpc_b64
|
||||
define float @no_mix_simple_fabs(float %src0, float %src1, float %src2) #0 {
|
||||
%src0.fabs = call float @llvm.fabs.f32(float %src0)
|
||||
|
|
|
@ -176,8 +176,7 @@ define half @v_constained_fma_f16_fpexcept_strict_fneg_fneg(half %x, half %y, ha
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_fmac_f16_e64 v2, -v0, -v1
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX10-NEXT: v_fma_f16 v0, -v0, -v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.x = fneg half %x
|
||||
%neg.y = fneg half %y
|
||||
|
@ -196,8 +195,7 @@ define half @v_constained_fma_f16_fpexcept_strict_fabs_fabs(half %x, half %y, ha
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_fmac_f16_e64 v2, |v0|, |v1|
|
||||
; GFX10-NEXT: v_mov_b32_e32 v0, v2
|
||||
; GFX10-NEXT: v_fma_f16 v0, |v0|, |v1|, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.x = call half @llvm.fabs.f16(half %x)
|
||||
%neg.y = call half @llvm.fabs.f16(half %y)
|
||||
|
|
|
@ -111,7 +111,7 @@ define float @v_constained_fma_f32_fpexcept_strict_fneg_fneg(float %x, float %y,
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_fma_f32 v0, -v1, -v0, v2
|
||||
; GFX10-NEXT: v_fma_f32 v0, -v0, -v1, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.x = fneg float %x
|
||||
%neg.y = fneg float %y
|
||||
|
@ -130,7 +130,7 @@ define float @v_constained_fma_f32_fpexcept_strict_fabs_fabs(float %x, float %y,
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_fma_f32 v0, |v1|, |v0|, v2
|
||||
; GFX10-NEXT: v_fma_f32 v0, |v0|, |v1|, v2
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.x = call float @llvm.fabs.f32(float %x)
|
||||
%neg.y = call float @llvm.fabs.f32(float %y)
|
||||
|
@ -150,8 +150,8 @@ define <2 x float> @v_constained_fma_v2f32_fpexcept_strict_fneg_fneg(<2 x float>
|
|||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
|
||||
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
|
||||
; GFX10-NEXT: v_fma_f32 v0, -v2, -v0, v4
|
||||
; GFX10-NEXT: v_fma_f32 v1, -v3, -v1, v5
|
||||
; GFX10-NEXT: v_fma_f32 v0, -v0, -v2, v4
|
||||
; GFX10-NEXT: v_fma_f32 v1, -v1, -v3, v5
|
||||
; GFX10-NEXT: s_setpc_b64 s[30:31]
|
||||
%neg.x = fneg <2 x float> %x
|
||||
%neg.y = fneg <2 x float> %y
|
||||
|
|
|
@ -187,7 +187,7 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
|
|||
|
||||
; GCN-LABEL: {{^}}fdiv_test_denormals
|
||||
; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX1030: v_fmac_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
|
||||
; GFX1030: v_fma_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
|
||||
define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
|
||||
bb:
|
||||
%tmp = load i8, i8 addrspace(1)* null, align 1
|
||||
|
|
Loading…
Reference in New Issue