From b6e2796114d08aadfabe8c889b5d96e6bc4f5e0e Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 27 Feb 2020 20:53:17 -0800
Subject: [PATCH] [X86][TwoAddressInstructionPass] Teach tryInstructionCommute
 to continue checking for commutable FMA operands in more cases.

Previously we would only check for another commutable operand if the first
commute was an aggressive commute. But if we have two kill operands and
neither is tied to the def at the start, we should consider both operands
as candidates for the new def.

This improves the loop in the fma-commute-loop.ll test. The test is derived
from a post on Discourse:
https://llvm.discourse.group/t/unnecessary-vmovapd-instructions-generated-can-you-hint-in-favor-of-vfmadd231pd/582

Differential Revision: https://reviews.llvm.org/D75016
---
 .../lib/CodeGen/TwoAddressInstructionPass.cpp | 25 ++++---
 llvm/test/CodeGen/X86/fma-commute-loop.ll     | 24 ++++---
 llvm/test/CodeGen/X86/recip-fastmath.ll       | 66 +++++++++----------
 llvm/test/CodeGen/X86/recip-fastmath2.ll      | 32 ++++-----
 llvm/test/CodeGen/X86/sqrt-fastmath.ll        | 12 ++--
 5 files changed, 76 insertions(+), 83 deletions(-)

diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 2b1ffab74b6f..336077f297d2 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1238,21 +1238,18 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
                                Dist)) {
       MadeChange = true;
       ++NumCommuted;
-      if (AggressiveCommute) {
+      if (AggressiveCommute)
         ++NumAggrCommuted;
-        // There might be more than two commutable operands, update BaseOp and
-        // continue scanning.
-        // FIXME: This assumes that the new instruction's operands are in the
-        // same positions and were simply swapped.
-        BaseOpReg = OtherOpReg;
-        BaseOpKilled = OtherOpKilled;
-        // Resamples OpsNum in case the number of operands was reduced. This
-        // happens with X86.
-        OpsNum = MI->getDesc().getNumOperands();
-        continue;
-      }
-      // If this was a commute based on kill, we won't do better continuing.
-      return MadeChange;
+
+      // There might be more than two commutable operands, update BaseOp and
+      // continue scanning.
+      // FIXME: This assumes that the new instruction's operands are in the
+      // same positions and were simply swapped.
+      BaseOpReg = OtherOpReg;
+      BaseOpKilled = OtherOpKilled;
+      // Resamples OpsNum in case the number of operands was reduced. This
+      // happens with X86.
+      OpsNum = MI->getDesc().getNumOperands();
     }
   }
   return MadeChange;
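In isolation, the control-flow change is: a successful commute now always
updates the base operand and keeps scanning, where previously a kill-based
(non-aggressive) commute returned immediately. Below is a minimal,
self-contained sketch of that behavior; Operand and commute() are toy
stand-ins, not the pass's real interfaces:

  #include <cstdio>
  #include <utility>
  #include <vector>

  struct Operand {
    int Reg;
    bool Killed;
  };

  // Toy stand-in for TargetInstrInfo::commuteInstruction().
  static bool commute(std::vector<Operand> &Ops, unsigned A, unsigned B) {
    std::swap(Ops[A], Ops[B]);
    return true;
  }

  static bool tryCommuteAll(std::vector<Operand> &Ops, unsigned BaseIdx) {
    bool MadeChange = false;
    for (unsigned OtherIdx = BaseIdx + 1; OtherIdx < Ops.size(); ++OtherIdx) {
      if (!Ops[OtherIdx].Killed)
        continue;
      if (commute(Ops, BaseIdx, OtherIdx)) {
        MadeChange = true;
        // Previously a kill-based commute returned here; now we keep
        // scanning so a later commutable operand can still become the def.
      }
    }
    return MadeChange;
  }

  int main() {
    std::vector<Operand> Ops = {{1, false}, {2, true}, {3, true}};
    std::printf("changed: %d\n", tryCommuteAll(Ops, 0));
  }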
diff --git a/llvm/test/CodeGen/X86/fma-commute-loop.ll b/llvm/test/CodeGen/X86/fma-commute-loop.ll
index f96e9c12dba3..6b0bceb88f47 100644
--- a/llvm/test/CodeGen/X86/fma-commute-loop.ll
+++ b/llvm/test/CodeGen/X86/fma-commute-loop.ll
@@ -25,25 +25,23 @@ define void @eggs(<8 x double>* %arg, <8 x double>* %arg1, <8 x double>* %arg2,
 ; CHECK-NEXT: addq {{[0-9]+}}(%rsp), %r12
 ; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vxorpd %xmm5, %xmm5, %xmm5
-; CHECK-NEXT: vxorpd %xmm3, %xmm3, %xmm3
 ; CHECK-NEXT: .p2align 4, 0x90
 ; CHECK-NEXT: LBB0_1: ## %bb15
 ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmovapd %zmm5, %zmm6
-; CHECK-NEXT: vmovapd %zmm4, %zmm7
-; CHECK-NEXT: vmovupd (%rax,%r11,8), %zmm4
-; CHECK-NEXT: vmovupd (%rax,%r13,8), %zmm5
+; CHECK-NEXT: vmovupd (%rax,%r11,8), %zmm6
+; CHECK-NEXT: vmovupd (%rax,%r13,8), %zmm7
 ; CHECK-NEXT: vmovupd (%rax,%r12,8), %zmm8
 ; CHECK-NEXT: vbroadcastsd (%r15,%rbx,8), %zmm9
-; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm4 * zmm9) + zmm0
-; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm5 * zmm9) + zmm1
+; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm0 = (zmm6 * zmm9) + zmm0
+; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm1 = (zmm7 * zmm9) + zmm1
 ; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm2 = (zmm8 * zmm9) + zmm2
 ; CHECK-NEXT: vbroadcastsd (%r14,%rbx,8), %zmm9
-; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm4 = (zmm9 * zmm4) + zmm7
-; CHECK-NEXT: vfmadd213pd {{.*#+}} zmm5 = (zmm9 * zmm5) + zmm6
-; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm3 = (zmm8 * zmm9) + zmm3
+; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm3 = (zmm9 * zmm6) + zmm3
+; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm4 = (zmm9 * zmm7) + zmm4
+; CHECK-NEXT: vfmadd231pd {{.*#+}} zmm5 = (zmm8 * zmm9) + zmm5
 ; CHECK-NEXT: incq %rbx
 ; CHECK-NEXT: cmpq %rbx, %r10
 ; CHECK-NEXT: jne LBB0_1
@@ -51,9 +49,9 @@ define void @eggs(<8 x double>* %arg, <8 x double>* %arg1, <8 x double>* %arg2,
 ; CHECK-NEXT: vmovapd %zmm0, (%rdi)
 ; CHECK-NEXT: vmovapd %zmm1, (%rsi)
 ; CHECK-NEXT: vmovapd %zmm2, (%rdx)
-; CHECK-NEXT: vmovapd %zmm4, (%rcx)
-; CHECK-NEXT: vmovapd %zmm5, (%r8)
-; CHECK-NEXT: vmovapd %zmm3, (%r9)
+; CHECK-NEXT: vmovapd %zmm3, (%rcx)
+; CHECK-NEXT: vmovapd %zmm4, (%r8)
+; CHECK-NEXT: vmovapd %zmm5, (%r9)
 ; CHECK-NEXT: popq %rbx
 ; CHECK-NEXT: popq %r12
 ; CHECK-NEXT: popq %r13
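The kernel behind this test has the shape of several zmm accumulators
updated by FMAs inside a loop. A hypothetical C++ intrinsics sketch of that
pattern (requires AVX-512F; names and layout are illustrative, not taken
from the original Discourse post):

  #include <immintrin.h>

  void fma_loop(const double *a, const double *b, double *out, long n) {
    __m512d acc0 = _mm512_setzero_pd();
    __m512d acc1 = _mm512_setzero_pd();
    for (long i = 0; i < n; i += 8) {
      __m512d va = _mm512_loadu_pd(a + i);
      __m512d vb = _mm512_loadu_pd(b + i);
      // Each accumulator is both an input and the output of its FMA. When
      // the vfmadd231 form is selected, the accumulator register is the
      // tied def, so no vmovapd is needed to preserve it across iterations.
      acc0 = _mm512_fmadd_pd(va, vb, acc0);
      acc1 = _mm512_fmadd_pd(va, va, acc1);
    }
    _mm512_storeu_pd(out, acc0);
    _mm512_storeu_pd(out + 8, acc1);
  }

With the patch, every accumulator update above can commute to vfmadd231pd,
which is why the vmovapd copies disappear from the loop body in the checks.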
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index b1b9c1b735c4..99ce5eba08ff 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -144,9 +144,8 @@ define float @f32_one_step_variables(float %x, float %y) #1 {
 ; FMA-RECIP: # %bb.0:
 ; FMA-RECIP-NEXT: vrcpss %xmm1, %xmm1, %xmm2
 ; FMA-RECIP-NEXT: vmulss %xmm2, %xmm0, %xmm3
-; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0
+; FMA-RECIP-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; FMA-RECIP-NEXT: retq
 ;
 ; BDVER2-LABEL: f32_one_step_variables:
@@ -181,9 +180,8 @@ define float @f32_one_step_variables(float %x, float %y) #1 {
 ; HASWELL: # %bb.0:
 ; HASWELL-NEXT: vrcpss %xmm1, %xmm1, %xmm2
 ; HASWELL-NEXT: vmulss %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; HASWELL-NEXT: vmovaps %xmm2, %xmm0
+; HASWELL-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; HASWELL-NEXT: retq
 ;
 ; HASWELL-NO-FMA-LABEL: f32_one_step_variables:
@@ -200,9 +198,8 @@ define float @f32_one_step_variables(float %x, float %y) #1 {
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vrcpss %xmm1, %xmm1, %xmm2
 ; AVX512-NEXT: vmulss %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; AVX512-NEXT: vmovaps %xmm2, %xmm0
+; AVX512-NEXT: vfmsub231ss {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; AVX512-NEXT: retq
   %div = fdiv fast float %x, %y
   ret float %div
@@ -445,10 +442,11 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ;
 ; HASWELL-LABEL: v4f32_one_step:
 ; HASWELL: # %bb.0:
-; HASWELL-NEXT: vrcpps %xmm0, %xmm1
-; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
+; HASWELL-NEXT: vrcpps %xmm0, %xmm2
+; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1
+; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2
+; HASWELL-NEXT: vmovaps %xmm1, %xmm0
 ; HASWELL-NEXT: retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step:
@@ -463,10 +461,11 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ;
 ; KNL-LABEL: v4f32_one_step:
 ; KNL: # %bb.0:
-; KNL-NEXT: vrcpps %xmm0, %xmm1
-; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
+; KNL-NEXT: vrcpps %xmm0, %xmm2
+; KNL-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT: vfmsub231ps {{.*#+}} xmm1 = (xmm2 * xmm0) - xmm1
+; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm2
+; KNL-NEXT: vmovaps %xmm1, %xmm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: v4f32_one_step:
@@ -505,9 +504,8 @@ define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1
 ; FMA-RECIP: # %bb.0:
 ; FMA-RECIP-NEXT: vrcpps %xmm1, %xmm2
 ; FMA-RECIP-NEXT: vmulps %xmm2, %xmm0, %xmm3
-; FMA-RECIP-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; FMA-RECIP-NEXT: vmovaps %xmm2, %xmm0
+; FMA-RECIP-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; FMA-RECIP-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; FMA-RECIP-NEXT: retq
 ;
 ; BDVER2-LABEL: v4f32_one_step_variables:
@@ -542,9 +540,8 @@ define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1
 ; HASWELL: # %bb.0:
 ; HASWELL-NEXT: vrcpps %xmm1, %xmm2
 ; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm3
-; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; HASWELL-NEXT: vmovaps %xmm2, %xmm0
+; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; HASWELL-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; HASWELL-NEXT: retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_variables:
@@ -561,9 +558,8 @@ define <4 x float> @v4f32_one_step_variables(<4 x float> %x, <4 x float> %y) #1
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vrcpps %xmm1, %xmm2
 ; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm3
-; AVX512-NEXT: vfmsub213ps {{.*#+}} xmm1 = (xmm3 * xmm1) - xmm0
-; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm2 = -(xmm1 * xmm2) + xmm3
-; AVX512-NEXT: vmovaps %xmm2, %xmm0
+; AVX512-NEXT: vfmsub231ps {{.*#+}} xmm0 = (xmm3 * xmm1) - xmm0
+; AVX512-NEXT: vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
 ; AVX512-NEXT: retq
   %div = fdiv fast <4 x float> %x, %y
   ret <4 x float> %div
@@ -816,10 +812,11 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ;
 ; HASWELL-LABEL: v8f32_one_step:
 ; HASWELL: # %bb.0:
-; HASWELL-NEXT: vrcpps %ymm0, %ymm1
-; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
+; HASWELL-NEXT: vrcpps %ymm0, %ymm2
+; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1
+; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
+; HASWELL-NEXT: vmovaps %ymm1, %ymm0
 ; HASWELL-NEXT: retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step:
@@ -834,10 +831,11 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ;
 ; KNL-LABEL: v8f32_one_step:
 ; KNL: # %bb.0:
-; KNL-NEXT: vrcpps %ymm0, %ymm1
-; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
+; KNL-NEXT: vrcpps %ymm0, %ymm2
+; KNL-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; KNL-NEXT: vfmsub231ps {{.*#+}} ymm1 = (ymm2 * ymm0) - ymm1
+; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
+; KNL-NEXT: vmovaps %ymm1, %ymm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: v8f32_one_step:
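These checks encode one Newton-Raphson refinement step of the hardware
reciprocal estimate; the patch only changes which register carries each
intermediate so the result lands in xmm0/ymm0 without a trailing vmovaps.
In scalar form the computation is roughly the following sketch, where
1.0f/y stands in for the coarse (~12-bit) vrcpss estimate:

  #include <cstdio>

  float div_one_step(float x, float y) {
    float r = 1.0f / y;    // stand-in for the vrcpss estimate
    float q = x * r;       // vmulss: first-cut quotient
    float err = q * y - x; // vfmsub231ss: residual of the quotient
    return q - err * r;    // vfnmadd213ss: refined quotient, ~x/y
  }

  int main() {
    std::printf("%f\n", div_one_step(1.0f, 3.0f)); // prints ~0.333333
  }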
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index c2bd531049f6..6e67e6eb452f 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -530,10 +530,10 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; HASWELL: # %bb.0:
 ; HASWELL-NEXT: vrcpps %xmm0, %xmm1
 ; HASWELL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1
-; HASWELL-NEXT: vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
+; HASWELL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm0
+; HASWELL-NEXT: vmulps %xmm2, %xmm0, %xmm0
 ; HASWELL-NEXT: retq
 ;
 ; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
@@ -552,10 +552,10 @@ define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
 ; KNL: # %bb.0:
 ; KNL-NEXT: vrcpps %xmm0, %xmm1
 ; KNL-NEXT: vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
-; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
-; KNL-NEXT: vmulps {{.*}}(%rip), %xmm0, %xmm1
-; KNL-NEXT: vmulps %xmm0, %xmm1, %xmm0
+; KNL-NEXT: vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
+; KNL-NEXT: vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
+; KNL-NEXT: vmulps {{.*}}(%rip), %xmm2, %xmm0
+; KNL-NEXT: vmulps %xmm2, %xmm0, %xmm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: v4f32_one_step_2_divs:
@@ -892,10 +892,10 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; HASWELL: # %bb.0:
 ; HASWELL-NEXT: vrcpps %ymm0, %ymm1
 ; HASWELL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; HASWELL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
-; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1
-; HASWELL-NEXT: vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2
+; HASWELL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
+; HASWELL-NEXT: vmulps {{.*}}(%rip), %ymm2, %ymm0
+; HASWELL-NEXT: vmulps %ymm2, %ymm0, %ymm0
 ; HASWELL-NEXT: retq
 ;
 ; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
@@ -914,10 +914,10 @@ define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
 ; KNL: # %bb.0:
 ; KNL-NEXT: vrcpps %ymm0, %ymm1
 ; KNL-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
-; KNL-NEXT: vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
-; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
-; KNL-NEXT: vmulps {{.*}}(%rip), %ymm0, %ymm1
-; KNL-NEXT: vmulps %ymm0, %ymm1, %ymm0
+; KNL-NEXT: vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2
+; KNL-NEXT: vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
+; KNL-NEXT: vmulps {{.*}}(%rip), %ymm2, %ymm0
+; KNL-NEXT: vmulps %ymm2, %ymm0, %ymm0
 ; KNL-NEXT: retq
 ;
 ; SKX-LABEL: v8f32_one_step_2_divs:
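In the one_step_2_divs functions the refined reciprocal feeds two
multiplies, so keeping it in its original register lets both products form
directly in xmm0/ymm0 with no copy in between. A scalar sketch of the
pattern these tests exercise, assuming the divide-then-divide shape of the
IR:

  // c / x / x computed with a single refined reciprocal of x.
  float two_divs(float c, float x) {
    float r = 1.0f / x;  // stand-in for the refined vrcpps result
    return (c * r) * r;  // both fdivs become multiplies by r
  }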
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index 37e6b6954dc2..3986c8f863d7 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -328,10 +328,10 @@ define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
 ; AVX512-NEXT: vrsqrtps %xmm0, %xmm1
 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; AVX512-NEXT: vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
-; AVX512-NEXT: vbroadcastss {{.*#+}} xmm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT: vmulps %xmm2, %xmm1, %xmm1
+; AVX512-NEXT: vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX512-NEXT: vmulps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT: vmulps %xmm2, %xmm0, %xmm0
 ; AVX512-NEXT: retq
   %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
   %div = fdiv fast <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %sqrt
@@ -401,10 +401,10 @@ define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
 ; AVX512-NEXT: vrsqrtps %ymm0, %ymm1
 ; AVX512-NEXT: vmulps %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
-; AVX512-NEXT: vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
-; AVX512-NEXT: vbroadcastss {{.*#+}} ymm2 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; AVX512-NEXT: vmulps %ymm2, %ymm1, %ymm1
+; AVX512-NEXT: vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2
+; AVX512-NEXT: vbroadcastss {{.*#+}} ymm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
 ; AVX512-NEXT: vmulps %ymm0, %ymm1, %ymm0
+; AVX512-NEXT: vmulps %ymm2, %ymm0, %ymm0
 ; AVX512-NEXT: retq
   %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
   %div = fdiv fast <8 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, %sqrt
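The sqrt tests follow the same theme with the reciprocal square root: one
Newton-Raphson step, r' = 0.5 * r * (3 - x*r*r), written in terms of the
-3.0 and -0.5 splats visible in the checks. A scalar sketch of the
sequence, where 1/sqrt(x) stands in for the coarse vrsqrtps estimate:

  #include <cmath>

  float rsqrt_one_step(float x) {
    float r = 1.0f / std::sqrt(x); // stand-in for the vrsqrtps estimate
    float s = x * r;               // vmulps
    float t = r * s - 3.0f;        // vfmadd231ps with the -3.0 splat
    return (r * -0.5f) * t;        // the two vmulps with the -0.5 splat
  }

As in the reciprocal tests, commuting to vfmadd231ps keeps the -3.0 splat
register as the tied def, which frees xmm0/ymm0 early enough that the final
vmulps chain needs no register copy.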