From c5f0f5309e3d849a76d733ae35f58565d1c4eb65 Mon Sep 17 00:00:00 2001
From: Andrea Di Biagio <Andrea_DiBiagio@sn.scee.net>
Date: Wed, 16 Jan 2019 18:18:01 +0000
Subject: [PATCH] [X86][BtVer2] Update latency of horizontal operations.

On Jaguar, horizontal adds/subs have local forwarding disabled.
That means we pay a compulsory extra cycle in the write-back stage, and the value
is not available until the end of that stage.

This patch changes the latency of horizontal operations by adding an extra
cycle. With this patch, latency numbers now match what is reported by perf.

I plan to send another patch to also 'fix' the latency of shuffle operations (on
Jaguar, local forwarding is disabled for vector shuffles too).

Differential Revision: https://reviews.llvm.org/D56777

llvm-svn: 351366
---
 llvm/lib/Target/X86/X86ScheduleBtVer2.td      | 10 ++--
 llvm/test/CodeGen/X86/avx-schedule.ll         | 16 +++---
 llvm/test/CodeGen/X86/mmx-schedule.ll         | 24 ++++----
 llvm/test/CodeGen/X86/sse3-schedule.ll        | 32 +++++------
 llvm/test/CodeGen/X86/ssse3-schedule.ll       | 48 ++++++++--------
 .../tools/llvm-mca/X86/BtVer2/dot-product.s   | 32 +++++------
 .../X86/BtVer2/hadd-read-after-ld-1.s         | 14 ++---
 .../X86/BtVer2/hadd-read-after-ld-2.s         | 14 ++---
 .../X86/BtVer2/instruction-info-view.s        | 10 ++--
 .../llvm-mca/X86/BtVer2/resources-avx1.s      | 56 +++++++++----------
 .../llvm-mca/X86/BtVer2/resources-sse3.s      | 16 +++---
 .../llvm-mca/X86/BtVer2/resources-ssse3.s     | 48 ++++++++--------
 12 files changed, 161 insertions(+), 159 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index 33a6b01546d7..adb69cc44083 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -174,6 +174,8 @@ multiclass JWriteResYMMPair<X86FoldableSchedWrite SchedRW,
   }
 }
 
+// Instructions that have local forwarding disabled have an extra +1cy latency.
+
 // A folded store needs a cycle on the SAGU for the store data,
 // most RMW instructions don't need an extra uop.
 defm : X86WriteRes<WriteRMW, [JSAGU], 1, [1], 0>;
@@ -575,10 +577,10 @@ defm : JWriteResFpuPair<WriteAESDecEnc,   [JFPU01, JVALU, JFPU0, JVIMUL], 3, [1,
 // Horizontal add/sub  instructions.
 ////////////////////////////////////////////////////////////////////////////////
 
-defm : JWriteResFpuPair<WriteFHAdd,         [JFPU0, JFPA], 3>;
-defm : JWriteResYMMPair<WriteFHAddY,        [JFPU0, JFPA], 3, [2,2], 2>;
-defm : JWriteResFpuPair<WritePHAdd,       [JFPU01, JVALU], 1>;
-defm : JWriteResFpuPair<WritePHAddX,      [JFPU01, JVALU], 1>;
+defm : JWriteResFpuPair<WriteFHAdd,         [JFPU0, JFPA], 4>;            // +1cy latency.
+defm : JWriteResYMMPair<WriteFHAddY,        [JFPU0, JFPA], 4, [2,2], 2>;  // +1cy latency.
+defm : JWriteResFpuPair<WritePHAdd,         [JFPU01, JVALU], 2>;          // +1cy latency.
+defm : JWriteResFpuPair<WritePHAddX,        [JFPU01, JVALU], 2>;          // +1cy latency.
 defm : X86WriteResPairUnsupported<WritePHAddY>;
 
 ////////////////////////////////////////////////////////////////////////////////
diff --git a/llvm/test/CodeGen/X86/avx-schedule.ll b/llvm/test/CodeGen/X86/avx-schedule.ll
index e1db9bab294c..6818ea06732e 100644
--- a/llvm/test/CodeGen/X86/avx-schedule.ll
+++ b/llvm/test/CodeGen/X86/avx-schedule.ll
@@ -1951,8 +1951,8 @@ define <4 x double> @test_haddpd(<4 x double> %a0, <4 x double> %a1, <4 x double
 ;
 ; BTVER2-LABEL: test_haddpd:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vhaddpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
+; BTVER2-NEXT:    vhaddpd %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
+; BTVER2-NEXT:    vhaddpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_haddpd:
@@ -2012,8 +2012,8 @@ define <8 x float> @test_haddps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
 ;
 ; BTVER2-LABEL: test_haddps:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
+; BTVER2-NEXT:    vhaddps %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
+; BTVER2-NEXT:    vhaddps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_haddps:
@@ -2073,8 +2073,8 @@ define <4 x double> @test_hsubpd(<4 x double> %a0, <4 x double> %a1, <4 x double
 ;
 ; BTVER2-LABEL: test_hsubpd:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
+; BTVER2-NEXT:    vhsubpd %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
+; BTVER2-NEXT:    vhsubpd (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_hsubpd:
@@ -2134,8 +2134,8 @@ define <8 x float> @test_hsubps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%
 ;
 ; BTVER2-LABEL: test_hsubps:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vhsubps %ymm1, %ymm0, %ymm0 # sched: [3:2.00]
-; BTVER2-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [8:2.00]
+; BTVER2-NEXT:    vhsubps %ymm1, %ymm0, %ymm0 # sched: [4:2.00]
+; BTVER2-NEXT:    vhsubps (%rdi), %ymm0, %ymm0 # sched: [9:2.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-LABEL: test_hsubps:
diff --git a/llvm/test/CodeGen/X86/mmx-schedule.ll b/llvm/test/CodeGen/X86/mmx-schedule.ll
index 51dc5e102ff1..513332f61f11 100644
--- a/llvm/test/CodeGen/X86/mmx-schedule.ll
+++ b/llvm/test/CodeGen/X86/mmx-schedule.ll
@@ -3368,8 +3368,8 @@ define i64 @test_phaddd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BTVER2-LABEL: test_phaddd:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    phaddd %mm1, %mm0 # sched: [1:0.50]
-; BTVER2-NEXT:    phaddd (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT:    phaddd %mm1, %mm0 # sched: [2:0.50]
+; BTVER2-NEXT:    phaddd (%rdi), %mm0 # sched: [7:1.00]
 ; BTVER2-NEXT:    movq %mm0, %rax # sched: [4:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
@@ -3453,8 +3453,8 @@ define i64 @test_phaddsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BTVER2-LABEL: test_phaddsw:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    phaddsw %mm1, %mm0 # sched: [1:0.50]
-; BTVER2-NEXT:    phaddsw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT:    phaddsw %mm1, %mm0 # sched: [2:0.50]
+; BTVER2-NEXT:    phaddsw (%rdi), %mm0 # sched: [7:1.00]
 ; BTVER2-NEXT:    movq %mm0, %rax # sched: [4:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
@@ -3538,8 +3538,8 @@ define i64 @test_phaddw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BTVER2-LABEL: test_phaddw:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    phaddw %mm1, %mm0 # sched: [1:0.50]
-; BTVER2-NEXT:    phaddw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT:    phaddw %mm1, %mm0 # sched: [2:0.50]
+; BTVER2-NEXT:    phaddw (%rdi), %mm0 # sched: [7:1.00]
 ; BTVER2-NEXT:    movq %mm0, %rax # sched: [4:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
@@ -3623,8 +3623,8 @@ define i64 @test_phsubd(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BTVER2-LABEL: test_phsubd:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    phsubd %mm1, %mm0 # sched: [1:0.50]
-; BTVER2-NEXT:    phsubd (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT:    phsubd %mm1, %mm0 # sched: [2:0.50]
+; BTVER2-NEXT:    phsubd (%rdi), %mm0 # sched: [7:1.00]
 ; BTVER2-NEXT:    movq %mm0, %rax # sched: [4:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
@@ -3708,8 +3708,8 @@ define i64 @test_phsubsw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BTVER2-LABEL: test_phsubsw:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    phsubsw %mm1, %mm0 # sched: [1:0.50]
-; BTVER2-NEXT:    phsubsw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT:    phsubsw %mm1, %mm0 # sched: [2:0.50]
+; BTVER2-NEXT:    phsubsw (%rdi), %mm0 # sched: [7:1.00]
 ; BTVER2-NEXT:    movq %mm0, %rax # sched: [4:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
@@ -3793,8 +3793,8 @@ define i64 @test_phsubw(x86_mmx %a0, x86_mmx %a1, x86_mmx* %a2) optsize {
 ;
 ; BTVER2-LABEL: test_phsubw:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    phsubw %mm1, %mm0 # sched: [1:0.50]
-; BTVER2-NEXT:    phsubw (%rdi), %mm0 # sched: [6:1.00]
+; BTVER2-NEXT:    phsubw %mm1, %mm0 # sched: [2:0.50]
+; BTVER2-NEXT:    phsubw (%rdi), %mm0 # sched: [7:1.00]
 ; BTVER2-NEXT:    movq %mm0, %rax # sched: [4:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
diff --git a/llvm/test/CodeGen/X86/sse3-schedule.ll b/llvm/test/CodeGen/X86/sse3-schedule.ll
index 1c3419a35ff3..c80d0e446599 100644
--- a/llvm/test/CodeGen/X86/sse3-schedule.ll
+++ b/llvm/test/CodeGen/X86/sse3-schedule.ll
@@ -356,14 +356,14 @@ define <2 x double> @test_haddpd(<2 x double> %a0, <2 x double> %a1, <2 x double
 ;
 ; BTVER2-SSE-LABEL: test_haddpd:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    haddpd %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-SSE-NEXT:    haddpd (%rdi), %xmm0 # sched: [8:1.00]
+; BTVER2-SSE-NEXT:    haddpd %xmm1, %xmm0 # sched: [4:1.00]
+; BTVER2-SSE-NEXT:    haddpd (%rdi), %xmm0 # sched: [9:1.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_haddpd:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vhaddpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BTVER2-NEXT:    vhaddpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_haddpd:
@@ -477,14 +477,14 @@ define <4 x float> @test_haddps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
 ;
 ; BTVER2-SSE-LABEL: test_haddps:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    haddps %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-SSE-NEXT:    haddps (%rdi), %xmm0 # sched: [8:1.00]
+; BTVER2-SSE-NEXT:    haddps %xmm1, %xmm0 # sched: [4:1.00]
+; BTVER2-SSE-NEXT:    haddps (%rdi), %xmm0 # sched: [9:1.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_haddps:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vhaddps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT:    vhaddps %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BTVER2-NEXT:    vhaddps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_haddps:
@@ -598,14 +598,14 @@ define <2 x double> @test_hsubpd(<2 x double> %a0, <2 x double> %a1, <2 x double
 ;
 ; BTVER2-SSE-LABEL: test_hsubpd:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    hsubpd %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-SSE-NEXT:    hsubpd (%rdi), %xmm0 # sched: [8:1.00]
+; BTVER2-SSE-NEXT:    hsubpd %xmm1, %xmm0 # sched: [4:1.00]
+; BTVER2-SSE-NEXT:    hsubpd (%rdi), %xmm0 # sched: [9:1.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_hsubpd:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vhsubpd (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BTVER2-NEXT:    vhsubpd (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_hsubpd:
@@ -719,14 +719,14 @@ define <4 x float> @test_hsubps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%
 ;
 ; BTVER2-SSE-LABEL: test_hsubps:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    hsubps %xmm1, %xmm0 # sched: [3:1.00]
-; BTVER2-SSE-NEXT:    hsubps (%rdi), %xmm0 # sched: [8:1.00]
+; BTVER2-SSE-NEXT:    hsubps %xmm1, %xmm0 # sched: [4:1.00]
+; BTVER2-SSE-NEXT:    hsubps (%rdi), %xmm0 # sched: [9:1.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_hsubps:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0 # sched: [3:1.00]
-; BTVER2-NEXT:    vhsubps (%rdi), %xmm0, %xmm0 # sched: [8:1.00]
+; BTVER2-NEXT:    vhsubps %xmm1, %xmm0, %xmm0 # sched: [4:1.00]
+; BTVER2-NEXT:    vhsubps (%rdi), %xmm0, %xmm0 # sched: [9:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_hsubps:
diff --git a/llvm/test/CodeGen/X86/ssse3-schedule.ll b/llvm/test/CodeGen/X86/ssse3-schedule.ll
index 5c8bd2dc843e..5a871e273609 100644
--- a/llvm/test/CodeGen/X86/ssse3-schedule.ll
+++ b/llvm/test/CodeGen/X86/ssse3-schedule.ll
@@ -676,14 +676,14 @@ define <4 x i32> @test_phaddd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BTVER2-SSE-LABEL: test_phaddd:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    phaddd %xmm1, %xmm0 # sched: [1:0.50]
-; BTVER2-SSE-NEXT:    phaddd (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-SSE-NEXT:    phaddd %xmm1, %xmm0 # sched: [2:0.50]
+; BTVER2-SSE-NEXT:    phaddd (%rdi), %xmm0 # sched: [7:1.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_phaddd:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT:    vphaddd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT:    vphaddd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BTVER2-NEXT:    vphaddd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_phaddd:
@@ -797,14 +797,14 @@ define <8 x i16> @test_phaddsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BTVER2-SSE-LABEL: test_phaddsw:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    phaddsw %xmm1, %xmm0 # sched: [1:0.50]
-; BTVER2-SSE-NEXT:    phaddsw (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-SSE-NEXT:    phaddsw %xmm1, %xmm0 # sched: [2:0.50]
+; BTVER2-SSE-NEXT:    phaddsw (%rdi), %xmm0 # sched: [7:1.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_phaddsw:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT:    vphaddsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BTVER2-NEXT:    vphaddsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_phaddsw:
@@ -918,14 +918,14 @@ define <8 x i16> @test_phaddw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BTVER2-SSE-LABEL: test_phaddw:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    phaddw %xmm1, %xmm0 # sched: [1:0.50]
-; BTVER2-SSE-NEXT:    phaddw (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-SSE-NEXT:    phaddw %xmm1, %xmm0 # sched: [2:0.50]
+; BTVER2-SSE-NEXT:    phaddw (%rdi), %xmm0 # sched: [7:1.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_phaddw:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT:    vphaddw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT:    vphaddw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BTVER2-NEXT:    vphaddw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_phaddw:
@@ -1039,14 +1039,14 @@ define <4 x i32> @test_phsubd(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> *%a2) {
 ;
 ; BTVER2-SSE-LABEL: test_phsubd:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    phsubd %xmm1, %xmm0 # sched: [1:0.50]
-; BTVER2-SSE-NEXT:    phsubd (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-SSE-NEXT:    phsubd %xmm1, %xmm0 # sched: [2:0.50]
+; BTVER2-SSE-NEXT:    phsubd (%rdi), %xmm0 # sched: [7:1.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_phsubd:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT:    vphsubd (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT:    vphsubd %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BTVER2-NEXT:    vphsubd (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_phsubd:
@@ -1160,14 +1160,14 @@ define <8 x i16> @test_phsubsw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BTVER2-SSE-LABEL: test_phsubsw:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    phsubsw %xmm1, %xmm0 # sched: [1:0.50]
-; BTVER2-SSE-NEXT:    phsubsw (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-SSE-NEXT:    phsubsw %xmm1, %xmm0 # sched: [2:0.50]
+; BTVER2-SSE-NEXT:    phsubsw (%rdi), %xmm0 # sched: [7:1.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_phsubsw:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT:    vphsubsw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BTVER2-NEXT:    vphsubsw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_phsubsw:
@@ -1281,14 +1281,14 @@ define <8 x i16> @test_phsubw(<8 x i16> %a0, <8 x i16> %a1, <8 x i16> *%a2) {
 ;
 ; BTVER2-SSE-LABEL: test_phsubw:
 ; BTVER2-SSE:       # %bb.0:
-; BTVER2-SSE-NEXT:    phsubw %xmm1, %xmm0 # sched: [1:0.50]
-; BTVER2-SSE-NEXT:    phsubw (%rdi), %xmm0 # sched: [6:1.00]
+; BTVER2-SSE-NEXT:    phsubw %xmm1, %xmm0 # sched: [2:0.50]
+; BTVER2-SSE-NEXT:    phsubw (%rdi), %xmm0 # sched: [7:1.00]
 ; BTVER2-SSE-NEXT:    retq # sched: [4:1.00]
 ;
 ; BTVER2-LABEL: test_phsubw:
 ; BTVER2:       # %bb.0:
-; BTVER2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0 # sched: [1:0.50]
-; BTVER2-NEXT:    vphsubw (%rdi), %xmm0, %xmm0 # sched: [6:1.00]
+; BTVER2-NEXT:    vphsubw %xmm1, %xmm0, %xmm0 # sched: [2:0.50]
+; BTVER2-NEXT:    vphsubw (%rdi), %xmm0, %xmm0 # sched: [7:1.00]
 ; BTVER2-NEXT:    retq # sched: [4:1.00]
 ;
 ; ZNVER1-SSE-LABEL: test_phsubw:
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/dot-product.s b/llvm/test/tools/llvm-mca/X86/BtVer2/dot-product.s
index 643e456450cd..a43b8285a518 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/dot-product.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/dot-product.s
@@ -7,12 +7,12 @@ vhaddps  %xmm3, %xmm3, %xmm4
 
 # CHECK:      Iterations:        300
 # CHECK-NEXT: Instructions:      900
-# CHECK-NEXT: Total Cycles:      610
+# CHECK-NEXT: Total Cycles:      611
 # CHECK-NEXT: Total uOps:        900
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    1.48
-# CHECK-NEXT: IPC:               1.48
+# CHECK-NEXT: uOps Per Cycle:    1.47
+# CHECK-NEXT: IPC:               1.47
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
@@ -25,8 +25,8 @@ vhaddps  %xmm3, %xmm3, %xmm4
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT:  1      2     1.00                        vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      3     1.00                        vhaddps	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT:  1      3     1.00                        vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT:  1      4     1.00                        vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT:  1      4     1.00                        vhaddps	%xmm3, %xmm3, %xmm4
 
 # CHECK:      Resources:
 # CHECK-NEXT: [0]   - JALU0
@@ -59,14 +59,14 @@ vhaddps  %xmm3, %xmm3, %xmm4
 # CHECK-NEXT: Index     0123456789
 
 # CHECK:      [0,0]     DeeER.    .    .   vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [0,1]     D==eeeER  .    .   vhaddps	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [0,2]     .D====eeeER    .   vhaddps	%xmm3, %xmm3, %xmm4
-# CHECK-NEXT: [1,0]     .DeeE-----R    .   vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [1,1]     . D=eeeE---R   .   vhaddps	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [1,2]     . D====eeeER   .   vhaddps	%xmm3, %xmm3, %xmm4
-# CHECK-NEXT: [2,0]     .  DeeE-----R  .   vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: [2,1]     .  D====eeeER  .   vhaddps	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: [2,2]     .   D======eeeER   vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [0,1]     D==eeeeER .    .   vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [0,2]     .D=====eeeeER  .   vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [1,0]     .DeeE-------R  .   vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [1,1]     . D=eeeeE----R .   vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [1,2]     . D=====eeeeER .   vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: [2,0]     .  DeeE-------R.   vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: [2,1]     .  D==eeeeE---R.   vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: [2,2]     .   D=====eeeeER   vhaddps	%xmm3, %xmm3, %xmm4
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
@@ -75,6 +75,6 @@ vhaddps  %xmm3, %xmm3, %xmm4
 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
 
 # CHECK:            [0]    [1]    [2]    [3]
-# CHECK-NEXT: 0.     3     1.0    1.0    3.3       vmulps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT: 1.     3     3.3    0.7    1.0       vhaddps	%xmm2, %xmm2, %xmm3
-# CHECK-NEXT: 2.     3     5.7    0.0    0.0       vhaddps	%xmm3, %xmm3, %xmm4
+# CHECK-NEXT: 0.     3     1.0    1.0    4.7       vmulps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT: 1.     3     2.7    0.0    2.3       vhaddps	%xmm2, %xmm2, %xmm3
+# CHECK-NEXT: 2.     3     6.0    0.0    0.0       vhaddps	%xmm3, %xmm3, %xmm4
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/hadd-read-after-ld-1.s b/llvm/test/tools/llvm-mca/X86/BtVer2/hadd-read-after-ld-1.s
index 87862a6e5a33..197c1dce5815 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/hadd-read-after-ld-1.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/hadd-read-after-ld-1.s
@@ -6,12 +6,12 @@ vhaddps (%rdi), %xmm1, %xmm2
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      2
-# CHECK-NEXT: Total Cycles:      11
+# CHECK-NEXT: Total Cycles:      12
 # CHECK-NEXT: Total uOps:        2
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.18
-# CHECK-NEXT: IPC:               0.18
+# CHECK-NEXT: uOps Per Cycle:    0.17
+# CHECK-NEXT: IPC:               0.17
 # CHECK-NEXT: Block RThroughput: 1.0
 
 # CHECK:      Instruction Info:
@@ -24,14 +24,14 @@ vhaddps (%rdi), %xmm1, %xmm2
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT:  1      1     0.50                        vshufps	$0, %xmm0, %xmm1, %xmm1
-# CHECK-NEXT:  1      8     1.00    *                   vhaddps	(%rdi), %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vhaddps	(%rdi), %xmm1, %xmm2
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     0
+# CHECK-NEXT:                     01
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    .   vshufps	$0, %xmm0, %xmm1, %xmm1
-# CHECK-NEXT: [0,1]     DeeeeeeeeER   vhaddps	(%rdi), %xmm1, %xmm2
+# CHECK:      [0,0]     DeER .    ..   vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: [0,1]     DeeeeeeeeeER   vhaddps	(%rdi), %xmm1, %xmm2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/hadd-read-after-ld-2.s b/llvm/test/tools/llvm-mca/X86/BtVer2/hadd-read-after-ld-2.s
index 80d5109d07ee..e64ee28103f3 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/hadd-read-after-ld-2.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/hadd-read-after-ld-2.s
@@ -6,12 +6,12 @@ vhaddps (%rdi), %ymm1, %ymm2
 
 # CHECK:      Iterations:        1
 # CHECK-NEXT: Instructions:      2
-# CHECK-NEXT: Total Cycles:      12
+# CHECK-NEXT: Total Cycles:      13
 # CHECK-NEXT: Total uOps:        3
 
 # CHECK:      Dispatch Width:    2
-# CHECK-NEXT: uOps Per Cycle:    0.25
-# CHECK-NEXT: IPC:               0.17
+# CHECK-NEXT: uOps Per Cycle:    0.23
+# CHECK-NEXT: IPC:               0.15
 # CHECK-NEXT: Block RThroughput: 2.0
 
 # CHECK:      Instruction Info:
@@ -24,14 +24,14 @@ vhaddps (%rdi), %ymm1, %ymm2
 
 # CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # CHECK-NEXT:  1      1     0.50                        vshufps	$0, %xmm0, %xmm1, %xmm1
-# CHECK-NEXT:  2      8     2.00    *                   vhaddps	(%rdi), %ymm1, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vhaddps	(%rdi), %ymm1, %ymm2
 
 # CHECK:      Timeline view:
-# CHECK-NEXT:                     01
+# CHECK-NEXT:                     012
 # CHECK-NEXT: Index     0123456789
 
-# CHECK:      [0,0]     DeER .    ..   vshufps	$0, %xmm0, %xmm1, %xmm1
-# CHECK-NEXT: [0,1]     .DeeeeeeeeER   vhaddps	(%rdi), %ymm1, %ymm2
+# CHECK:      [0,0]     DeER .    . .   vshufps	$0, %xmm0, %xmm1, %xmm1
+# CHECK-NEXT: [0,1]     .DeeeeeeeeeER   vhaddps	(%rdi), %ymm1, %ymm2
 
 # CHECK:      Average Wait times (based on the timeline view):
 # CHECK-NEXT: [0]: Executions
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/instruction-info-view.s b/llvm/test/tools/llvm-mca/X86/BtVer2/instruction-info-view.s
index fa19b55c1d18..8f17c44e0ef4 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/instruction-info-view.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/instruction-info-view.s
@@ -13,13 +13,13 @@ vhaddps  %xmm3, %xmm3, %xmm4
 
 # ENABLED:       Iterations:        100
 # ENABLED-NEXT:  Instructions:      300
-# ENABLED-NEXT:  Total Cycles:      209
+# ENABLED-NEXT:  Total Cycles:      211
 # ENABLED-NEXT:  Total uOps:        300
 
 
 # ENABLED:       Dispatch Width:    2
-# ENABLED-NEXT:  uOps Per Cycle:    1.44
-# ENABLED-NEXT:  IPC:               1.44
+# ENABLED-NEXT:  uOps Per Cycle:    1.42
+# ENABLED-NEXT:  IPC:               1.42
 # ENABLED-NEXT:  Block RThroughput: 2.0
 
 # ENABLED:       Instruction Info:
@@ -32,5 +32,5 @@ vhaddps  %xmm3, %xmm3, %xmm4
 
 # ENABLED:       [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 # ENABLED-NEXT:   1      2     1.00                        vmulps	%xmm0, %xmm1, %xmm2
-# ENABLED-NEXT:   1      3     1.00                        vhaddps	%xmm2, %xmm2, %xmm3
-# ENABLED-NEXT:   1      3     1.00                        vhaddps	%xmm3, %xmm3, %xmm4
+# ENABLED-NEXT:   1      4     1.00                        vhaddps	%xmm2, %xmm2, %xmm3
+# ENABLED-NEXT:   1      4     1.00                        vhaddps	%xmm3, %xmm3, %xmm4
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s b/llvm/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s
index 3db3470e0c4f..0df1d17983e6 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/resources-avx1.s
@@ -1196,22 +1196,22 @@ vzeroupper
 # CHECK-NEXT:  1      1     1.00           *            vextractf128	$1, %ymm0, (%rax)
 # CHECK-NEXT:  1      3     1.00                        vextractps	$1, %xmm0, %ecx
 # CHECK-NEXT:  1      3     1.00           *            vextractps	$1, %xmm0, (%rax)
-# CHECK-NEXT:  1      3     1.00                        vhaddpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      8     1.00    *                   vhaddpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      3     2.00                        vhaddpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     2.00    *                   vhaddpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vhaddps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      8     1.00    *                   vhaddps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      3     2.00                        vhaddps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     2.00    *                   vhaddps	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vhsubpd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      8     1.00    *                   vhsubpd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      3     2.00                        vhsubpd	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     2.00    *                   vhsubpd	(%rax), %ymm1, %ymm2
-# CHECK-NEXT:  1      3     1.00                        vhsubps	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      8     1.00    *                   vhsubps	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  2      3     2.00                        vhsubps	%ymm0, %ymm1, %ymm2
-# CHECK-NEXT:  2      8     2.00    *                   vhsubps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      4     1.00                        vhaddpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vhaddpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      4     2.00                        vhaddpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vhaddpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      4     1.00                        vhaddps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vhaddps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      4     2.00                        vhaddps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vhaddps	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      4     1.00                        vhsubpd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vhsubpd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      4     2.00                        vhsubpd	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vhsubpd	(%rax), %ymm1, %ymm2
+# CHECK-NEXT:  1      4     1.00                        vhsubps	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   vhsubps	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  2      4     2.00                        vhsubps	%ymm0, %ymm1, %ymm2
+# CHECK-NEXT:  2      9     2.00    *                   vhsubps	(%rax), %ymm1, %ymm2
 # CHECK-NEXT:  2      1     1.00                        vinsertf128	$1, %xmm0, %ymm1, %ymm2
 # CHECK-NEXT:  2      6     1.00    *                   vinsertf128	$1, (%rax), %ymm1, %ymm2
 # CHECK-NEXT:  1      1     0.50                        vinsertps	$1, %xmm0, %xmm1, %xmm2
@@ -1455,20 +1455,20 @@ vzeroupper
 # CHECK-NEXT:  1      3     1.00           *            vpextrq	$1, %xmm0, (%rax)
 # CHECK-NEXT:  1      3     1.00                        vpextrw	$1, %xmm0, %ecx
 # CHECK-NEXT:  1      3     1.00           *            vpextrw	$1, %xmm0, (%rax)
-# CHECK-NEXT:  1      1     0.50                        vphaddd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      6     1.00    *                   vphaddd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vphaddsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      6     1.00    *                   vphaddsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vphaddw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      6     1.00    *                   vphaddw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vphaddd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vphaddd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vphaddsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vphaddsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vphaddw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vphaddw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  1      2     0.50                        vphminposuw	%xmm0, %xmm2
 # CHECK-NEXT:  1      7     1.00    *                   vphminposuw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        vphsubd	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      6     1.00    *                   vphsubd	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vphsubsw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      6     1.00    *                   vphsubsw	(%rax), %xmm1, %xmm2
-# CHECK-NEXT:  1      1     0.50                        vphsubw	%xmm0, %xmm1, %xmm2
-# CHECK-NEXT:  1      6     1.00    *                   vphsubw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vphsubd	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vphsubd	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vphsubsw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vphsubsw	(%rax), %xmm1, %xmm2
+# CHECK-NEXT:  1      2     0.50                        vphsubw	%xmm0, %xmm1, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   vphsubw	(%rax), %xmm1, %xmm2
 # CHECK-NEXT:  2      7     0.50                        vpinsrb	$1, %eax, %xmm1, %xmm2
 # CHECK-NEXT:  1      4     1.00    *                   vpinsrb	$1, (%rax), %xmm1, %xmm2
 # CHECK-NEXT:  2      7     0.50                        vpinsrd	$1, %eax, %xmm1, %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/resources-sse3.s b/llvm/test/tools/llvm-mca/X86/BtVer2/resources-sse3.s
index 3fd2f406a98b..add8bd843bfa 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/resources-sse3.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/resources-sse3.s
@@ -43,14 +43,14 @@ movsldup  (%rax), %xmm2
 # CHECK-NEXT:  1      8     1.00    *                   addsubpd	(%rax), %xmm2
 # CHECK-NEXT:  1      3     1.00                        addsubps	%xmm0, %xmm2
 # CHECK-NEXT:  1      8     1.00    *                   addsubps	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        haddpd	%xmm0, %xmm2
-# CHECK-NEXT:  1      8     1.00    *                   haddpd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        haddps	%xmm0, %xmm2
-# CHECK-NEXT:  1      8     1.00    *                   haddps	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        hsubpd	%xmm0, %xmm2
-# CHECK-NEXT:  1      8     1.00    *                   hsubpd	(%rax), %xmm2
-# CHECK-NEXT:  1      3     1.00                        hsubps	%xmm0, %xmm2
-# CHECK-NEXT:  1      8     1.00    *                   hsubps	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        haddpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   haddpd	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        haddps	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   haddps	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        hsubpd	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   hsubpd	(%rax), %xmm2
+# CHECK-NEXT:  1      4     1.00                        hsubps	%xmm0, %xmm2
+# CHECK-NEXT:  1      9     1.00    *                   hsubps	(%rax), %xmm2
 # CHECK-NEXT:  1      5     1.00    *                   lddqu	(%rax), %xmm2
 # CHECK-NEXT:  1      1     0.50                        movddup	%xmm0, %xmm2
 # CHECK-NEXT:  1      6     1.00    *                   movddup	(%rax), %xmm2
diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/resources-ssse3.s b/llvm/test/tools/llvm-mca/X86/BtVer2/resources-ssse3.s
index ff7ff3f93bac..0b64d1da48b6 100644
--- a/llvm/test/tools/llvm-mca/X86/BtVer2/resources-ssse3.s
+++ b/llvm/test/tools/llvm-mca/X86/BtVer2/resources-ssse3.s
@@ -122,30 +122,30 @@ psignw      (%rax), %xmm2
 # CHECK-NEXT:  1      6     1.00    *                   palignr	$1, (%rax), %mm2
 # CHECK-NEXT:  1      1     0.50                        palignr	$1, %xmm0, %xmm2
 # CHECK-NEXT:  1      6     1.00    *                   palignr	$1, (%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        phaddd	%mm0, %mm2
-# CHECK-NEXT:  1      6     1.00    *                   phaddd	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        phaddd	%xmm0, %xmm2
-# CHECK-NEXT:  1      6     1.00    *                   phaddd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        phaddsw	%mm0, %mm2
-# CHECK-NEXT:  1      6     1.00    *                   phaddsw	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        phaddsw	%xmm0, %xmm2
-# CHECK-NEXT:  1      6     1.00    *                   phaddsw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        phaddw	%mm0, %mm2
-# CHECK-NEXT:  1      6     1.00    *                   phaddw	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        phaddw	%xmm0, %xmm2
-# CHECK-NEXT:  1      6     1.00    *                   phaddw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        phsubd	%mm0, %mm2
-# CHECK-NEXT:  1      6     1.00    *                   phsubd	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        phsubd	%xmm0, %xmm2
-# CHECK-NEXT:  1      6     1.00    *                   phsubd	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        phsubsw	%mm0, %mm2
-# CHECK-NEXT:  1      6     1.00    *                   phsubsw	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        phsubsw	%xmm0, %xmm2
-# CHECK-NEXT:  1      6     1.00    *                   phsubsw	(%rax), %xmm2
-# CHECK-NEXT:  1      1     0.50                        phsubw	%mm0, %mm2
-# CHECK-NEXT:  1      6     1.00    *                   phsubw	(%rax), %mm2
-# CHECK-NEXT:  1      1     0.50                        phsubw	%xmm0, %xmm2
-# CHECK-NEXT:  1      6     1.00    *                   phsubw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        phaddd	%mm0, %mm2
+# CHECK-NEXT:  1      7     1.00    *                   phaddd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        phaddd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   phaddd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        phaddsw	%mm0, %mm2
+# CHECK-NEXT:  1      7     1.00    *                   phaddsw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        phaddsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   phaddsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        phaddw	%mm0, %mm2
+# CHECK-NEXT:  1      7     1.00    *                   phaddw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        phaddw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   phaddw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        phsubd	%mm0, %mm2
+# CHECK-NEXT:  1      7     1.00    *                   phsubd	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        phsubd	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   phsubd	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        phsubsw	%mm0, %mm2
+# CHECK-NEXT:  1      7     1.00    *                   phsubsw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        phsubsw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   phsubsw	(%rax), %xmm2
+# CHECK-NEXT:  1      2     0.50                        phsubw	%mm0, %mm2
+# CHECK-NEXT:  1      7     1.00    *                   phsubw	(%rax), %mm2
+# CHECK-NEXT:  1      2     0.50                        phsubw	%xmm0, %xmm2
+# CHECK-NEXT:  1      7     1.00    *                   phsubw	(%rax), %xmm2
 # CHECK-NEXT:  1      2     1.00                        pmaddubsw	%mm0, %mm2
 # CHECK-NEXT:  1      7     1.00    *                   pmaddubsw	(%rax), %mm2
 # CHECK-NEXT:  1      2     1.00                        pmaddubsw	%xmm0, %xmm2