From 89deac669441f847fb389dacab0bcdf7e7f85de2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 8 Jun 2018 17:00:45 +0000 Subject: [PATCH] [X86][BtVer2] Add support for all SUB/XOR 32/64 scalar instructions that should match the dependency-breaking 'zero-idiom' As detailed in Agner Fog's microarchitecture doc (section 21.8, AMD Bobcat and Jaguar pipeline - Dependency-breaking instructions), these instructions are dependency-breaking and fast-path zero the destination register (and the appropriate EFLAGS bits). llvm-svn: 334303 --- llvm/lib/Target/X86/X86ScheduleBtVer2.td | 9 +- llvm/test/CodeGen/X86/avx-schedule.ll | 8 +- .../tools/llvm-mca/X86/BtVer2/zero-idioms.s | 254 +++++++++--------- 3 files changed, 139 insertions(+), 132 deletions(-) diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td index d41777159ba3..4155e13d4fe8 100644 --- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td +++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td @@ -555,7 +555,7 @@ def JWriteZeroLatency : SchedWriteRes<[]> { let Latency = 0; } -// Certain vector instructions that use the same register for both source +// Certain instructions that use the same register for both source // operands do not have a real dependency on the previous contents of the // register, and thus, do not have to wait before completing. They can be // optimized out at register renaming stage. @@ -564,6 +564,13 @@ def JWriteZeroLatency : SchedWriteRes<[]> { // Reference: Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs", // Section 21.8 [Dependency-breaking instructions].
+def JWriteZeroIdiom : SchedWriteVariant<[ + SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, + SchedVar<MCSchedPredicate<TruePred>, [WriteALU]> +]>; +def : InstRW<[JWriteZeroIdiom], (instrs SUB32rr, SUB64rr, + XOR32rr, XOR64rr)>; + def JWriteFZeroIdiom : SchedWriteVariant<[ SchedVar<MCSchedPredicate<ZeroIdiomPredicate>, [JWriteZeroLatency]>, SchedVar<MCSchedPredicate<TruePred>, [WriteFLogic]> ]>; diff --git a/llvm/test/CodeGen/X86/avx-schedule.ll b/llvm/test/CodeGen/X86/avx-schedule.ll index 5a5d6db6070e..6c137846dd95 100644 --- a/llvm/test/CodeGen/X86/avx-schedule.ll +++ b/llvm/test/CodeGen/X86/avx-schedule.ll @@ -4691,7 +4691,7 @@ define i32 @test_testpd(<2 x double> %a0, <2 x double> %a1, <2 x double> *%a2) { ; ; BTVER2-LABEL: test_testpd: ; BTVER2: # %bb.0: -; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50] +; BTVER2-NEXT: xorl %eax, %eax # sched: [0:0.50] ; BTVER2-NEXT: vtestpd %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: setb %al # sched: [1:0.50] ; BTVER2-NEXT: vtestpd (%rdi), %xmm0 # sched: [8:1.00] @@ -4777,7 +4777,7 @@ define i32 @test_testpd_ymm(<4 x double> %a0, <4 x double> %a1, <4 x double> *%a ; ; BTVER2-LABEL: test_testpd_ymm: ; BTVER2: # %bb.0: -; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50] +; BTVER2-NEXT: xorl %eax, %eax # sched: [0:0.50] ; BTVER2-NEXT: vtestpd %ymm1, %ymm0 # sched: [4:2.00] ; BTVER2-NEXT: setb %al # sched: [1:0.50] ; BTVER2-NEXT: vtestpd (%rdi), %ymm0 # sched: [9:2.00] @@ -4858,7 +4858,7 @@ define i32 @test_testps(<4 x float> %a0, <4 x float> %a1, <4 x float> *%a2) { ; ; BTVER2-LABEL: test_testps: ; BTVER2: # %bb.0: -; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50] +; BTVER2-NEXT: xorl %eax, %eax # sched: [0:0.50] ; BTVER2-NEXT: vtestps %xmm1, %xmm0 # sched: [3:1.00] ; BTVER2-NEXT: setb %al # sched: [1:0.50] ; BTVER2-NEXT: vtestps (%rdi), %xmm0 # sched: [8:1.00] @@ -4944,7 +4944,7 @@ define i32 @test_testps_ymm(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a2) ; ; BTVER2-LABEL: test_testps_ymm: ; BTVER2: # %bb.0: -; BTVER2-NEXT: xorl %eax, %eax # sched: [1:0.50] +; BTVER2-NEXT: xorl %eax, %eax # sched: [0:0.50] ; BTVER2-NEXT: vtestps
%ymm1, %ymm0 # sched: [4:2.00] ; BTVER2-NEXT: setb %al # sched: [1:0.50] ; BTVER2-NEXT: vtestps (%rdi), %ymm0 # sched: [9:2.00] diff --git a/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms.s b/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms.s index 1fcfde107bc8..ca52ec39c88f 100644 --- a/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms.s +++ b/llvm/test/tools/llvm-mca/X86/BtVer2/zero-idioms.s @@ -67,9 +67,9 @@ vpxor %xmm3, %xmm3, %xmm5 # CHECK: Iterations: 1 # CHECK-NEXT: Instructions: 55 -# CHECK-NEXT: Total Cycles: 32 +# CHECK-NEXT: Total Cycles: 29 # CHECK-NEXT: Dispatch Width: 2 -# CHECK-NEXT: IPC: 1.72 +# CHECK-NEXT: IPC: 1.90 # CHECK-NEXT: Block RThroughput: 27.5 # CHECK: Instruction Info: @@ -81,10 +81,10 @@ vpxor %xmm3, %xmm3, %xmm5 # CHECK-NEXT: [6]: HasSideEffects # CHECK: [1] [2] [3] [4] [5] [6] Instructions: -# CHECK-NEXT: 1 1 0.50 subl %eax, %eax -# CHECK-NEXT: 1 1 0.50 subq %rax, %rax -# CHECK-NEXT: 1 1 0.50 xorl %eax, %eax -# CHECK-NEXT: 1 1 0.50 xorq %rax, %rax +# CHECK-NEXT: 1 0 0.50 subl %eax, %eax +# CHECK-NEXT: 1 0 0.50 subq %rax, %rax +# CHECK-NEXT: 1 0 0.50 xorl %eax, %eax +# CHECK-NEXT: 1 0 0.50 xorq %rax, %rax # CHECK-NEXT: 1 0 0.50 pcmpgtb %mm2, %mm2 # CHECK-NEXT: 1 0 0.50 pcmpgtd %mm2, %mm2 # CHECK-NEXT: 1 0 0.50 pcmpgtw %mm2, %mm2 @@ -138,8 +138,8 @@ vpxor %xmm3, %xmm3, %xmm5 # CHECK-NEXT: 1 0 0.50 vpxor %xmm3, %xmm3, %xmm5 # CHECK: Register File statistics: -# CHECK-NEXT: Total number of mappings created: 8 -# CHECK-NEXT: Max number of mappings used: 8 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number of mappings used: 0 # CHECK: * Register File #1 -- JFpuPRF: # CHECK-NEXT: Number of physical registers: 72 @@ -148,8 +148,8 @@ vpxor %xmm3, %xmm3, %xmm5 # CHECK: * Register File #2 -- JIntegerPRF: # CHECK-NEXT: Number of physical registers: 64 -# CHECK-NEXT: Total number of mappings created: 8 -# CHECK-NEXT: Max number of mappings used: 8 +# CHECK-NEXT: Total number of mappings created: 0 +# CHECK-NEXT: Max number 
of mappings used: 0 # CHECK: Resources: # CHECK-NEXT: [0] - JALU0 @@ -169,14 +169,14 @@ vpxor %xmm3, %xmm3, %xmm5 # CHECK: Resource pressure per iteration: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] -# CHECK-NEXT: 2.00 2.00 - - - - - - - - - - - - +# CHECK-NEXT: - - - - - - - - - - - - - - # CHECK: Resource pressure by instruction: # CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12] [13] Instructions: -# CHECK-NEXT: - 1.00 - - - - - - - - - - - - subl %eax, %eax -# CHECK-NEXT: 1.00 - - - - - - - - - - - - - subq %rax, %rax -# CHECK-NEXT: - 1.00 - - - - - - - - - - - - xorl %eax, %eax -# CHECK-NEXT: 1.00 - - - - - - - - - - - - - xorq %rax, %rax +# CHECK-NEXT: - - - - - - - - - - - - - - subl %eax, %eax +# CHECK-NEXT: - - - - - - - - - - - - - - subq %rax, %rax +# CHECK-NEXT: - - - - - - - - - - - - - - xorl %eax, %eax +# CHECK-NEXT: - - - - - - - - - - - - - - xorq %rax, %rax # CHECK-NEXT: - - - - - - - - - - - - - - pcmpgtb %mm2, %mm2 # CHECK-NEXT: - - - - - - - - - - - - - - pcmpgtd %mm2, %mm2 # CHECK-NEXT: - - - - - - - - - - - - - - pcmpgtw %mm2, %mm2 @@ -230,64 +230,64 @@ vpxor %xmm3, %xmm3, %xmm5 # CHECK-NEXT: - - - - - - - - - - - - - - vpxor %xmm3, %xmm3, %xmm5 # CHECK: Timeline view: -# CHECK-NEXT: 0123456789 01 -# CHECK-NEXT: Index 0123456789 0123456789 +# CHECK-NEXT: 0123456789 +# CHECK-NEXT: Index 0123456789 012345678 -# CHECK: [0,0] DeER . . . . . .. subl %eax, %eax -# CHECK-NEXT: [0,1] D=eER. . . . . .. subq %rax, %rax -# CHECK-NEXT: [0,2] .D=eER . . . . .. xorl %eax, %eax -# CHECK-NEXT: [0,3] .D==eER . . . . .. xorq %rax, %rax -# CHECK-NEXT: [0,4] . D---R . . . . .. pcmpgtb %mm2, %mm2 -# CHECK-NEXT: [0,5] . D----R . . . . .. pcmpgtd %mm2, %mm2 -# CHECK-NEXT: [0,6] . D---R . . . . .. pcmpgtw %mm2, %mm2 -# CHECK-NEXT: [0,7] . D----R . . . . .. pcmpgtb %xmm2, %xmm2 -# CHECK-NEXT: [0,8] . D---R . . . . .. pcmpgtd %xmm2, %xmm2 -# CHECK-NEXT: [0,9] . D----R. . . . .. 
pcmpgtq %xmm2, %xmm2 -# CHECK-NEXT: [0,10] . D---R. . . . .. pcmpgtw %xmm2, %xmm2 -# CHECK-NEXT: [0,11] . D----R . . . .. vpcmpgtb %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,12] . .D---R . . . .. vpcmpgtd %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,13] . .D----R . . . .. vpcmpgtq %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,14] . . D---R . . . .. vpcmpgtw %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,15] . . D----R . . . .. vpcmpgtb %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,16] . . D---R . . . .. vpcmpgtd %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,17] . . D----R . . . .. vpcmpgtq %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,18] . . D---R . . . .. vpcmpgtw %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,19] . . D----R. . . .. psubb %mm2, %mm2 -# CHECK-NEXT: [0,20] . . D---R. . . .. psubd %mm2, %mm2 -# CHECK-NEXT: [0,21] . . D----R . . .. psubq %mm2, %mm2 -# CHECK-NEXT: [0,22] . . .D---R . . .. psubw %mm2, %mm2 -# CHECK-NEXT: [0,23] . . .D----R . . .. psubb %xmm2, %xmm2 -# CHECK-NEXT: [0,24] . . . D---R . . .. psubd %xmm2, %xmm2 -# CHECK-NEXT: [0,25] . . . D----R . . .. psubq %xmm2, %xmm2 -# CHECK-NEXT: [0,26] . . . D---R . . .. psubw %xmm2, %xmm2 -# CHECK-NEXT: [0,27] . . . D----R . . .. vpsubb %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,28] . . . D---R . . .. vpsubd %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,29] . . . D----R. . .. vpsubq %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,30] . . . D---R. . .. vpsubw %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,31] . . . D----R . .. vpsubb %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,32] . . . .D---R . .. vpsubd %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,33] . . . .D----R . .. vpsubq %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,34] . . . . D---R . .. vpsubw %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,35] . . . . D----R . .. andnps %xmm0, %xmm0 -# CHECK-NEXT: [0,36] . . . . D---R . .. andnpd %xmm1, %xmm1 -# CHECK-NEXT: [0,37] . . . . D----R . .. vandnps %xmm2, %xmm2, %xmm2 -# CHECK-NEXT: [0,38] . . . . D---R . .. vandnpd %xmm1, %xmm1, %xmm1 -# CHECK-NEXT: [0,39] . . . . D----R. .. pandn %mm2, %mm2 -# CHECK-NEXT: [0,40] . . 
. . D---R. .. pandn %xmm2, %xmm2 -# CHECK-NEXT: [0,41] . . . . D----R .. vpandn %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,42] . . . . .D---R .. vandnps %xmm2, %xmm2, %xmm5 -# CHECK-NEXT: [0,43] . . . . .D----R .. vandnpd %xmm1, %xmm1, %xmm5 -# CHECK-NEXT: [0,44] . . . . . D---R .. vpandn %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,45] . . . . . D----R .. xorps %xmm0, %xmm0 -# CHECK-NEXT: [0,46] . . . . . D---R .. xorpd %xmm1, %xmm1 -# CHECK-NEXT: [0,47] . . . . . D----R .. vxorps %xmm2, %xmm2, %xmm2 -# CHECK-NEXT: [0,48] . . . . . D---R .. vxorpd %xmm1, %xmm1, %xmm1 -# CHECK-NEXT: [0,49] . . . . . D----R.. pxor %mm2, %mm2 -# CHECK-NEXT: [0,50] . . . . . D---R.. pxor %xmm2, %xmm2 -# CHECK-NEXT: [0,51] . . . . . D----R. vpxor %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,52] . . . . . .D---R. vxorps %xmm4, %xmm4, %xmm5 -# CHECK-NEXT: [0,53] . . . . . .D----R vxorpd %xmm1, %xmm1, %xmm3 -# CHECK-NEXT: [0,54] . . . . . . D---R vpxor %xmm3, %xmm3, %xmm5 +# CHECK: [0,0] DR . . . . . . subl %eax, %eax +# CHECK-NEXT: [0,1] DR . . . . . . subq %rax, %rax +# CHECK-NEXT: [0,2] .DR . . . . . . xorl %eax, %eax +# CHECK-NEXT: [0,3] .DR . . . . . . xorq %rax, %rax +# CHECK-NEXT: [0,4] . DR . . . . . . pcmpgtb %mm2, %mm2 +# CHECK-NEXT: [0,5] . DR . . . . . . pcmpgtd %mm2, %mm2 +# CHECK-NEXT: [0,6] . DR. . . . . . pcmpgtw %mm2, %mm2 +# CHECK-NEXT: [0,7] . DR. . . . . . pcmpgtb %xmm2, %xmm2 +# CHECK-NEXT: [0,8] . DR . . . . . pcmpgtd %xmm2, %xmm2 +# CHECK-NEXT: [0,9] . DR . . . . . pcmpgtq %xmm2, %xmm2 +# CHECK-NEXT: [0,10] . DR . . . . . pcmpgtw %xmm2, %xmm2 +# CHECK-NEXT: [0,11] . DR . . . . . vpcmpgtb %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,12] . .DR . . . . . vpcmpgtd %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,13] . .DR . . . . . vpcmpgtq %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,14] . . DR . . . . . vpcmpgtw %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,15] . . DR . . . . . vpcmpgtb %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,16] . . DR. . . . . vpcmpgtd %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,17] . . DR. . . . . 
vpcmpgtq %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,18] . . DR . . . . vpcmpgtw %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,19] . . DR . . . . psubb %mm2, %mm2 +# CHECK-NEXT: [0,20] . . DR . . . . psubd %mm2, %mm2 +# CHECK-NEXT: [0,21] . . DR . . . . psubq %mm2, %mm2 +# CHECK-NEXT: [0,22] . . .DR . . . . psubw %mm2, %mm2 +# CHECK-NEXT: [0,23] . . .DR . . . . psubb %xmm2, %xmm2 +# CHECK-NEXT: [0,24] . . . DR . . . . psubd %xmm2, %xmm2 +# CHECK-NEXT: [0,25] . . . DR . . . . psubq %xmm2, %xmm2 +# CHECK-NEXT: [0,26] . . . DR. . . . psubw %xmm2, %xmm2 +# CHECK-NEXT: [0,27] . . . DR. . . . vpsubb %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,28] . . . DR . . . vpsubd %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,29] . . . DR . . . vpsubq %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,30] . . . DR . . . vpsubw %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,31] . . . DR . . . vpsubb %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,32] . . . .DR . . . vpsubd %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,33] . . . .DR . . . vpsubq %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,34] . . . . DR . . . vpsubw %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,35] . . . . DR . . . andnps %xmm0, %xmm0 +# CHECK-NEXT: [0,36] . . . . DR. . . andnpd %xmm1, %xmm1 +# CHECK-NEXT: [0,37] . . . . DR. . . vandnps %xmm2, %xmm2, %xmm2 +# CHECK-NEXT: [0,38] . . . . DR . . vandnpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: [0,39] . . . . DR . . pandn %mm2, %mm2 +# CHECK-NEXT: [0,40] . . . . DR . . pandn %xmm2, %xmm2 +# CHECK-NEXT: [0,41] . . . . DR . . vpandn %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,42] . . . . .DR . . vandnps %xmm2, %xmm2, %xmm5 +# CHECK-NEXT: [0,43] . . . . .DR . . vandnpd %xmm1, %xmm1, %xmm5 +# CHECK-NEXT: [0,44] . . . . . DR . . vpandn %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,45] . . . . . DR . . xorps %xmm0, %xmm0 +# CHECK-NEXT: [0,46] . . . . . DR. . xorpd %xmm1, %xmm1 +# CHECK-NEXT: [0,47] . . . . . DR. . vxorps %xmm2, %xmm2, %xmm2 +# CHECK-NEXT: [0,48] . . . . . DR . vxorpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: [0,49] . . . . . DR . pxor %mm2, %mm2 +# CHECK-NEXT: [0,50] . 
. . . . DR . pxor %xmm2, %xmm2 +# CHECK-NEXT: [0,51] . . . . . DR . vpxor %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,52] . . . . . .DR. vxorps %xmm4, %xmm4, %xmm5 +# CHECK-NEXT: [0,53] . . . . . .DR. vxorpd %xmm1, %xmm1, %xmm3 +# CHECK-NEXT: [0,54] . . . . . . DR vpxor %xmm3, %xmm3, %xmm5 # CHECK: Average Wait times (based on the timeline view): # CHECK-NEXT: [0]: Executions @@ -296,58 +296,58 @@ vpxor %xmm3, %xmm3, %xmm5 # CHECK-NEXT: [3]: Average time elapsed from WB until retire stage # CHECK: [0] [1] [2] [3] -# CHECK-NEXT: 0. 1 1.0 1.0 0.0 subl %eax, %eax -# CHECK-NEXT: 1. 1 2.0 0.0 0.0 subq %rax, %rax -# CHECK-NEXT: 2. 1 2.0 0.0 0.0 xorl %eax, %eax -# CHECK-NEXT: 3. 1 3.0 0.0 0.0 xorq %rax, %rax -# CHECK-NEXT: 4. 1 0.0 0.0 3.0 pcmpgtb %mm2, %mm2 -# CHECK-NEXT: 5. 1 0.0 0.0 4.0 pcmpgtd %mm2, %mm2 -# CHECK-NEXT: 6. 1 0.0 0.0 3.0 pcmpgtw %mm2, %mm2 -# CHECK-NEXT: 7. 1 0.0 0.0 4.0 pcmpgtb %xmm2, %xmm2 -# CHECK-NEXT: 8. 1 0.0 0.0 3.0 pcmpgtd %xmm2, %xmm2 -# CHECK-NEXT: 9. 1 0.0 0.0 4.0 pcmpgtq %xmm2, %xmm2 -# CHECK-NEXT: 10. 1 0.0 0.0 3.0 pcmpgtw %xmm2, %xmm2 -# CHECK-NEXT: 11. 1 0.0 0.0 4.0 vpcmpgtb %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 12. 1 0.0 0.0 3.0 vpcmpgtd %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 13. 1 0.0 0.0 4.0 vpcmpgtq %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 14. 1 0.0 0.0 3.0 vpcmpgtw %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 15. 1 0.0 0.0 4.0 vpcmpgtb %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 16. 1 0.0 0.0 3.0 vpcmpgtd %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 17. 1 0.0 0.0 4.0 vpcmpgtq %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 18. 1 0.0 0.0 3.0 vpcmpgtw %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 19. 1 0.0 0.0 4.0 psubb %mm2, %mm2 -# CHECK-NEXT: 20. 1 0.0 0.0 3.0 psubd %mm2, %mm2 -# CHECK-NEXT: 21. 1 0.0 0.0 4.0 psubq %mm2, %mm2 -# CHECK-NEXT: 22. 1 0.0 0.0 3.0 psubw %mm2, %mm2 -# CHECK-NEXT: 23. 1 0.0 0.0 4.0 psubb %xmm2, %xmm2 -# CHECK-NEXT: 24. 1 0.0 0.0 3.0 psubd %xmm2, %xmm2 -# CHECK-NEXT: 25. 1 0.0 0.0 4.0 psubq %xmm2, %xmm2 -# CHECK-NEXT: 26. 1 0.0 0.0 3.0 psubw %xmm2, %xmm2 -# CHECK-NEXT: 27. 
1 0.0 0.0 4.0 vpsubb %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 28. 1 0.0 0.0 3.0 vpsubd %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 29. 1 0.0 0.0 4.0 vpsubq %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 30. 1 0.0 0.0 3.0 vpsubw %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 31. 1 0.0 0.0 4.0 vpsubb %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 32. 1 0.0 0.0 3.0 vpsubd %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 33. 1 0.0 0.0 4.0 vpsubq %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 34. 1 0.0 0.0 3.0 vpsubw %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 35. 1 0.0 0.0 4.0 andnps %xmm0, %xmm0 -# CHECK-NEXT: 36. 1 0.0 0.0 3.0 andnpd %xmm1, %xmm1 -# CHECK-NEXT: 37. 1 0.0 0.0 4.0 vandnps %xmm2, %xmm2, %xmm2 -# CHECK-NEXT: 38. 1 0.0 0.0 3.0 vandnpd %xmm1, %xmm1, %xmm1 -# CHECK-NEXT: 39. 1 0.0 0.0 4.0 pandn %mm2, %mm2 -# CHECK-NEXT: 40. 1 0.0 0.0 3.0 pandn %xmm2, %xmm2 -# CHECK-NEXT: 41. 1 0.0 0.0 4.0 vpandn %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 42. 1 0.0 0.0 3.0 vandnps %xmm2, %xmm2, %xmm5 -# CHECK-NEXT: 43. 1 0.0 0.0 4.0 vandnpd %xmm1, %xmm1, %xmm5 -# CHECK-NEXT: 44. 1 0.0 0.0 3.0 vpandn %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 45. 1 0.0 0.0 4.0 xorps %xmm0, %xmm0 -# CHECK-NEXT: 46. 1 0.0 0.0 3.0 xorpd %xmm1, %xmm1 -# CHECK-NEXT: 47. 1 0.0 0.0 4.0 vxorps %xmm2, %xmm2, %xmm2 -# CHECK-NEXT: 48. 1 0.0 0.0 3.0 vxorpd %xmm1, %xmm1, %xmm1 -# CHECK-NEXT: 49. 1 0.0 0.0 4.0 pxor %mm2, %mm2 -# CHECK-NEXT: 50. 1 0.0 0.0 3.0 pxor %xmm2, %xmm2 -# CHECK-NEXT: 51. 1 0.0 0.0 4.0 vpxor %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 52. 1 0.0 0.0 3.0 vxorps %xmm4, %xmm4, %xmm5 -# CHECK-NEXT: 53. 1 0.0 0.0 4.0 vxorpd %xmm1, %xmm1, %xmm3 -# CHECK-NEXT: 54. 1 0.0 0.0 3.0 vpxor %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 0. 1 0.0 0.0 0.0 subl %eax, %eax +# CHECK-NEXT: 1. 1 0.0 0.0 0.0 subq %rax, %rax +# CHECK-NEXT: 2. 1 0.0 0.0 0.0 xorl %eax, %eax +# CHECK-NEXT: 3. 1 0.0 0.0 0.0 xorq %rax, %rax +# CHECK-NEXT: 4. 1 0.0 0.0 0.0 pcmpgtb %mm2, %mm2 +# CHECK-NEXT: 5. 1 0.0 0.0 0.0 pcmpgtd %mm2, %mm2 +# CHECK-NEXT: 6. 1 0.0 0.0 0.0 pcmpgtw %mm2, %mm2 +# CHECK-NEXT: 7. 
1 0.0 0.0 0.0 pcmpgtb %xmm2, %xmm2 +# CHECK-NEXT: 8. 1 0.0 0.0 0.0 pcmpgtd %xmm2, %xmm2 +# CHECK-NEXT: 9. 1 0.0 0.0 0.0 pcmpgtq %xmm2, %xmm2 +# CHECK-NEXT: 10. 1 0.0 0.0 0.0 pcmpgtw %xmm2, %xmm2 +# CHECK-NEXT: 11. 1 0.0 0.0 0.0 vpcmpgtb %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 12. 1 0.0 0.0 0.0 vpcmpgtd %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 13. 1 0.0 0.0 0.0 vpcmpgtq %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 14. 1 0.0 0.0 0.0 vpcmpgtw %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 15. 1 0.0 0.0 0.0 vpcmpgtb %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 16. 1 0.0 0.0 0.0 vpcmpgtd %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 17. 1 0.0 0.0 0.0 vpcmpgtq %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 18. 1 0.0 0.0 0.0 vpcmpgtw %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 19. 1 0.0 0.0 0.0 psubb %mm2, %mm2 +# CHECK-NEXT: 20. 1 0.0 0.0 0.0 psubd %mm2, %mm2 +# CHECK-NEXT: 21. 1 0.0 0.0 0.0 psubq %mm2, %mm2 +# CHECK-NEXT: 22. 1 0.0 0.0 0.0 psubw %mm2, %mm2 +# CHECK-NEXT: 23. 1 0.0 0.0 0.0 psubb %xmm2, %xmm2 +# CHECK-NEXT: 24. 1 0.0 0.0 0.0 psubd %xmm2, %xmm2 +# CHECK-NEXT: 25. 1 0.0 0.0 0.0 psubq %xmm2, %xmm2 +# CHECK-NEXT: 26. 1 0.0 0.0 0.0 psubw %xmm2, %xmm2 +# CHECK-NEXT: 27. 1 0.0 0.0 0.0 vpsubb %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 28. 1 0.0 0.0 0.0 vpsubd %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 29. 1 0.0 0.0 0.0 vpsubq %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 30. 1 0.0 0.0 0.0 vpsubw %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 31. 1 0.0 0.0 0.0 vpsubb %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 32. 1 0.0 0.0 0.0 vpsubd %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 33. 1 0.0 0.0 0.0 vpsubq %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 34. 1 0.0 0.0 0.0 vpsubw %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 35. 1 0.0 0.0 0.0 andnps %xmm0, %xmm0 +# CHECK-NEXT: 36. 1 0.0 0.0 0.0 andnpd %xmm1, %xmm1 +# CHECK-NEXT: 37. 1 0.0 0.0 0.0 vandnps %xmm2, %xmm2, %xmm2 +# CHECK-NEXT: 38. 1 0.0 0.0 0.0 vandnpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: 39. 1 0.0 0.0 0.0 pandn %mm2, %mm2 +# CHECK-NEXT: 40. 1 0.0 0.0 0.0 pandn %xmm2, %xmm2 +# CHECK-NEXT: 41. 1 0.0 0.0 0.0 vpandn %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 42. 
1 0.0 0.0 0.0 vandnps %xmm2, %xmm2, %xmm5 +# CHECK-NEXT: 43. 1 0.0 0.0 0.0 vandnpd %xmm1, %xmm1, %xmm5 +# CHECK-NEXT: 44. 1 0.0 0.0 0.0 vpandn %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 45. 1 0.0 0.0 0.0 xorps %xmm0, %xmm0 +# CHECK-NEXT: 46. 1 0.0 0.0 0.0 xorpd %xmm1, %xmm1 +# CHECK-NEXT: 47. 1 0.0 0.0 0.0 vxorps %xmm2, %xmm2, %xmm2 +# CHECK-NEXT: 48. 1 0.0 0.0 0.0 vxorpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: 49. 1 0.0 0.0 0.0 pxor %mm2, %mm2 +# CHECK-NEXT: 50. 1 0.0 0.0 0.0 pxor %xmm2, %xmm2 +# CHECK-NEXT: 51. 1 0.0 0.0 0.0 vpxor %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 52. 1 0.0 0.0 0.0 vxorps %xmm4, %xmm4, %xmm5 +# CHECK-NEXT: 53. 1 0.0 0.0 0.0 vxorpd %xmm1, %xmm1, %xmm3 +# CHECK-NEXT: 54. 1 0.0 0.0 0.0 vpxor %xmm3, %xmm3, %xmm5