[X86][SSE] Add SSE3 coverage to PHADD/SUB generation tests

This would have caught the regression identified in rGc012a388a15b.
Simon Pilgrim 2021-05-15 21:25:34 +01:00
parent 88a8965a7d
commit 0afb10de14
1 changed file with 444 additions and 189 deletions
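For reference, the hadd/hsub functions in this file pair up even and odd lanes with shufflevectors and feed them into an add or sub, which the X86 backend is expected to recognize as a horizontal op. A minimal sketch of that shape (hypothetical function name; the actual IR bodies are elided from this diff view), which should lower to a single haddps once -mattr=+sse3 or later is enabled, as the new SSE3 RUN lines check:

define <4 x float> @hadd_v4f32_sketch(<4 x float> %a) {
  ; Gather the even lanes and the odd lanes of %a.
  %lhs = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
  %rhs = shufflevector <4 x float> %a, <4 x float> %a, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  ; Adds adjacent pairs; isel should fold the shuffles + fadd into haddps %xmm0, %xmm0.
  %hadd = fadd <4 x float> %lhs, %rhs
  ret <4 x float> %hadd
}

Feeding such a function through llc -mtriple=x86_64-unknown -mattr=+sse3 is exactly the coverage this commit adds alongside the existing SSSE3/AVX runs.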


@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3,SSSE3_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3,SSSE3_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSE3,SSE3_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSE3,SSE3_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSE_SLOW,SSSE3,SSSE3_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSE,SSE_FAST,SSSE3,SSSE3_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_SLOW
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX,AVX1,AVX1_FAST
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2_SLOW
@@ -10,10 +12,10 @@
; PR34111 - https://bugs.llvm.org/show_bug.cgi?id=34111
define <4 x float> @hadd_v4f32(<4 x float> %a) {
; SSSE3-LABEL: hadd_v4f32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm0, %xmm0
; SSSE3-NEXT: retq
; SSE-LABEL: hadd_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_v4f32:
; AVX: # %bb.0:
@@ -27,21 +29,21 @@ define <4 x float> @hadd_v4f32(<4 x float> %a) {
}
define <8 x float> @hadd_v8f32a(<8 x float> %a) {
; SSSE3_SLOW-LABEL: hadd_v8f32a:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2
; SSSE3_SLOW-NEXT: haddps %xmm1, %xmm2
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3_SLOW-NEXT: retq
; SSE_SLOW-LABEL: hadd_v8f32a:
; SSE_SLOW: # %bb.0:
; SSE_SLOW-NEXT: movaps %xmm0, %xmm2
; SSE_SLOW-NEXT: haddps %xmm1, %xmm2
; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSE_SLOW-NEXT: movaps %xmm2, %xmm1
; SSE_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v8f32a:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: movaps %xmm0, %xmm2
; SSSE3_FAST-NEXT: haddps %xmm1, %xmm2
; SSSE3_FAST-NEXT: haddps %xmm0, %xmm0
; SSSE3_FAST-NEXT: movaps %xmm2, %xmm1
; SSSE3_FAST-NEXT: retq
; SSE_FAST-LABEL: hadd_v8f32a:
; SSE_FAST: # %bb.0:
; SSE_FAST-NEXT: movaps %xmm0, %xmm2
; SSE_FAST-NEXT: haddps %xmm1, %xmm2
; SSE_FAST-NEXT: haddps %xmm0, %xmm0
; SSE_FAST-NEXT: movaps %xmm2, %xmm1
; SSE_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hadd_v8f32a:
; AVX1_SLOW: # %bb.0:
@@ -71,11 +73,11 @@ define <8 x float> @hadd_v8f32a(<8 x float> %a) {
}
define <8 x float> @hadd_v8f32b(<8 x float> %a) {
; SSSE3-LABEL: hadd_v8f32b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm0, %xmm0
; SSSE3-NEXT: haddps %xmm1, %xmm1
; SSSE3-NEXT: retq
; SSE-LABEL: hadd_v8f32b:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: haddps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_v8f32b:
; AVX: # %bb.0:
@@ -89,10 +91,10 @@ define <8 x float> @hadd_v8f32b(<8 x float> %a) {
}
define <4 x float> @hsub_v4f32(<4 x float> %a) {
; SSSE3-LABEL: hsub_v4f32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: hsubps %xmm0, %xmm0
; SSSE3-NEXT: retq
; SSE-LABEL: hsub_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: hsubps %xmm0, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_v4f32:
; AVX: # %bb.0:
@@ -106,21 +108,21 @@ define <4 x float> @hsub_v4f32(<4 x float> %a) {
}
define <8 x float> @hsub_v8f32a(<8 x float> %a) {
; SSSE3_SLOW-LABEL: hsub_v8f32a:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movaps %xmm0, %xmm2
; SSSE3_SLOW-NEXT: hsubps %xmm1, %xmm2
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3_SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3_SLOW-NEXT: retq
; SSE_SLOW-LABEL: hsub_v8f32a:
; SSE_SLOW: # %bb.0:
; SSE_SLOW-NEXT: movaps %xmm0, %xmm2
; SSE_SLOW-NEXT: hsubps %xmm1, %xmm2
; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSE_SLOW-NEXT: movaps %xmm2, %xmm1
; SSE_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hsub_v8f32a:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: movaps %xmm0, %xmm2
; SSSE3_FAST-NEXT: hsubps %xmm1, %xmm2
; SSSE3_FAST-NEXT: hsubps %xmm0, %xmm0
; SSSE3_FAST-NEXT: movaps %xmm2, %xmm1
; SSSE3_FAST-NEXT: retq
; SSE_FAST-LABEL: hsub_v8f32a:
; SSE_FAST: # %bb.0:
; SSE_FAST-NEXT: movaps %xmm0, %xmm2
; SSE_FAST-NEXT: hsubps %xmm1, %xmm2
; SSE_FAST-NEXT: hsubps %xmm0, %xmm0
; SSE_FAST-NEXT: movaps %xmm2, %xmm1
; SSE_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hsub_v8f32a:
; AVX1_SLOW: # %bb.0:
@@ -150,11 +152,11 @@ define <8 x float> @hsub_v8f32a(<8 x float> %a) {
}
define <8 x float> @hsub_v8f32b(<8 x float> %a) {
; SSSE3-LABEL: hsub_v8f32b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: hsubps %xmm0, %xmm0
; SSSE3-NEXT: hsubps %xmm1, %xmm1
; SSSE3-NEXT: retq
; SSE-LABEL: hsub_v8f32b:
; SSE: # %bb.0:
; SSE-NEXT: hsubps %xmm0, %xmm0
; SSE-NEXT: hsubps %xmm1, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_v8f32b:
; AVX: # %bb.0:
@@ -168,18 +170,18 @@ define <8 x float> @hsub_v8f32b(<8 x float> %a) {
}
define <2 x double> @hadd_v2f64(<2 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v2f64:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT: retq
; SSE_SLOW-LABEL: hadd_v2f64:
; SSE_SLOW: # %bb.0:
; SSE_SLOW-NEXT: movapd %xmm0, %xmm1
; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE_SLOW-NEXT: addsd %xmm0, %xmm1
; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v2f64:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT: retq
; SSE_FAST-LABEL: hadd_v2f64:
; SSE_FAST: # %bb.0:
; SSE_FAST-NEXT: haddpd %xmm0, %xmm0
; SSE_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hadd_v2f64:
; AVX1_SLOW: # %bb.0:
@@ -212,18 +214,18 @@ define <2 x double> @hadd_v2f64(<2 x double> %a) {
}
define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v2f64_scalar_splat:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT: retq
; SSE_SLOW-LABEL: hadd_v2f64_scalar_splat:
; SSE_SLOW: # %bb.0:
; SSE_SLOW-NEXT: movapd %xmm0, %xmm1
; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE_SLOW-NEXT: addsd %xmm0, %xmm1
; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v2f64_scalar_splat:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT: retq
; SSE_FAST-LABEL: hadd_v2f64_scalar_splat:
; SSE_FAST: # %bb.0:
; SSE_FAST-NEXT: haddpd %xmm0, %xmm0
; SSE_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hadd_v2f64_scalar_splat:
; AVX1_SLOW: # %bb.0:
@@ -257,23 +259,23 @@ define <2 x double> @hadd_v2f64_scalar_splat(<2 x double> %a) {
}
define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v4f64_scalar_splat:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm3
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSSE3_SLOW-NEXT: addsd %xmm1, %xmm3
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
; SSSE3_SLOW-NEXT: retq
; SSE_SLOW-LABEL: hadd_v4f64_scalar_splat:
; SSE_SLOW: # %bb.0:
; SSE_SLOW-NEXT: movapd %xmm0, %xmm2
; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE_SLOW-NEXT: addsd %xmm0, %xmm2
; SSE_SLOW-NEXT: movapd %xmm1, %xmm3
; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm3 = xmm3[1],xmm1[1]
; SSE_SLOW-NEXT: addsd %xmm1, %xmm3
; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSE_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm3[0,0]
; SSE_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v4f64_scalar_splat:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
; SSE_FAST-LABEL: hadd_v4f64_scalar_splat:
; SSE_FAST: # %bb.0:
; SSE_FAST-NEXT: haddpd %xmm0, %xmm0
; SSE_FAST-NEXT: haddpd %xmm1, %xmm1
; SSE_FAST-NEXT: retq
;
; AVX-LABEL: hadd_v4f64_scalar_splat:
; AVX: # %bb.0:
@@ -292,20 +294,20 @@ define <4 x double> @hadd_v4f64_scalar_splat(<4 x double> %a) {
}
define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: retq
; SSE_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; SSE_SLOW: # %bb.0:
; SSE_SLOW-NEXT: movapd %xmm0, %xmm1
; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE_SLOW-NEXT: addsd %xmm0, %xmm1
; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm1[0,0]
; SSE_SLOW-NEXT: movapd %xmm0, %xmm1
; SSE_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT: movapd %xmm0, %xmm1
; SSSE3_FAST-NEXT: retq
; SSE_FAST-LABEL: hadd_v4f64_scalar_broadcast:
; SSE_FAST: # %bb.0:
; SSE_FAST-NEXT: haddpd %xmm0, %xmm0
; SSE_FAST-NEXT: movapd %xmm0, %xmm1
; SSE_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hadd_v4f64_scalar_broadcast:
; AVX1_SLOW: # %bb.0:
@@ -346,23 +348,23 @@ define <4 x double> @hadd_v4f64_scalar_broadcast(<4 x double> %a) {
}
define <4 x double> @hadd_v4f64(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hadd_v4f64:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSSE3_SLOW-NEXT: addsd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm2
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSSE3_SLOW-NEXT: addsd %xmm1, %xmm2
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSSE3_SLOW-NEXT: retq
; SSE_SLOW-LABEL: hadd_v4f64:
; SSE_SLOW: # %bb.0:
; SSE_SLOW-NEXT: movapd %xmm0, %xmm2
; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE_SLOW-NEXT: addsd %xmm0, %xmm2
; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm2[0,0]
; SSE_SLOW-NEXT: movapd %xmm1, %xmm2
; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE_SLOW-NEXT: addsd %xmm1, %xmm2
; SSE_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSE_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hadd_v4f64:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: haddpd %xmm0, %xmm0
; SSSE3_FAST-NEXT: haddpd %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
; SSE_FAST-LABEL: hadd_v4f64:
; SSE_FAST: # %bb.0:
; SSE_FAST-NEXT: haddpd %xmm0, %xmm0
; SSE_FAST-NEXT: haddpd %xmm1, %xmm1
; SSE_FAST-NEXT: retq
;
; AVX-LABEL: hadd_v4f64:
; AVX: # %bb.0:
@@ -376,18 +378,18 @@ define <4 x double> @hadd_v4f64(<4 x double> %a) {
}
define <2 x double> @hsub_v2f64(<2 x double> %a) {
; SSSE3_SLOW-LABEL: hsub_v2f64:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm1
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSSE3_SLOW-NEXT: subsd %xmm1, %xmm0
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3_SLOW-NEXT: retq
; SSE_SLOW-LABEL: hsub_v2f64:
; SSE_SLOW: # %bb.0:
; SSE_SLOW-NEXT: movapd %xmm0, %xmm1
; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm0[1]
; SSE_SLOW-NEXT: subsd %xmm1, %xmm0
; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hsub_v2f64:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0
; SSSE3_FAST-NEXT: retq
; SSE_FAST-LABEL: hsub_v2f64:
; SSE_FAST: # %bb.0:
; SSE_FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE_FAST-NEXT: retq
;
; AVX1_SLOW-LABEL: hsub_v2f64:
; AVX1_SLOW: # %bb.0:
@@ -420,23 +422,23 @@ define <2 x double> @hsub_v2f64(<2 x double> %a) {
}
define <4 x double> @hsub_v4f64(<4 x double> %a) {
; SSSE3_SLOW-LABEL: hsub_v4f64:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movapd %xmm0, %xmm2
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSSE3_SLOW-NEXT: subsd %xmm2, %xmm0
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSSE3_SLOW-NEXT: movapd %xmm1, %xmm2
; SSSE3_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSSE3_SLOW-NEXT: subsd %xmm2, %xmm1
; SSSE3_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSSE3_SLOW-NEXT: retq
; SSE_SLOW-LABEL: hsub_v4f64:
; SSE_SLOW: # %bb.0:
; SSE_SLOW-NEXT: movapd %xmm0, %xmm2
; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm0[1]
; SSE_SLOW-NEXT: subsd %xmm2, %xmm0
; SSE_SLOW-NEXT: movddup {{.*#+}} xmm0 = xmm0[0,0]
; SSE_SLOW-NEXT: movapd %xmm1, %xmm2
; SSE_SLOW-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
; SSE_SLOW-NEXT: subsd %xmm2, %xmm1
; SSE_SLOW-NEXT: movddup {{.*#+}} xmm1 = xmm1[0,0]
; SSE_SLOW-NEXT: retq
;
; SSSE3_FAST-LABEL: hsub_v4f64:
; SSSE3_FAST: # %bb.0:
; SSSE3_FAST-NEXT: hsubpd %xmm0, %xmm0
; SSSE3_FAST-NEXT: hsubpd %xmm1, %xmm1
; SSSE3_FAST-NEXT: retq
; SSE_FAST-LABEL: hsub_v4f64:
; SSE_FAST: # %bb.0:
; SSE_FAST-NEXT: hsubpd %xmm0, %xmm0
; SSE_FAST-NEXT: hsubpd %xmm1, %xmm1
; SSE_FAST-NEXT: retq
;
; AVX-LABEL: hsub_v4f64:
; AVX: # %bb.0:
@@ -450,6 +452,13 @@ define <4 x double> @hsub_v4f64(<4 x double> %a) {
}
define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
; SSE3-LABEL: hadd_v4i32:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,2]
; SSE3-NEXT: paddd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hadd_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
@@ -467,6 +476,16 @@ define <4 x i32> @hadd_v4i32(<4 x i32> %a) {
}
define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
; SSE3-LABEL: hadd_v8i32a:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm2
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
; SSE3-NEXT: paddd %xmm0, %xmm2
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSE3-NEXT: movdqa %xmm2, %xmm1
; SSE3-NEXT: retq
;
; SSSE3_SLOW-LABEL: hadd_v8i32a:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2
@@ -513,6 +532,16 @@ define <8 x i32> @hadd_v8i32a(<8 x i32> %a) {
}
define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
; SSE3-LABEL: hadd_v8i32b:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,1,3]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE3-NEXT: paddd %xmm2, %xmm0
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,1,3]
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
; SSE3-NEXT: paddd %xmm2, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hadd_v8i32b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm0, %xmm0
@@ -539,6 +568,13 @@ define <8 x i32> @hadd_v8i32b(<8 x i32> %a) {
}
define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
; SSE3-LABEL: hsub_v4i32:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,3,1,3]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,3]
; SSE3-NEXT: psubd %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hsub_v4i32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm0, %xmm0
@@ -556,6 +592,16 @@ define <4 x i32> @hsub_v4i32(<4 x i32> %a) {
}
define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
; SSE3-LABEL: hsub_v8i32a:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm2
; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[0,2]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE3-NEXT: psubd %xmm0, %xmm2
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSE3-NEXT: movdqa %xmm2, %xmm1
; SSE3-NEXT: retq
;
; SSSE3_SLOW-LABEL: hsub_v8i32a:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2
@@ -602,6 +648,16 @@ define <8 x i32> @hsub_v8i32a(<8 x i32> %a) {
}
define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
; SSE3-LABEL: hsub_v8i32b:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,3,1,3]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
; SSE3-NEXT: psubd %xmm2, %xmm0
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,1,3]
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,0,2]
; SSE3-NEXT: psubd %xmm2, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hsub_v8i32b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm0, %xmm0
@@ -628,6 +684,18 @@ define <8 x i32> @hsub_v8i32b(<8 x i32> %a) {
}
define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
; SSE3-LABEL: hadd_v8i16:
; SSE3: # %bb.0:
; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2]
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0]
; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
; SSE3-NEXT: paddw %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hadd_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
@@ -645,6 +713,28 @@ define <8 x i16> @hadd_v8i16(<8 x i16> %a) {
}
define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
; SSE3-LABEL: hadd_v16i16a:
; SSE3: # %bb.0:
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm2[0]
; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,0,3,2,4,5,6,7]
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,0,3,2,4,5,6,7]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT: paddw %xmm3, %xmm1
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
; SSE3-NEXT: retq
;
; SSSE3_SLOW-LABEL: hadd_v16i16a:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2
@@ -691,6 +781,32 @@ define <16 x i16> @hadd_v16i16a(<16 x i16> %a) {
}
define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
; SSE3-LABEL: hadd_v16i16b:
; SSE3: # %bb.0:
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,1,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
; SSE3-NEXT: paddw %xmm2, %xmm0
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[3,1,1,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,4]
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,4,5]
; SSE3-NEXT: paddw %xmm2, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hadd_v16i16b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm0, %xmm0
@@ -717,6 +833,14 @@ define <16 x i16> @hadd_v16i16b(<16 x i16> %a) {
}
define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
; SSE3-LABEL: hsub_v8i16:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[1,1,3,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,6,6]
; SSE3-NEXT: psubw %xmm1, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hsub_v8i16:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubw %xmm0, %xmm0
@@ -734,6 +858,29 @@ define <8 x i16> @hsub_v8i16(<8 x i16> %a) {
}
define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
; SSE3-LABEL: hsub_v16i16a:
; SSE3: # %bb.0:
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: psubw %xmm0, %xmm2
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,1,0,1]
; SSE3-NEXT: movdqa %xmm2, %xmm1
; SSE3-NEXT: retq
;
; SSSE3_SLOW-LABEL: hsub_v16i16a:
; SSSE3_SLOW: # %bb.0:
; SSSE3_SLOW-NEXT: movdqa %xmm0, %xmm2
@@ -780,6 +927,32 @@ define <16 x i16> @hsub_v16i16a(<16 x i16> %a) {
}
define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
; SSE3-LABEL: hsub_v16i16b:
; SSE3: # %bb.0:
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,1,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,0,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,4]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,1]
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,4,5]
; SSE3-NEXT: psubw %xmm2, %xmm0
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[3,1,1,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,5,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,1]
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,5,4]
; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,2,2,0,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,6,6,4]
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,1]
; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,4,5]
; SSE3-NEXT: psubw %xmm2, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hsub_v16i16b:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubw %xmm0, %xmm0
@@ -806,11 +979,11 @@ define <16 x i16> @hsub_v16i16b(<16 x i16> %a) {
}
define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
; SSSE3-LABEL: broadcast_haddps_v4f32:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm0, %xmm0
; SSSE3-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSSE3-NEXT: retq
; SSE-LABEL: broadcast_haddps_v4f32:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm0, %xmm0
; SSE-NEXT: movsldup {{.*#+}} xmm0 = xmm0[0,0,2,2]
; SSE-NEXT: retq
;
; AVX1-LABEL: broadcast_haddps_v4f32:
; AVX1: # %bb.0:
@@ -831,10 +1004,10 @@ define <4 x float> @broadcast_haddps_v4f32(<4 x float> %a0) {
declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>)
define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
; SSSE3-LABEL: PR34724_1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm1, %xmm0
; SSSE3-NEXT: retq
; SSE-LABEL: PR34724_1:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR34724_1:
; AVX: # %bb.0:
@@ -851,10 +1024,10 @@ define <4 x float> @PR34724_1(<4 x float> %a, <4 x float> %b) {
}
define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
; SSSE3-LABEL: PR34724_2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm1, %xmm0
; SSSE3-NEXT: retq
; SSE-LABEL: PR34724_2:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: PR34724_2:
; AVX: # %bb.0:
@@ -876,11 +1049,11 @@ define <4 x float> @PR34724_2(<4 x float> %a, <4 x float> %b) {
;
define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) {
; SSSE3-LABEL: hadd_4f32_v8f32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm1, %xmm0
; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: retq
; SSE-LABEL: hadd_4f32_v8f32_shuffle:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_4f32_v8f32_shuffle:
; AVX: # %bb.0:
@@ -899,11 +1072,11 @@ define <4 x float> @hadd_4f32_v8f32_shuffle(<8 x float> %a0) {
}
define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) {
; SSSE3-LABEL: hsub_4f32_v8f32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm1, %xmm0
; SSSE3-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSSE3-NEXT: retq
; SSE-LABEL: hsub_4f32_v8f32_shuffle:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: movshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_4f32_v8f32_shuffle:
; AVX: # %bb.0:
@@ -922,6 +1095,14 @@ define <4 x float> @hsub_4f32_v8f32_shuffle(<8 x float> %a0) {
}
define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) {
; SSE3-LABEL: hadd_4i32_v8i32_shuffle:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm2
; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
; SSE3-NEXT: paddd %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hadd_4i32_v8i32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
@@ -953,6 +1134,14 @@ define <4 x i32> @hadd_4i32_v8i32_shuffle(<8 x i32> %a0) {
}
define <4 x i32> @hsub_4i32_v8i32_shuffle(<8 x i32> %a0) {
; SSE3-LABEL: hsub_4i32_v8i32_shuffle:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm0, %xmm2
; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,2],xmm1[2,2]
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
; SSE3-NEXT: paddd %xmm2, %xmm0
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hsub_4i32_v8i32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
@@ -988,12 +1177,12 @@ define <4 x i32> @hsub_4i32_v8i32_shuffle(<8 x i32> %a0) {
;
define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
; SSSE3-LABEL: hadd_4f64_v4f64_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddpd %xmm1, %xmm0
; SSSE3-NEXT: haddpd %xmm3, %xmm2
; SSSE3-NEXT: movapd %xmm2, %xmm1
; SSSE3-NEXT: retq
; SSE-LABEL: hadd_4f64_v4f64_shuffle:
; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: haddpd %xmm3, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_4f64_v4f64_shuffle:
; AVX1: # %bb.0:
@@ -1016,12 +1205,12 @@ define <4 x double> @hadd_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1)
}
define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1) {
; SSSE3-LABEL: hsub_4f64_v4f64_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: hsubpd %xmm1, %xmm0
; SSSE3-NEXT: hsubpd %xmm3, %xmm2
; SSSE3-NEXT: movapd %xmm2, %xmm1
; SSSE3-NEXT: retq
; SSE-LABEL: hsub_4f64_v4f64_shuffle:
; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: hsubpd %xmm3, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: hsub_4f64_v4f64_shuffle:
; AVX1: # %bb.0:
@@ -1044,12 +1233,12 @@ define <4 x double> @hsub_4f64_v4f64_shuffle(<4 x double> %a0, <4 x double> %a1)
}
define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
; SSSE3-LABEL: hadd_8f32_v8f32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm1, %xmm0
; SSSE3-NEXT: haddps %xmm3, %xmm2
; SSSE3-NEXT: movaps %xmm2, %xmm1
; SSSE3-NEXT: retq
; SSE-LABEL: hadd_8f32_v8f32_shuffle:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: haddps %xmm3, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: hadd_8f32_v8f32_shuffle:
; AVX1: # %bb.0:
@@ -1072,12 +1261,12 @@ define <8 x float> @hadd_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
}
define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
; SSSE3-LABEL: hsub_8f32_v8f32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: haddps %xmm1, %xmm0
; SSSE3-NEXT: haddps %xmm3, %xmm2
; SSSE3-NEXT: movaps %xmm2, %xmm1
; SSSE3-NEXT: retq
; SSE-LABEL: hsub_8f32_v8f32_shuffle:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: haddps %xmm3, %xmm2
; SSE-NEXT: movaps %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX1-LABEL: hsub_8f32_v8f32_shuffle:
; AVX1: # %bb.0:
@@ -1100,6 +1289,19 @@ define <8 x float> @hsub_8f32_v8f32_shuffle(<8 x float> %a0, <8 x float> %a1) {
}
define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
; SSE3-LABEL: hadd_8i32_v8i32_shuffle:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm2, %xmm4
; SSE3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
; SSE3-NEXT: movaps %xmm0, %xmm5
; SSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE3-NEXT: paddd %xmm4, %xmm2
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE3-NEXT: paddd %xmm5, %xmm0
; SSE3-NEXT: movdqa %xmm2, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hadd_8i32_v8i32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
@@ -1130,6 +1332,20 @@ define <8 x i32> @hadd_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
}
define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
; SSE3-LABEL: hsub_8i32_v8i32_shuffle:
; SSE3: # %bb.0:
; SSE3-NEXT: movaps %xmm2, %xmm4
; SSE3-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,2],xmm3[0,2]
; SSE3-NEXT: movaps %xmm0, %xmm5
; SSE3-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm1[0,2]
; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[1,3]
; SSE3-NEXT: psubd %xmm2, %xmm4
; SSE3-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
; SSE3-NEXT: psubd %xmm0, %xmm5
; SSE3-NEXT: movdqa %xmm5, %xmm0
; SSE3-NEXT: movdqa %xmm4, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hsub_8i32_v8i32_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
@@ -1160,6 +1376,45 @@ define <8 x i32> @hsub_8i32_v8i32_shuffle(<8 x i32> %a0, <8 x i32> %a1) {
}
define <16 x i16> @hadd_16i16_16i16_shuffle(<16 x i16> %a0, <16 x i16> %a1) {
; SSE3-LABEL: hadd_16i16_16i16_shuffle:
; SSE3: # %bb.0:
; SSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm5 = xmm2[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm4[0]
; SSE3-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[0,2,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,6,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm4[0]
; SSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7]
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE3-NEXT: paddw %xmm5, %xmm2
; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7]
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: paddw %xmm6, %xmm0
; SSE3-NEXT: movdqa %xmm2, %xmm1
; SSE3-NEXT: retq
;
; SSSE3-LABEL: hadd_16i16_16i16_shuffle:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0