; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2

; Scalarized horizontal add of two <4 x float> inputs. The result lanes are
; A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3], built one lane at a time with
; extractelement/fadd/insertelement in ascending lane order. The checks expect
; the whole sequence to be selected as one haddps (vhaddps on the AVX runs).
define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_ps_test1:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
  ret <4 x float> %vecinit13
}
|
|
|
|
|
|
|
|
; Same lane values as a 4 x float horizontal add (A[0]+A[1], A[2]+A[3],
; B[0]+B[1], B[2]+B[3]), but the lanes are inserted out of order: lane 1 first,
; then lane 0, then lane 3, then lane 2. The checks expect the insertion order
; not to matter: a single haddps (vhaddps on the AVX runs) is still selected.
define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
; SSE: # BB#0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_ps_test2:
; AVX: # BB#0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %add = fadd float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %add, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %add4 = fadd float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %add8 = fadd float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %add12 = fadd float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
  ret <4 x float> %vecinit13
}
|
|
|
|
|
|
|
|
; Scalarized horizontal subtract of two <4 x float> inputs. The result lanes
; are A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3], inserted in ascending lane
; order. The checks expect a single hsubps (vhsubps on the AVX runs).
define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
; SSE: # BB#0:
; SSE-NEXT: hsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_ps_test1:
; AVX: # BB#0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 0
  %vecext1 = extractelement <4 x float> %A, i32 1
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 0
  %vecext2 = extractelement <4 x float> %A, i32 2
  %vecext3 = extractelement <4 x float> %A, i32 3
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
  %vecext6 = extractelement <4 x float> %B, i32 0
  %vecext7 = extractelement <4 x float> %B, i32 1
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
  %vecext10 = extractelement <4 x float> %B, i32 2
  %vecext11 = extractelement <4 x float> %B, i32 3
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
  ret <4 x float> %vecinit13
}
|
|
|
|
|
|
|
|
; Same lane values as a 4 x float horizontal subtract (A[0]-A[1], A[2]-A[3],
; B[0]-B[1], B[2]-B[3]), with the lanes inserted in the order 1, 0, 3, 2.
; The checks still expect a single hsubps (vhsubps on the AVX runs).
define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
; SSE: # BB#0:
; SSE-NEXT: hsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_ps_test2:
; AVX: # BB#0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x float> %A, i32 2
  %vecext1 = extractelement <4 x float> %A, i32 3
  %sub = fsub float %vecext, %vecext1
  %vecinit = insertelement <4 x float> undef, float %sub, i32 1
  %vecext2 = extractelement <4 x float> %A, i32 0
  %vecext3 = extractelement <4 x float> %A, i32 1
  %sub4 = fsub float %vecext2, %vecext3
  %vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
  %vecext6 = extractelement <4 x float> %B, i32 2
  %vecext7 = extractelement <4 x float> %B, i32 3
  %sub8 = fsub float %vecext6, %vecext7
  %vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
  %vecext10 = extractelement <4 x float> %B, i32 0
  %vecext11 = extractelement <4 x float> %B, i32 1
  %sub12 = fsub float %vecext10, %vecext11
  %vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
  ret <4 x float> %vecinit13
}
|
|
|
|
|
|
|
|
; Scalarized integer horizontal add of two <4 x i32> inputs. The result lanes
; are A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3], inserted in ascending lane
; order. The SSE3-only run is expected to stay scalarized (movd/pshufd/addl,
; then a punpckldq/punpcklqdq rebuild), while the runs with SSSE3 or AVX
; expect a single phaddd/vphaddd.
define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
; SSE3: # BB#0:
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %eax, %edi
; SSE3-NEXT: movd %edi, %xmm0
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: movd %ecx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phadd_d_test1:
; SSSE3: # BB#0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phadd_d_test1:
; AVX: # BB#0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
  ret <4 x i32> %vecinit13
}
|
|
|
|
|
|
|
|
; Integer horizontal add of two <4 x i32> inputs with two twists relative to
; the in-order form: the lanes are inserted in the order 1, 0, 3, 2, and the
; adds that read %B take their operands commuted (B[3]+B[2], B[1]+B[0]).
; The resulting lanes are still A[0]+A[1], A[2]+A[3], B[1]+B[0], B[3]+B[2].
; The SSE3-only run is expected to stay scalarized, while the runs with SSSE3
; or AVX expect a single phaddd/vphaddd.
define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3: # BB#0:
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %esi, %xmm0
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: movd %xmm1, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %ecx, %xmm2
; SSE3-NEXT: movd %edx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phadd_d_test2:
; SSSE3: # BB#0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phadd_d_test2:
; AVX: # BB#0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %add = add i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %add4 = add i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 3
  %vecext7 = extractelement <4 x i32> %B, i32 2
  %add8 = add i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 1
  %vecext11 = extractelement <4 x i32> %B, i32 0
  %add12 = add i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
  ret <4 x i32> %vecinit13
}
|
|
|
|
|
|
|
|
; Scalarized integer horizontal subtract of two <4 x i32> inputs. The result
; lanes are A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3], inserted in ascending
; lane order. The SSE3-only run is expected to stay scalarized
; (movd/pshufd/subl, then a punpckldq/punpcklqdq rebuild), while the runs with
; SSSE3 or AVX expect a single phsubd/vphsubd.
define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
; SSE3: # BB#0:
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: subl %ecx, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: subl %edx, %ecx
; SSE3-NEXT: movd %xmm1, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: subl %esi, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: subl %edi, %esi
; SSE3-NEXT: movd %esi, %xmm0
; SSE3-NEXT: movd %edx, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %ecx, %xmm2
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phsub_d_test1:
; SSSE3: # BB#0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsub_d_test1:
; AVX: # BB#0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 0
  %vecext1 = extractelement <4 x i32> %A, i32 1
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
  %vecext2 = extractelement <4 x i32> %A, i32 2
  %vecext3 = extractelement <4 x i32> %A, i32 3
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
  %vecext6 = extractelement <4 x i32> %B, i32 0
  %vecext7 = extractelement <4 x i32> %B, i32 1
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
  %vecext10 = extractelement <4 x i32> %B, i32 2
  %vecext11 = extractelement <4 x i32> %B, i32 3
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
  ret <4 x i32> %vecinit13
}
|
|
|
|
|
|
|
|
; Same lane values as a 4 x i32 horizontal subtract (A[0]-A[1], A[2]-A[3],
; B[0]-B[1], B[2]-B[3]), with the lanes inserted in the order 1, 0, 3, 2.
; The SSE3-only run is expected to stay scalarized, while the runs with SSSE3
; or AVX expect a single phsubd/vphsubd.
define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3: # BB#0:
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: subl %ecx, %eax
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: subl %edx, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: subl %esi, %edx
; SSE3-NEXT: movd %edx, %xmm0
; SSE3-NEXT: movd %xmm1, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm1, %esi
; SSE3-NEXT: subl %esi, %edx
; SSE3-NEXT: movd %edx, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: movd %ecx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phsub_d_test2:
; SSSE3: # BB#0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsub_d_test2:
; AVX: # BB#0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x i32> %A, i32 2
  %vecext1 = extractelement <4 x i32> %A, i32 3
  %sub = sub i32 %vecext, %vecext1
  %vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
  %vecext2 = extractelement <4 x i32> %A, i32 0
  %vecext3 = extractelement <4 x i32> %A, i32 1
  %sub4 = sub i32 %vecext2, %vecext3
  %vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
  %vecext6 = extractelement <4 x i32> %B, i32 2
  %vecext7 = extractelement <4 x i32> %B, i32 3
  %sub8 = sub i32 %vecext6, %vecext7
  %vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
  %vecext10 = extractelement <4 x i32> %B, i32 0
  %vecext11 = extractelement <4 x i32> %B, i32 1
  %sub12 = sub i32 %vecext10, %vecext11
  %vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
  ret <4 x i32> %vecinit13
}
|
|
|
|
|
|
|
|
; Scalarized horizontal add of two <2 x double> inputs: lane 0 is A[0]+A[1]
; and lane 1 is B[0]+B[1]. The checks expect a single haddpd (vhaddpd on the
; AVX runs).
define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
; SSE: # BB#0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_pd_test1:
; AVX: # BB#0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}
|
|
|
|
|
|
|
|
; Horizontal add of two <2 x double> inputs with the fadd operands commuted:
; lane 0 is A[1]+A[0] and lane 1 is B[1]+B[0]. The checks expect the commuted
; form to be matched as well: a single haddpd (vhaddpd on the AVX runs).
define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
; SSE: # BB#0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_pd_test2:
; AVX: # BB#0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %A, i32 1
  %vecext1 = extractelement <2 x double> %A, i32 0
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %add, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 1
  %vecext3 = extractelement <2 x double> %B, i32 0
  %add2 = fadd double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
  ret <2 x double> %vecinit2
}
|
|
|
|
|
|
|
|
; Scalarized horizontal subtract of two <2 x double> inputs: lane 0 is
; A[0]-A[1] and lane 1 is B[0]-B[1]. The checks expect a single hsubpd
; (vhsubpd on the AVX runs).
define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
; SSE: # BB#0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_pd_test1:
; AVX: # BB#0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %A, i32 0
  %vecext1 = extractelement <2 x double> %A, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 0
  %vecext2 = extractelement <2 x double> %B, i32 0
  %vecext3 = extractelement <2 x double> %B, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
  ret <2 x double> %vecinit2
}
|
|
|
|
|
|
|
|
; Horizontal subtract of two <2 x double> inputs with the lanes built in
; reverse order: lane 1 (B[0]-B[1]) is inserted first, then lane 0
; (A[0]-A[1]). The checks still expect a single hsubpd (vhsubpd on the AVX
; runs).
define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
; SSE: # BB#0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_pd_test2:
; AVX: # BB#0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
  %vecext = extractelement <2 x double> %B, i32 0
  %vecext1 = extractelement <2 x double> %B, i32 1
  %sub = fsub double %vecext, %vecext1
  %vecinit = insertelement <2 x double> undef, double %sub, i32 1
  %vecext2 = extractelement <2 x double> %A, i32 0
  %vecext3 = extractelement <2 x double> %A, i32 1
  %sub2 = fsub double %vecext2, %vecext3
  %vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
  ret <2 x double> %vecinit2
}
|
|
|
|
|
|
|
|
; Scalarized horizontal add of two <4 x double> inputs. The result lanes are
; A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3], inserted in ascending lane
; order. The SSE checks expect two haddpd instructions on the xmm register
; pairs followed by a movapd. The AVX checks expect each ymm input to be split
; with vextractf128, reduced with a 128-bit vhaddpd, and the two halves
; recombined with vinsertf128.
define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
; SSE: # BB#0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: haddpd %xmm3, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhadd_pd_test:
; AVX: # BB#0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vhaddpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
  %vecext = extractelement <4 x double> %A, i32 0
  %vecext1 = extractelement <4 x double> %A, i32 1
  %add = fadd double %vecext, %vecext1
  %vecinit = insertelement <4 x double> undef, double %add, i32 0
  %vecext2 = extractelement <4 x double> %A, i32 2
  %vecext3 = extractelement <4 x double> %A, i32 3
  %add4 = fadd double %vecext2, %vecext3
  %vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
  %vecext6 = extractelement <4 x double> %B, i32 0
  %vecext7 = extractelement <4 x double> %B, i32 1
  %add8 = fadd double %vecext6, %vecext7
  %vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
  %vecext10 = extractelement <4 x double> %B, i32 2
  %vecext11 = extractelement <4 x double> %B, i32 3
  %add12 = fadd double %vecext10, %vecext11
  %vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
  ret <4 x double> %vecinit13
}
|
|
|
|
|
|
|
|
; avx_vhsub_pd_test: builds the <4 x double> value
; <A0-A1, A2-A3, B0-B1, B2-B3> from scalar extractelement / fsub /
; insertelement chains. Both pair differences of %A land in lanes 0-1 and both
; pair differences of %B in lanes 2-3.
; Expected codegen, as recorded in the assertions below:
;   sse runs - two 128-bit hsubpd plus a movapd to place the second result.
;   avx runs - vextractf128 of each input's upper half, two 128-bit vhsubpd,
;              then vinsertf128 to reassemble the 256-bit result.
define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-LABEL: avx_vhsub_pd_test:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: hsubpd %xmm1, %xmm0
|
|
|
|
; SSE-NEXT: hsubpd %xmm3, %xmm2
|
|
|
|
; SSE-NEXT: movapd %xmm2, %xmm1
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: avx_vhsub_pd_test:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX-NEXT: vhsubpd %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX-NEXT: vhsubpd %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX-NEXT: retq
|
2014-06-10 00:54:41 +08:00
|
|
|
; Lane 0 = A[0] - A[1]
%vecext = extractelement <4 x double> %A, i32 0
|
|
|
|
%vecext1 = extractelement <4 x double> %A, i32 1
|
|
|
|
%sub = fsub double %vecext, %vecext1
|
|
|
|
%vecinit = insertelement <4 x double> undef, double %sub, i32 0
|
|
|
|
; Lane 1 = A[2] - A[3]
%vecext2 = extractelement <4 x double> %A, i32 2
|
|
|
|
%vecext3 = extractelement <4 x double> %A, i32 3
|
|
|
|
%sub4 = fsub double %vecext2, %vecext3
|
|
|
|
%vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
|
|
|
|
; Lane 2 = B[0] - B[1]
%vecext6 = extractelement <4 x double> %B, i32 0
|
|
|
|
%vecext7 = extractelement <4 x double> %B, i32 1
|
|
|
|
%sub8 = fsub double %vecext6, %vecext7
|
|
|
|
%vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
|
|
|
|
; Lane 3 = B[2] - B[3]
%vecext10 = extractelement <4 x double> %B, i32 2
|
|
|
|
%vecext11 = extractelement <4 x double> %B, i32 3
|
|
|
|
%sub12 = fsub double %vecext10, %vecext11
|
|
|
|
%vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
|
|
|
|
ret <4 x double> %vecinit13
|
|
|
|
}
|
|
|
|
|
|
|
|
; avx2_vphadd_d_test: integer horizontal-add pattern on <8 x i32>. The result
; is <A0+A1, A2+A3, A4+A5, A6+A7, B0+B1, B2+B3, B4+B5, B6+B7>, built from
; scalar extractelement / add / insertelement chains.
; Expected codegen per run line, as recorded in the assertions below:
;   +sse3 only - no phaddd is emitted; each pair is moved to general-purpose
;                registers (movd / pshufd), summed with addl, and the vector is
;                rebuilt with punpckldq / punpcklqdq.
;   +ssse3     - two 128-bit phaddd plus a movdqa to place the second result.
;   +avx       - vextractf128 of each upper half, two 128-bit vphaddd, then
;                vinsertf128.
;   +avx2      - same shape as +avx but using vextracti128 / vinserti128.
define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-LABEL: avx2_vphadd_d_test:
|
|
|
|
; SSE3: # BB#0:
|
|
|
|
; SSE3-NEXT: movd %xmm0, %ecx
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
|
|
|
|
; SSE3-NEXT: movd %xmm4, %r8d
|
|
|
|
; SSE3-NEXT: addl %ecx, %r8d
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
|
|
|
|
; SSE3-NEXT: movd %xmm4, %edx
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %r9d
|
|
|
|
; SSE3-NEXT: addl %edx, %r9d
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %xmm1, %edx
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %esi
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: addl %edx, %esi
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %edx
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %edi
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: addl %edx, %edi
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: movd %xmm2, %eax
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %xmm0, %r10d
|
|
|
|
; SSE3-NEXT: addl %eax, %r10d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %eax
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %ecx
|
|
|
|
; SSE3-NEXT: addl %eax, %ecx
|
|
|
|
; SSE3-NEXT: movd %xmm3, %eax
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %edx
|
|
|
|
; SSE3-NEXT: addl %eax, %edx
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %xmm0, %r11d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %xmm0, %eax
|
|
|
|
; SSE3-NEXT: addl %r11d, %eax
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: movd %edi, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %esi, %xmm1
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %r9d, %xmm2
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: movd %r8d, %xmm0
|
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; SSE3-NEXT: movd %eax, %xmm1
|
|
|
|
; SSE3-NEXT: movd %edx, %xmm2
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %ecx, %xmm3
|
|
|
|
; SSE3-NEXT: movd %r10d, %xmm1
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: avx2_vphadd_d_test:
|
|
|
|
; SSSE3: # BB#0:
|
|
|
|
; SSSE3-NEXT: phaddd %xmm1, %xmm0
|
|
|
|
; SSSE3-NEXT: phaddd %xmm3, %xmm2
|
|
|
|
; SSSE3-NEXT: movdqa %xmm2, %xmm1
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: avx2_vphadd_d_test:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: avx2_vphadd_d_test:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX2-NEXT: vphaddd %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
|
|
|
|
; AVX2-NEXT: vphaddd %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: retq
|
2014-06-10 00:54:41 +08:00
|
|
|
; Lanes 0-3 are the adjacent-pair sums of %A (elements 0..7).
%vecext = extractelement <8 x i32> %A, i32 0
|
|
|
|
%vecext1 = extractelement <8 x i32> %A, i32 1
|
|
|
|
%add = add i32 %vecext, %vecext1
|
|
|
|
%vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
|
|
|
|
%vecext2 = extractelement <8 x i32> %A, i32 2
|
|
|
|
%vecext3 = extractelement <8 x i32> %A, i32 3
|
|
|
|
%add4 = add i32 %vecext2, %vecext3
|
|
|
|
%vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
|
|
|
|
%vecext6 = extractelement <8 x i32> %A, i32 4
|
|
|
|
%vecext7 = extractelement <8 x i32> %A, i32 5
|
|
|
|
%add8 = add i32 %vecext6, %vecext7
|
|
|
|
%vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
|
|
|
|
%vecext10 = extractelement <8 x i32> %A, i32 6
|
|
|
|
%vecext11 = extractelement <8 x i32> %A, i32 7
|
|
|
|
%add12 = add i32 %vecext10, %vecext11
|
|
|
|
%vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
|
|
|
|
; Lanes 4-7 are the adjacent-pair sums of %B (elements 0..7).
%vecext14 = extractelement <8 x i32> %B, i32 0
|
|
|
|
%vecext15 = extractelement <8 x i32> %B, i32 1
|
|
|
|
%add16 = add i32 %vecext14, %vecext15
|
|
|
|
%vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
|
|
|
|
%vecext18 = extractelement <8 x i32> %B, i32 2
|
|
|
|
%vecext19 = extractelement <8 x i32> %B, i32 3
|
|
|
|
%add20 = add i32 %vecext18, %vecext19
|
|
|
|
%vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
|
|
|
|
%vecext22 = extractelement <8 x i32> %B, i32 4
|
|
|
|
%vecext23 = extractelement <8 x i32> %B, i32 5
|
|
|
|
%add24 = add i32 %vecext22, %vecext23
|
|
|
|
%vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
|
|
|
|
%vecext26 = extractelement <8 x i32> %B, i32 6
|
|
|
|
%vecext27 = extractelement <8 x i32> %B, i32 7
|
|
|
|
%add28 = add i32 %vecext26, %vecext27
|
|
|
|
%vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
|
|
|
|
ret <8 x i32> %vecinit29
|
|
|
|
}
|
|
|
|
|
2014-06-11 00:42:57 +08:00
|
|
|
define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-LABEL: avx2_vphadd_w_test:
|
|
|
|
; SSE3: # BB#0:
|
|
|
|
; SSE3-NEXT: pushq %rbp
|
2016-12-01 07:48:26 +08:00
|
|
|
; SSE3-NEXT: .Lcfi0:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_def_cfa_offset 16
|
|
|
|
; SSE3-NEXT: pushq %r15
|
2016-12-01 07:48:26 +08:00
|
|
|
; SSE3-NEXT: .Lcfi1:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_def_cfa_offset 24
|
|
|
|
; SSE3-NEXT: pushq %r14
|
2016-12-01 07:48:26 +08:00
|
|
|
; SSE3-NEXT: .Lcfi2:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_def_cfa_offset 32
|
|
|
|
; SSE3-NEXT: pushq %r13
|
2016-12-01 07:48:26 +08:00
|
|
|
; SSE3-NEXT: .Lcfi3:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_def_cfa_offset 40
|
|
|
|
; SSE3-NEXT: pushq %r12
|
2016-12-01 07:48:26 +08:00
|
|
|
; SSE3-NEXT: .Lcfi4:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_def_cfa_offset 48
|
|
|
|
; SSE3-NEXT: pushq %rbx
|
2016-12-01 07:48:26 +08:00
|
|
|
; SSE3-NEXT: .Lcfi5:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_def_cfa_offset 56
|
2016-12-01 07:48:26 +08:00
|
|
|
; SSE3-NEXT: .Lcfi6:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_offset %rbx, -56
|
2016-12-01 07:48:26 +08:00
|
|
|
; SSE3-NEXT: .Lcfi7:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_offset %r12, -48
|
2016-12-01 07:48:26 +08:00
|
|
|
; SSE3-NEXT: .Lcfi8:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_offset %r13, -40
|
2016-12-01 07:48:26 +08:00
|
|
|
; SSE3-NEXT: .Lcfi9:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_offset %r14, -32
|
2016-12-01 07:48:26 +08:00
|
|
|
; SSE3-NEXT: .Lcfi10:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_offset %r15, -24
|
2016-12-01 07:48:26 +08:00
|
|
|
; SSE3-NEXT: .Lcfi11:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_offset %rbp, -16
|
|
|
|
; SSE3-NEXT: movd %xmm0, %eax
|
|
|
|
; SSE3-NEXT: pextrw $1, %xmm0, %ecx
|
|
|
|
; SSE3-NEXT: addl %eax, %ecx
|
|
|
|
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
|
|
|
|
; SSE3-NEXT: pextrw $2, %xmm0, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $3, %xmm0, %ecx
|
|
|
|
; SSE3-NEXT: addl %eax, %ecx
|
|
|
|
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $4, %xmm0, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $5, %xmm0, %r11d
|
|
|
|
; SSE3-NEXT: addl %eax, %r11d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $6, %xmm0, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $7, %xmm0, %r15d
|
|
|
|
; SSE3-NEXT: addl %eax, %r15d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: movd %xmm1, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $1, %xmm1, %r13d
|
|
|
|
; SSE3-NEXT: addl %eax, %r13d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $2, %xmm1, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $3, %xmm1, %ebx
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: addl %eax, %ebx
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $4, %xmm1, %eax
|
|
|
|
; SSE3-NEXT: pextrw $5, %xmm1, %r8d
|
|
|
|
; SSE3-NEXT: addl %eax, %r8d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $6, %xmm1, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $7, %xmm1, %esi
|
|
|
|
; SSE3-NEXT: addl %eax, %esi
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: movd %xmm2, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $1, %xmm2, %r10d
|
|
|
|
; SSE3-NEXT: addl %eax, %r10d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $2, %xmm2, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $3, %xmm2, %r14d
|
|
|
|
; SSE3-NEXT: addl %eax, %r14d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $4, %xmm2, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $5, %xmm2, %r12d
|
|
|
|
; SSE3-NEXT: addl %eax, %r12d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $6, %xmm2, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $7, %xmm2, %r9d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: addl %eax, %r9d
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %xmm3, %eax
|
|
|
|
; SSE3-NEXT: pextrw $1, %xmm3, %ebp
|
|
|
|
; SSE3-NEXT: addl %eax, %ebp
|
|
|
|
; SSE3-NEXT: pextrw $2, %xmm3, %edx
|
|
|
|
; SSE3-NEXT: pextrw $3, %xmm3, %edi
|
|
|
|
; SSE3-NEXT: addl %edx, %edi
|
|
|
|
; SSE3-NEXT: pextrw $4, %xmm3, %edx
|
|
|
|
; SSE3-NEXT: pextrw $5, %xmm3, %ecx
|
|
|
|
; SSE3-NEXT: addl %edx, %ecx
|
|
|
|
; SSE3-NEXT: pextrw $6, %xmm3, %edx
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $7, %xmm3, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: addl %edx, %eax
|
|
|
|
; SSE3-NEXT: movd %esi, %xmm8
|
|
|
|
; SSE3-NEXT: movd %r8d, %xmm3
|
|
|
|
; SSE3-NEXT: movd %ebx, %xmm9
|
|
|
|
; SSE3-NEXT: movd %r13d, %xmm4
|
|
|
|
; SSE3-NEXT: movd %r15d, %xmm10
|
|
|
|
; SSE3-NEXT: movd %r11d, %xmm7
|
|
|
|
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm11 # 4-byte Folded Reload
|
|
|
|
; SSE3-NEXT: # xmm11 = mem[0],zero,zero,zero
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
|
|
|
|
; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
|
|
|
|
; SSE3-NEXT: movd %eax, %xmm12
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %ecx, %xmm6
|
|
|
|
; SSE3-NEXT: movd %edi, %xmm13
|
|
|
|
; SSE3-NEXT: movd %ebp, %xmm5
|
|
|
|
; SSE3-NEXT: movd %r9d, %xmm14
|
|
|
|
; SSE3-NEXT: movd %r12d, %xmm2
|
|
|
|
; SSE3-NEXT: movd %r14d, %xmm15
|
|
|
|
; SSE3-NEXT: movd %r10d, %xmm1
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
|
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
|
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
|
|
|
|
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
|
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
|
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
|
|
|
|
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: popq %rbx
|
|
|
|
; SSE3-NEXT: popq %r12
|
|
|
|
; SSE3-NEXT: popq %r13
|
|
|
|
; SSE3-NEXT: popq %r14
|
|
|
|
; SSE3-NEXT: popq %r15
|
|
|
|
; SSE3-NEXT: popq %rbp
|
|
|
|
; SSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: avx2_vphadd_w_test:
|
|
|
|
; SSSE3: # BB#0:
|
|
|
|
; SSSE3-NEXT: phaddw %xmm1, %xmm0
|
|
|
|
; SSSE3-NEXT: phaddw %xmm3, %xmm2
|
|
|
|
; SSSE3-NEXT: movdqa %xmm2, %xmm1
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: avx2_vphadd_w_test:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vphaddw %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
|
|
|
|
; AVX1-NEXT: vphaddw %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: avx2_vphadd_w_test:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
|
|
|
|
; AVX2-NEXT: vphaddw %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
|
|
|
|
; AVX2-NEXT: vphaddw %xmm2, %xmm0, %xmm0
|
|
|
|
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: retq
|
2014-06-11 00:42:57 +08:00
|
|
|
%vecext = extractelement <16 x i16> %a, i32 0
|
|
|
|
%vecext1 = extractelement <16 x i16> %a, i32 1
|
|
|
|
%add = add i16 %vecext, %vecext1
|
|
|
|
%vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
|
|
|
|
%vecext4 = extractelement <16 x i16> %a, i32 2
|
|
|
|
%vecext6 = extractelement <16 x i16> %a, i32 3
|
|
|
|
%add8 = add i16 %vecext4, %vecext6
|
|
|
|
%vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
|
|
|
|
%vecext11 = extractelement <16 x i16> %a, i32 4
|
|
|
|
%vecext13 = extractelement <16 x i16> %a, i32 5
|
|
|
|
%add15 = add i16 %vecext11, %vecext13
|
|
|
|
%vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
|
|
|
|
%vecext18 = extractelement <16 x i16> %a, i32 6
|
|
|
|
%vecext20 = extractelement <16 x i16> %a, i32 7
|
|
|
|
%add22 = add i16 %vecext18, %vecext20
|
|
|
|
%vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
|
|
|
|
%vecext25 = extractelement <16 x i16> %a, i32 8
|
|
|
|
%vecext27 = extractelement <16 x i16> %a, i32 9
|
|
|
|
%add29 = add i16 %vecext25, %vecext27
|
|
|
|
%vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
|
|
|
|
%vecext32 = extractelement <16 x i16> %a, i32 10
|
|
|
|
%vecext34 = extractelement <16 x i16> %a, i32 11
|
|
|
|
%add36 = add i16 %vecext32, %vecext34
|
|
|
|
%vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
|
|
|
|
%vecext39 = extractelement <16 x i16> %a, i32 12
|
|
|
|
%vecext41 = extractelement <16 x i16> %a, i32 13
|
|
|
|
%add43 = add i16 %vecext39, %vecext41
|
|
|
|
%vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
|
|
|
|
%vecext46 = extractelement <16 x i16> %a, i32 14
|
|
|
|
%vecext48 = extractelement <16 x i16> %a, i32 15
|
|
|
|
%add50 = add i16 %vecext46, %vecext48
|
|
|
|
%vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
|
|
|
|
%vecext53 = extractelement <16 x i16> %b, i32 0
|
|
|
|
%vecext55 = extractelement <16 x i16> %b, i32 1
|
|
|
|
%add57 = add i16 %vecext53, %vecext55
|
|
|
|
%vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
|
|
|
|
%vecext60 = extractelement <16 x i16> %b, i32 2
|
|
|
|
%vecext62 = extractelement <16 x i16> %b, i32 3
|
|
|
|
%add64 = add i16 %vecext60, %vecext62
|
|
|
|
%vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
|
|
|
|
%vecext67 = extractelement <16 x i16> %b, i32 4
|
|
|
|
%vecext69 = extractelement <16 x i16> %b, i32 5
|
|
|
|
%add71 = add i16 %vecext67, %vecext69
|
|
|
|
%vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
|
|
|
|
%vecext74 = extractelement <16 x i16> %b, i32 6
|
|
|
|
%vecext76 = extractelement <16 x i16> %b, i32 7
|
|
|
|
%add78 = add i16 %vecext74, %vecext76
|
|
|
|
%vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
|
|
|
|
%vecext81 = extractelement <16 x i16> %b, i32 8
|
|
|
|
%vecext83 = extractelement <16 x i16> %b, i32 9
|
|
|
|
%add85 = add i16 %vecext81, %vecext83
|
|
|
|
%vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
|
|
|
|
%vecext88 = extractelement <16 x i16> %b, i32 10
|
|
|
|
%vecext90 = extractelement <16 x i16> %b, i32 11
|
|
|
|
%add92 = add i16 %vecext88, %vecext90
|
|
|
|
%vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
|
|
|
|
%vecext95 = extractelement <16 x i16> %b, i32 12
|
|
|
|
%vecext97 = extractelement <16 x i16> %b, i32 13
|
|
|
|
%add99 = add i16 %vecext95, %vecext97
|
|
|
|
%vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
|
|
|
|
%vecext102 = extractelement <16 x i16> %b, i32 14
|
|
|
|
%vecext104 = extractelement <16 x i16> %b, i32 15
|
|
|
|
%add106 = add i16 %vecext102, %vecext104
|
|
|
|
%vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
|
|
|
|
ret <16 x i16> %vecinit108
|
|
|
|
}
|
|
|
|
|
2014-06-11 15:57:50 +08:00
|
|
|
; Verify that we don't select horizontal subs in the following functions, because each one subtracts at least one pair of elements in reversed (odd - even) order.
|
|
|
|
|
|
|
|
define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-LABEL: not_a_hsub_1:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: movd %xmm0, %eax
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
|
|
|
|
; SSE-NEXT: movd %xmm2, %ecx
|
|
|
|
; SSE-NEXT: subl %ecx, %eax
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
|
|
|
|
; SSE-NEXT: movd %xmm2, %ecx
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
|
|
|
; SSE-NEXT: movd %xmm0, %edx
|
|
|
|
; SSE-NEXT: subl %edx, %ecx
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
|
|
|
|
; SSE-NEXT: movd %xmm0, %edx
|
|
|
|
; SSE-NEXT: movd %xmm1, %esi
|
|
|
|
; SSE-NEXT: subl %esi, %edx
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
|
|
|
|
; SSE-NEXT: movd %xmm0, %esi
|
|
|
|
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
|
|
|
|
; SSE-NEXT: movd %xmm0, %edi
|
|
|
|
; SSE-NEXT: subl %edi, %esi
|
|
|
|
; SSE-NEXT: movd %esi, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE-NEXT: movd %edx, %xmm1
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE-NEXT: movd %ecx, %xmm2
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-NEXT: movd %eax, %xmm0
|
|
|
|
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: not_a_hsub_1:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vmovd %xmm0, %eax
|
|
|
|
; AVX-NEXT: vpextrd $1, %xmm0, %ecx
|
|
|
|
; AVX-NEXT: subl %ecx, %eax
|
|
|
|
; AVX-NEXT: vpextrd $2, %xmm0, %ecx
|
|
|
|
; AVX-NEXT: vpextrd $3, %xmm0, %edx
|
|
|
|
; AVX-NEXT: subl %edx, %ecx
|
|
|
|
; AVX-NEXT: vpextrd $1, %xmm1, %edx
|
|
|
|
; AVX-NEXT: vmovd %xmm1, %esi
|
|
|
|
; AVX-NEXT: subl %esi, %edx
|
|
|
|
; AVX-NEXT: vpextrd $3, %xmm1, %esi
|
|
|
|
; AVX-NEXT: vpextrd $2, %xmm1, %edi
|
|
|
|
; AVX-NEXT: subl %edi, %esi
|
|
|
|
; AVX-NEXT: vmovd %eax, %xmm0
|
|
|
|
; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: retq
|
2014-06-11 15:57:50 +08:00
|
|
|
%vecext = extractelement <4 x i32> %A, i32 0
|
|
|
|
%vecext1 = extractelement <4 x i32> %A, i32 1
|
|
|
|
%sub = sub i32 %vecext, %vecext1
|
|
|
|
%vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
|
|
|
|
%vecext2 = extractelement <4 x i32> %A, i32 2
|
|
|
|
%vecext3 = extractelement <4 x i32> %A, i32 3
|
|
|
|
%sub4 = sub i32 %vecext2, %vecext3
|
|
|
|
%vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
|
|
|
|
%vecext6 = extractelement <4 x i32> %B, i32 1
|
|
|
|
%vecext7 = extractelement <4 x i32> %B, i32 0
|
|
|
|
%sub8 = sub i32 %vecext6, %vecext7
|
|
|
|
%vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
|
|
|
|
%vecext10 = extractelement <4 x i32> %B, i32 3
|
|
|
|
%vecext11 = extractelement <4 x i32> %B, i32 2
|
|
|
|
%sub12 = sub i32 %vecext10, %vecext11
|
|
|
|
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
|
|
|
|
ret <4 x i32> %vecinit13
|
|
|
|
}
|
|
|
|
|
|
|
|
define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-LABEL: not_a_hsub_2:
|
|
|
|
; SSE: # BB#0:
|
2016-08-22 20:56:54 +08:00
|
|
|
; SSE-NEXT: movaps %xmm0, %xmm2
|
2017-09-04 23:47:00 +08:00
|
|
|
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
|
2016-08-22 20:56:54 +08:00
|
|
|
; SSE-NEXT: movaps %xmm0, %xmm3
|
2017-09-04 23:47:00 +08:00
|
|
|
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-NEXT: subss %xmm3, %xmm2
|
|
|
|
; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
|
|
|
; SSE-NEXT: subss %xmm3, %xmm0
|
|
|
|
; SSE-NEXT: movaps %xmm1, %xmm3
|
2017-09-04 23:47:00 +08:00
|
|
|
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1,2,3]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-NEXT: movaps %xmm1, %xmm4
|
2017-09-04 23:47:00 +08:00
|
|
|
; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-NEXT: subss %xmm4, %xmm3
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
|
|
|
|
; SSE-NEXT: subss %xmm4, %xmm1
|
|
|
|
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
2017-09-18 12:40:58 +08:00
|
|
|
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: not_a_hsub_2:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
|
|
|
|
; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
|
|
|
|
; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
|
|
|
|
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
|
|
|
|
; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
|
|
|
|
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
|
|
|
|
; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
|
2016-01-16 22:03:40 +08:00
|
|
|
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
|
|
|
|
; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1
|
2016-01-16 22:03:40 +08:00
|
|
|
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
|
2017-05-19 02:50:05 +08:00
|
|
|
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
|
2016-01-16 22:03:40 +08:00
|
|
|
; AVX-NEXT: retq
|
2014-06-11 15:57:50 +08:00
|
|
|
%vecext = extractelement <4 x float> %A, i32 2
|
|
|
|
%vecext1 = extractelement <4 x float> %A, i32 3
|
|
|
|
%sub = fsub float %vecext, %vecext1
|
|
|
|
%vecinit = insertelement <4 x float> undef, float %sub, i32 1
|
|
|
|
%vecext2 = extractelement <4 x float> %A, i32 0
|
|
|
|
%vecext3 = extractelement <4 x float> %A, i32 1
|
|
|
|
%sub4 = fsub float %vecext2, %vecext3
|
|
|
|
%vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
|
|
|
|
%vecext6 = extractelement <4 x float> %B, i32 3
|
|
|
|
%vecext7 = extractelement <4 x float> %B, i32 2
|
|
|
|
%sub8 = fsub float %vecext6, %vecext7
|
|
|
|
%vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
|
|
|
|
%vecext10 = extractelement <4 x float> %B, i32 0
|
|
|
|
%vecext11 = extractelement <4 x float> %B, i32 1
|
|
|
|
%sub12 = fsub float %vecext10, %vecext11
|
|
|
|
%vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
|
|
|
|
ret <4 x float> %vecinit13
|
|
|
|
}
|
|
|
|
|
|
|
|
define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-LABEL: not_a_hsub_3:
|
|
|
|
; SSE: # BB#0:
|
2016-08-22 20:56:54 +08:00
|
|
|
; SSE-NEXT: movaps %xmm1, %xmm2
|
2017-09-04 23:47:00 +08:00
|
|
|
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-NEXT: subsd %xmm2, %xmm1
|
2016-08-22 20:56:54 +08:00
|
|
|
; SSE-NEXT: movaps %xmm0, %xmm2
|
2017-09-04 23:47:00 +08:00
|
|
|
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm2[1,1]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-NEXT: subsd %xmm0, %xmm2
|
|
|
|
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
|
|
|
|
; SSE-NEXT: movapd %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: not_a_hsub_3:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
|
|
|
|
; AVX-NEXT: vsubsd %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
|
|
|
|
; AVX-NEXT: vsubsd %xmm0, %xmm2, %xmm0
|
|
|
|
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; AVX-NEXT: retq
|
2014-06-11 15:57:50 +08:00
|
|
|
%vecext = extractelement <2 x double> %B, i32 0
|
|
|
|
%vecext1 = extractelement <2 x double> %B, i32 1
|
|
|
|
%sub = fsub double %vecext, %vecext1
|
|
|
|
%vecinit = insertelement <2 x double> undef, double %sub, i32 1
|
|
|
|
%vecext2 = extractelement <2 x double> %A, i32 1
|
|
|
|
%vecext3 = extractelement <2 x double> %A, i32 0
|
|
|
|
%sub2 = fsub double %vecext2, %vecext3
|
|
|
|
%vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
|
|
|
|
ret <2 x double> %vecinit2
|
|
|
|
}
|
2014-06-12 18:53:48 +08:00
|
|
|
|
|
|
|
; Test AVX horizontal add/sub of packed single/double precision
|
|
|
|
; floating point values from 256-bit vectors.
|
|
|
|
|
|
|
|
; Builds an <8 x float> result one lane at a time from scalar fadds of
; adjacent element pairs. Lanes 0-1 use pairs (0,1) and (2,3) of %a, lanes 2-3
; the same pairs of %b, lanes 4-5 pairs (4,5) and (6,7) of %a, and lanes 6-7
; the same pairs of %b. The assertions below expect a single ymm vhaddps under
; the AVX prefix and two xmm haddps under the SSE prefix.
define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-LABEL: avx_vhadd_ps:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: haddps %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: haddps %xmm3, %xmm1
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: avx_vhadd_ps:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX-NEXT: retq
|
2014-06-12 18:53:48 +08:00
|
|
|
; Result lanes 0-1 -- sums of pairs (0,1) and (2,3) taken from %a.
%vecext = extractelement <8 x float> %a, i32 0
|
|
|
|
%vecext1 = extractelement <8 x float> %a, i32 1
|
|
|
|
%add = fadd float %vecext, %vecext1
|
|
|
|
%vecinit = insertelement <8 x float> undef, float %add, i32 0
|
|
|
|
%vecext2 = extractelement <8 x float> %a, i32 2
|
|
|
|
%vecext3 = extractelement <8 x float> %a, i32 3
|
|
|
|
%add4 = fadd float %vecext2, %vecext3
|
|
|
|
%vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
|
|
|
|
; Result lanes 2-3 -- sums of pairs (0,1) and (2,3) taken from %b.
%vecext6 = extractelement <8 x float> %b, i32 0
|
|
|
|
%vecext7 = extractelement <8 x float> %b, i32 1
|
|
|
|
%add8 = fadd float %vecext6, %vecext7
|
|
|
|
%vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
|
|
|
|
%vecext10 = extractelement <8 x float> %b, i32 2
|
|
|
|
%vecext11 = extractelement <8 x float> %b, i32 3
|
|
|
|
%add12 = fadd float %vecext10, %vecext11
|
|
|
|
%vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
|
|
|
|
; Result lanes 4-5 -- sums of pairs (4,5) and (6,7) taken from %a.
%vecext14 = extractelement <8 x float> %a, i32 4
|
|
|
|
%vecext15 = extractelement <8 x float> %a, i32 5
|
|
|
|
%add16 = fadd float %vecext14, %vecext15
|
|
|
|
%vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
|
|
|
|
%vecext18 = extractelement <8 x float> %a, i32 6
|
|
|
|
%vecext19 = extractelement <8 x float> %a, i32 7
|
|
|
|
%add20 = fadd float %vecext18, %vecext19
|
|
|
|
%vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
|
|
|
|
; Result lanes 6-7 -- sums of pairs (4,5) and (6,7) taken from %b.
%vecext22 = extractelement <8 x float> %b, i32 4
|
|
|
|
%vecext23 = extractelement <8 x float> %b, i32 5
|
|
|
|
%add24 = fadd float %vecext22, %vecext23
|
|
|
|
%vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
|
|
|
|
%vecext26 = extractelement <8 x float> %b, i32 6
|
|
|
|
%vecext27 = extractelement <8 x float> %b, i32 7
|
|
|
|
%add28 = fadd float %vecext26, %vecext27
|
|
|
|
%vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
|
|
|
|
ret <8 x float> %vecinit29
|
|
|
|
}
|
|
|
|
|
|
|
|
; Builds an <8 x float> result one lane at a time from scalar fsubs of
; adjacent element pairs, always computing (even element) - (odd element).
; Lanes 0-1 use pairs (0,1) and (2,3) of %a, lanes 2-3 the same pairs of %b,
; lanes 4-5 pairs (4,5) and (6,7) of %a, and lanes 6-7 the same pairs of %b.
; The assertions below expect a single ymm vhsubps under the AVX prefix and
; two xmm hsubps under the SSE prefix.
define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-LABEL: avx_vhsub_ps:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: hsubps %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: hsubps %xmm3, %xmm1
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: avx_vhsub_ps:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX-NEXT: retq
|
2014-06-12 18:53:48 +08:00
|
|
|
; Result lanes 0-1 -- differences of pairs (0,1) and (2,3) taken from %a.
%vecext = extractelement <8 x float> %a, i32 0
|
|
|
|
%vecext1 = extractelement <8 x float> %a, i32 1
|
|
|
|
%sub = fsub float %vecext, %vecext1
|
|
|
|
%vecinit = insertelement <8 x float> undef, float %sub, i32 0
|
|
|
|
%vecext2 = extractelement <8 x float> %a, i32 2
|
|
|
|
%vecext3 = extractelement <8 x float> %a, i32 3
|
|
|
|
%sub4 = fsub float %vecext2, %vecext3
|
|
|
|
%vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
|
|
|
|
; Result lanes 2-3 -- differences of pairs (0,1) and (2,3) taken from %b.
%vecext6 = extractelement <8 x float> %b, i32 0
|
|
|
|
%vecext7 = extractelement <8 x float> %b, i32 1
|
|
|
|
%sub8 = fsub float %vecext6, %vecext7
|
|
|
|
%vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
|
|
|
|
%vecext10 = extractelement <8 x float> %b, i32 2
|
|
|
|
%vecext11 = extractelement <8 x float> %b, i32 3
|
|
|
|
%sub12 = fsub float %vecext10, %vecext11
|
|
|
|
%vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
|
|
|
|
; Result lanes 4-5 -- differences of pairs (4,5) and (6,7) taken from %a.
%vecext14 = extractelement <8 x float> %a, i32 4
|
|
|
|
%vecext15 = extractelement <8 x float> %a, i32 5
|
|
|
|
%sub16 = fsub float %vecext14, %vecext15
|
|
|
|
%vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
|
|
|
|
%vecext18 = extractelement <8 x float> %a, i32 6
|
|
|
|
%vecext19 = extractelement <8 x float> %a, i32 7
|
|
|
|
%sub20 = fsub float %vecext18, %vecext19
|
|
|
|
%vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
|
|
|
|
; Result lanes 6-7 -- differences of pairs (4,5) and (6,7) taken from %b.
%vecext22 = extractelement <8 x float> %b, i32 4
|
|
|
|
%vecext23 = extractelement <8 x float> %b, i32 5
|
|
|
|
%sub24 = fsub float %vecext22, %vecext23
|
|
|
|
%vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
|
|
|
|
%vecext26 = extractelement <8 x float> %b, i32 6
|
|
|
|
%vecext27 = extractelement <8 x float> %b, i32 7
|
|
|
|
%sub28 = fsub float %vecext26, %vecext27
|
|
|
|
%vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
|
|
|
|
ret <8 x float> %vecinit29
|
|
|
|
}
|
|
|
|
|
|
|
|
; Builds a <4 x double> result one lane at a time from scalar fadds of
; adjacent element pairs: lane 0 = %a[0]+%a[1], lane 1 = %b[0]+%b[1],
; lane 2 = %a[2]+%a[3], lane 3 = %b[2]+%b[3]. The assertions below expect a
; single ymm vhaddpd under the AVX prefix and two xmm haddpd under the SSE
; prefix.
define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-LABEL: avx_hadd_pd:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: haddpd %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: haddpd %xmm3, %xmm1
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: avx_hadd_pd:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX-NEXT: retq
|
2014-06-12 18:53:48 +08:00
|
|
|
; Result lane 0 -- sum of elements 0 and 1 of %a.
%vecext = extractelement <4 x double> %a, i32 0
|
|
|
|
%vecext1 = extractelement <4 x double> %a, i32 1
|
|
|
|
%add = fadd double %vecext, %vecext1
|
|
|
|
%vecinit = insertelement <4 x double> undef, double %add, i32 0
|
|
|
|
; Result lane 1 -- sum of elements 0 and 1 of %b.
%vecext2 = extractelement <4 x double> %b, i32 0
|
|
|
|
%vecext3 = extractelement <4 x double> %b, i32 1
|
|
|
|
%add4 = fadd double %vecext2, %vecext3
|
|
|
|
%vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
|
|
|
|
; Result lane 2 -- sum of elements 2 and 3 of %a.
%vecext6 = extractelement <4 x double> %a, i32 2
|
|
|
|
%vecext7 = extractelement <4 x double> %a, i32 3
|
|
|
|
%add8 = fadd double %vecext6, %vecext7
|
|
|
|
%vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
|
|
|
|
; Result lane 3 -- sum of elements 2 and 3 of %b.
%vecext10 = extractelement <4 x double> %b, i32 2
|
|
|
|
%vecext11 = extractelement <4 x double> %b, i32 3
|
|
|
|
%add12 = fadd double %vecext10, %vecext11
|
|
|
|
%vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
|
|
|
|
ret <4 x double> %vecinit13
|
|
|
|
}
|
|
|
|
|
|
|
|
; Builds a <4 x double> result one lane at a time from scalar fsubs of
; adjacent element pairs: lane 0 = %a[0]-%a[1], lane 1 = %b[0]-%b[1],
; lane 2 = %a[2]-%a[3], lane 3 = %b[2]-%b[3]. The assertions below expect a
; single ymm vhsubpd under the AVX prefix and two xmm hsubpd under the SSE
; prefix.
define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE-LABEL: avx_hsub_pd:
|
|
|
|
; SSE: # BB#0:
|
|
|
|
; SSE-NEXT: hsubpd %xmm2, %xmm0
|
|
|
|
; SSE-NEXT: hsubpd %xmm3, %xmm1
|
|
|
|
; SSE-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX-LABEL: avx_hsub_pd:
|
|
|
|
; AVX: # BB#0:
|
|
|
|
; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX-NEXT: retq
|
2014-06-12 18:53:48 +08:00
|
|
|
; Result lane 0 -- element 0 minus element 1 of %a.
%vecext = extractelement <4 x double> %a, i32 0
|
|
|
|
%vecext1 = extractelement <4 x double> %a, i32 1
|
|
|
|
%sub = fsub double %vecext, %vecext1
|
|
|
|
%vecinit = insertelement <4 x double> undef, double %sub, i32 0
|
|
|
|
; Result lane 1 -- element 0 minus element 1 of %b.
%vecext2 = extractelement <4 x double> %b, i32 0
|
|
|
|
%vecext3 = extractelement <4 x double> %b, i32 1
|
|
|
|
%sub4 = fsub double %vecext2, %vecext3
|
|
|
|
%vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
|
|
|
|
; Result lane 2 -- element 2 minus element 3 of %a.
%vecext6 = extractelement <4 x double> %a, i32 2
|
|
|
|
%vecext7 = extractelement <4 x double> %a, i32 3
|
|
|
|
%sub8 = fsub double %vecext6, %vecext7
|
|
|
|
%vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
|
|
|
|
; Result lane 3 -- element 2 minus element 3 of %b.
%vecext10 = extractelement <4 x double> %b, i32 2
|
|
|
|
%vecext11 = extractelement <4 x double> %b, i32 3
|
|
|
|
%sub12 = fsub double %vecext10, %vecext11
|
|
|
|
%vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
|
|
|
|
ret <4 x double> %vecinit13
|
|
|
|
}
|
|
|
|
|
|
|
|
; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
|
|
|
|
|
|
|
|
; Builds an <8 x i32> result one lane at a time from scalar integer adds of
; adjacent element pairs. Lanes 0-1 use pairs (0,1) and (2,3) of %a, lanes 2-3
; the same pairs of %b, lanes 4-5 pairs (4,5) and (6,7) of %a, and lanes 6-7
; the same pairs of %b.
; Expected code per check prefix, as recorded in the assertions below:
;  - SSE3 prefix   -- no horizontal-add instruction appears; elements are moved
;                     to GPRs (movd/pshufd), added with addl, and the vectors
;                     are rebuilt with punpckldq/punpcklqdq.
;  - SSSE3 prefix  -- two xmm phaddd.
;  - AVX1 prefix   -- vextractf128 of both inputs, two xmm vphaddd, then
;                     vinsertf128 to recombine.
;  - AVX2 prefix   -- a single ymm vphaddd.
define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-LABEL: avx2_hadd_d:
|
|
|
|
; SSE3: # BB#0:
|
|
|
|
; SSE3-NEXT: movd %xmm0, %ecx
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
|
|
|
|
; SSE3-NEXT: movd %xmm4, %r8d
|
|
|
|
; SSE3-NEXT: addl %ecx, %r8d
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
|
|
|
|
; SSE3-NEXT: movd %xmm4, %edx
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %r9d
|
|
|
|
; SSE3-NEXT: addl %edx, %r9d
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %xmm2, %edx
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %esi
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: addl %edx, %esi
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %edx
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %edi
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: addl %edx, %edi
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: movd %xmm1, %eax
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %xmm0, %r10d
|
|
|
|
; SSE3-NEXT: addl %eax, %r10d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %eax
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %ecx
|
|
|
|
; SSE3-NEXT: addl %eax, %ecx
|
|
|
|
; SSE3-NEXT: movd %xmm3, %eax
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
|
|
|
|
; SSE3-NEXT: movd %xmm0, %edx
|
|
|
|
; SSE3-NEXT: addl %eax, %edx
|
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %xmm0, %r11d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %xmm0, %eax
|
|
|
|
; SSE3-NEXT: addl %r11d, %eax
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: movd %edi, %xmm0
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %esi, %xmm1
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %r9d, %xmm2
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: movd %r8d, %xmm0
|
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
|
|
|
; SSE3-NEXT: movd %eax, %xmm1
|
|
|
|
; SSE3-NEXT: movd %edx, %xmm2
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movd %ecx, %xmm3
|
|
|
|
; SSE3-NEXT: movd %r10d, %xmm1
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: avx2_hadd_d:
|
|
|
|
; SSSE3: # BB#0:
|
|
|
|
; SSSE3-NEXT: phaddd %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: phaddd %xmm3, %xmm1
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: avx2_hadd_d:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
|
|
|
; AVX1-NEXT: vphaddd %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: avx2_hadd_d:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: retq
|
2014-06-12 18:53:48 +08:00
|
|
|
; Result lanes 0-1 -- sums of pairs (0,1) and (2,3) taken from %a.
%vecext = extractelement <8 x i32> %a, i32 0
|
|
|
|
%vecext1 = extractelement <8 x i32> %a, i32 1
|
|
|
|
%add = add i32 %vecext, %vecext1
|
|
|
|
%vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
|
|
|
|
%vecext2 = extractelement <8 x i32> %a, i32 2
|
|
|
|
%vecext3 = extractelement <8 x i32> %a, i32 3
|
|
|
|
%add4 = add i32 %vecext2, %vecext3
|
|
|
|
%vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
|
|
|
|
; Result lanes 2-3 -- sums of pairs (0,1) and (2,3) taken from %b.
%vecext6 = extractelement <8 x i32> %b, i32 0
|
|
|
|
%vecext7 = extractelement <8 x i32> %b, i32 1
|
|
|
|
%add8 = add i32 %vecext6, %vecext7
|
|
|
|
%vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
|
|
|
|
%vecext10 = extractelement <8 x i32> %b, i32 2
|
|
|
|
%vecext11 = extractelement <8 x i32> %b, i32 3
|
|
|
|
%add12 = add i32 %vecext10, %vecext11
|
|
|
|
%vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
|
|
|
|
; Result lanes 4-5 -- sums of pairs (4,5) and (6,7) taken from %a.
%vecext14 = extractelement <8 x i32> %a, i32 4
|
|
|
|
%vecext15 = extractelement <8 x i32> %a, i32 5
|
|
|
|
%add16 = add i32 %vecext14, %vecext15
|
|
|
|
%vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
|
|
|
|
%vecext18 = extractelement <8 x i32> %a, i32 6
|
|
|
|
%vecext19 = extractelement <8 x i32> %a, i32 7
|
|
|
|
%add20 = add i32 %vecext18, %vecext19
|
|
|
|
%vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
|
|
|
|
; Result lanes 6-7 -- sums of pairs (4,5) and (6,7) taken from %b.
%vecext22 = extractelement <8 x i32> %b, i32 4
|
|
|
|
%vecext23 = extractelement <8 x i32> %b, i32 5
|
|
|
|
%add24 = add i32 %vecext22, %vecext23
|
|
|
|
%vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
|
|
|
|
%vecext26 = extractelement <8 x i32> %b, i32 6
|
|
|
|
%vecext27 = extractelement <8 x i32> %b, i32 7
|
|
|
|
%add28 = add i32 %vecext26, %vecext27
|
|
|
|
%vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
|
|
|
|
ret <8 x i32> %vecinit29
|
|
|
|
}
|
|
|
|
|
|
|
|
define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-LABEL: avx2_hadd_w:
|
|
|
|
; SSE3: # BB#0:
|
|
|
|
; SSE3-NEXT: pushq %rbp
|
2017-06-29 21:58:24 +08:00
|
|
|
; SSE3-NEXT: .Lcfi12:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_def_cfa_offset 16
|
|
|
|
; SSE3-NEXT: pushq %r15
|
2017-06-29 21:58:24 +08:00
|
|
|
; SSE3-NEXT: .Lcfi13:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_def_cfa_offset 24
|
|
|
|
; SSE3-NEXT: pushq %r14
|
2017-06-29 21:58:24 +08:00
|
|
|
; SSE3-NEXT: .Lcfi14:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_def_cfa_offset 32
|
|
|
|
; SSE3-NEXT: pushq %r13
|
2017-06-29 21:58:24 +08:00
|
|
|
; SSE3-NEXT: .Lcfi15:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_def_cfa_offset 40
|
|
|
|
; SSE3-NEXT: pushq %r12
|
2017-06-29 21:58:24 +08:00
|
|
|
; SSE3-NEXT: .Lcfi16:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_def_cfa_offset 48
|
|
|
|
; SSE3-NEXT: pushq %rbx
|
2017-06-29 21:58:24 +08:00
|
|
|
; SSE3-NEXT: .Lcfi17:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_def_cfa_offset 56
|
2017-06-29 21:58:24 +08:00
|
|
|
; SSE3-NEXT: .Lcfi18:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_offset %rbx, -56
|
2017-06-29 21:58:24 +08:00
|
|
|
; SSE3-NEXT: .Lcfi19:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_offset %r12, -48
|
2017-06-29 21:58:24 +08:00
|
|
|
; SSE3-NEXT: .Lcfi20:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_offset %r13, -40
|
2017-06-29 21:58:24 +08:00
|
|
|
; SSE3-NEXT: .Lcfi21:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_offset %r14, -32
|
2017-06-29 21:58:24 +08:00
|
|
|
; SSE3-NEXT: .Lcfi22:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_offset %r15, -24
|
2017-06-29 21:58:24 +08:00
|
|
|
; SSE3-NEXT: .Lcfi23:
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: .cfi_offset %rbp, -16
|
|
|
|
; SSE3-NEXT: movd %xmm0, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $1, %xmm0, %r10d
|
|
|
|
; SSE3-NEXT: addl %eax, %r10d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $2, %xmm0, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $3, %xmm0, %r11d
|
|
|
|
; SSE3-NEXT: addl %eax, %r11d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $4, %xmm0, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $5, %xmm0, %r12d
|
|
|
|
; SSE3-NEXT: addl %eax, %r12d
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $6, %xmm0, %eax
|
|
|
|
; SSE3-NEXT: pextrw $7, %xmm0, %r13d
|
|
|
|
; SSE3-NEXT: addl %eax, %r13d
|
|
|
|
; SSE3-NEXT: movd %xmm1, %eax
|
|
|
|
; SSE3-NEXT: pextrw $1, %xmm1, %ecx
|
|
|
|
; SSE3-NEXT: addl %eax, %ecx
|
|
|
|
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
|
|
|
|
; SSE3-NEXT: pextrw $2, %xmm1, %eax
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $3, %xmm1, %ecx
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: addl %eax, %ecx
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 1, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
|
|
|
|
; SSE3-NEXT: pextrw $4, %xmm1, %eax
|
|
|
|
; SSE3-NEXT: pextrw $5, %xmm1, %r14d
|
|
|
|
; SSE3-NEXT: addl %eax, %r14d
|
|
|
|
; SSE3-NEXT: pextrw $6, %xmm1, %esi
|
|
|
|
; SSE3-NEXT: pextrw $7, %xmm1, %r15d
|
|
|
|
; SSE3-NEXT: addl %esi, %r15d
|
|
|
|
; SSE3-NEXT: movd %xmm2, %esi
|
|
|
|
; SSE3-NEXT: pextrw $1, %xmm2, %ebp
|
|
|
|
; SSE3-NEXT: addl %esi, %ebp
|
|
|
|
; SSE3-NEXT: pextrw $2, %xmm2, %esi
|
|
|
|
; SSE3-NEXT: pextrw $3, %xmm2, %edi
|
|
|
|
; SSE3-NEXT: addl %esi, %edi
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $4, %xmm2, %esi
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $5, %xmm2, %eax
|
|
|
|
; SSE3-NEXT: addl %esi, %eax
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $6, %xmm2, %esi
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: pextrw $7, %xmm2, %ecx
|
|
|
|
; SSE3-NEXT: addl %esi, %ecx
|
|
|
|
; SSE3-NEXT: movd %xmm3, %ebx
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: pextrw $1, %xmm3, %r9d
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: addl %ebx, %r9d
|
|
|
|
; SSE3-NEXT: pextrw $2, %xmm3, %edx
|
|
|
|
; SSE3-NEXT: pextrw $3, %xmm3, %ebx
|
|
|
|
; SSE3-NEXT: addl %edx, %ebx
|
|
|
|
; SSE3-NEXT: pextrw $4, %xmm3, %edx
|
|
|
|
; SSE3-NEXT: pextrw $5, %xmm3, %esi
|
|
|
|
; SSE3-NEXT: addl %edx, %esi
|
|
|
|
; SSE3-NEXT: pextrw $6, %xmm3, %r8d
|
|
|
|
; SSE3-NEXT: pextrw $7, %xmm3, %edx
|
|
|
|
; SSE3-NEXT: addl %r8d, %edx
|
|
|
|
; SSE3-NEXT: movd %ecx, %xmm8
|
|
|
|
; SSE3-NEXT: movd %eax, %xmm3
|
|
|
|
; SSE3-NEXT: movd %edi, %xmm9
|
|
|
|
; SSE3-NEXT: movd %ebp, %xmm4
|
|
|
|
; SSE3-NEXT: movd %r13d, %xmm10
|
|
|
|
; SSE3-NEXT: movd %r12d, %xmm7
|
|
|
|
; SSE3-NEXT: movd %r11d, %xmm11
|
|
|
|
; SSE3-NEXT: movd %r10d, %xmm0
|
|
|
|
; SSE3-NEXT: movd %edx, %xmm12
|
|
|
|
; SSE3-NEXT: movd %esi, %xmm6
|
|
|
|
; SSE3-NEXT: movd %ebx, %xmm13
|
|
|
|
; SSE3-NEXT: movd %r9d, %xmm5
|
|
|
|
; SSE3-NEXT: movd %r15d, %xmm14
|
|
|
|
; SSE3-NEXT: movd %r14d, %xmm2
|
|
|
|
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm15 # 4-byte Folded Reload
|
|
|
|
; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
|
|
|
|
; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
|
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
|
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
|
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
|
|
|
|
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
|
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
|
|
|
|
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
|
[X86][SSE] Change BUILD_VECTOR interleaving ordering to improve coalescing/combine opportunities
We currently generate BUILD_VECTOR as a tree of UNPCKL shuffles of the same type:
e.g. for v4f32:
Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
: unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
The issue is that, because we are not placing sequential vector elements together early enough, we fail to recognise many combinable patterns - consecutive scalar loads, extractions etc.
Instead, this patch unpacks progressively larger sequential vector elements together:
e.g. for v4f32:
Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
: unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
This does mean that we are creating UNPCKL shuffle of different value types, but the relevant combines that benefit from this are quite capable of handling the additional BITCASTs that are now included in the shuffle tree.
Differential Revision: https://reviews.llvm.org/D33864
llvm-svn: 304688
2017-06-05 04:12:04 +08:00
|
|
|
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
|
|
|
|
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
|
2016-01-16 22:03:40 +08:00
|
|
|
; SSE3-NEXT: popq %rbx
|
|
|
|
; SSE3-NEXT: popq %r12
|
|
|
|
; SSE3-NEXT: popq %r13
|
|
|
|
; SSE3-NEXT: popq %r14
|
|
|
|
; SSE3-NEXT: popq %r15
|
|
|
|
; SSE3-NEXT: popq %rbp
|
|
|
|
; SSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; SSSE3-LABEL: avx2_hadd_w:
|
|
|
|
; SSSE3: # BB#0:
|
|
|
|
; SSSE3-NEXT: phaddw %xmm2, %xmm0
|
|
|
|
; SSSE3-NEXT: phaddw %xmm3, %xmm1
|
|
|
|
; SSSE3-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX1-LABEL: avx2_hadd_w:
|
|
|
|
; AVX1: # BB#0:
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
|
|
|
|
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
|
|
|
|
; AVX1-NEXT: vphaddw %xmm2, %xmm3, %xmm2
|
|
|
|
; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0
|
|
|
|
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
|
|
|
; AVX1-NEXT: retq
|
|
|
|
;
|
|
|
|
; AVX2-LABEL: avx2_hadd_w:
|
|
|
|
; AVX2: # BB#0:
|
|
|
|
; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
|
|
|
|
; AVX2-NEXT: retq
|
2014-06-12 18:53:48 +08:00
|
|
|
%vecext = extractelement <16 x i16> %a, i32 0
|
|
|
|
%vecext1 = extractelement <16 x i16> %a, i32 1
|
|
|
|
%add = add i16 %vecext, %vecext1
|
|
|
|
%vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
|
|
|
|
%vecext4 = extractelement <16 x i16> %a, i32 2
|
|
|
|
%vecext6 = extractelement <16 x i16> %a, i32 3
|
|
|
|
%add8 = add i16 %vecext4, %vecext6
|
|
|
|
%vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
|
|
|
|
%vecext11 = extractelement <16 x i16> %a, i32 4
|
|
|
|
%vecext13 = extractelement <16 x i16> %a, i32 5
|
|
|
|
%add15 = add i16 %vecext11, %vecext13
|
|
|
|
%vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
|
|
|
|
%vecext18 = extractelement <16 x i16> %a, i32 6
|
|
|
|
%vecext20 = extractelement <16 x i16> %a, i32 7
|
|
|
|
%add22 = add i16 %vecext18, %vecext20
|
|
|
|
%vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
|
|
|
|
%vecext25 = extractelement <16 x i16> %a, i32 8
|
|
|
|
%vecext27 = extractelement <16 x i16> %a, i32 9
|
|
|
|
%add29 = add i16 %vecext25, %vecext27
|
|
|
|
%vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
|
|
|
|
%vecext32 = extractelement <16 x i16> %a, i32 10
|
|
|
|
%vecext34 = extractelement <16 x i16> %a, i32 11
|
|
|
|
%add36 = add i16 %vecext32, %vecext34
|
|
|
|
%vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
|
|
|
|
%vecext39 = extractelement <16 x i16> %a, i32 12
|
|
|
|
%vecext41 = extractelement <16 x i16> %a, i32 13
|
|
|
|
%add43 = add i16 %vecext39, %vecext41
|
|
|
|
%vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
|
|
|
|
%vecext46 = extractelement <16 x i16> %a, i32 14
|
|
|
|
%vecext48 = extractelement <16 x i16> %a, i32 15
|
|
|
|
%add50 = add i16 %vecext46, %vecext48
|
|
|
|
%vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
|
|
|
|
%vecext53 = extractelement <16 x i16> %b, i32 0
|
|
|
|
%vecext55 = extractelement <16 x i16> %b, i32 1
|
|
|
|
%add57 = add i16 %vecext53, %vecext55
|
|
|
|
%vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
|
|
|
|
%vecext60 = extractelement <16 x i16> %b, i32 2
|
|
|
|
%vecext62 = extractelement <16 x i16> %b, i32 3
|
|
|
|
%add64 = add i16 %vecext60, %vecext62
|
|
|
|
%vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
|
|
|
|
%vecext67 = extractelement <16 x i16> %b, i32 4
|
|
|
|
%vecext69 = extractelement <16 x i16> %b, i32 5
|
|
|
|
%add71 = add i16 %vecext67, %vecext69
|
|
|
|
%vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
|
|
|
|
%vecext74 = extractelement <16 x i16> %b, i32 6
|
|
|
|
%vecext76 = extractelement <16 x i16> %b, i32 7
|
|
|
|
%add78 = add i16 %vecext74, %vecext76
|
|
|
|
%vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
|
|
|
|
%vecext81 = extractelement <16 x i16> %b, i32 8
|
|
|
|
%vecext83 = extractelement <16 x i16> %b, i32 9
|
|
|
|
%add85 = add i16 %vecext81, %vecext83
|
|
|
|
%vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
|
|
|
|
%vecext88 = extractelement <16 x i16> %b, i32 10
|
|
|
|
%vecext90 = extractelement <16 x i16> %b, i32 11
|
|
|
|
%add92 = add i16 %vecext88, %vecext90
|
|
|
|
%vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
|
|
|
|
%vecext95 = extractelement <16 x i16> %b, i32 12
|
|
|
|
%vecext97 = extractelement <16 x i16> %b, i32 13
|
|
|
|
%add99 = add i16 %vecext95, %vecext97
|
|
|
|
%vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
|
|
|
|
%vecext102 = extractelement <16 x i16> %b, i32 14
|
|
|
|
%vecext104 = extractelement <16 x i16> %b, i32 15
|
|
|
|
%add106 = add i16 %vecext102, %vecext104
|
|
|
|
%vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
|
|
|
|
ret <16 x i16> %vecinit108
|
|
|
|
}
|