; Source: llvm-project/llvm/test/CodeGen/X86/haddsub-2.ll (1458 lines, 59 KiB, LLVM IR)

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse3,+ssse3 | FileCheck %s --check-prefix=SSE --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; Horizontal float add, lanes built in ascending order.
; Result = { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3] }, assembled one lane
; at a time with extractelement / fadd / insertelement.
; The assertions below expect the whole chain to be emitted as one haddps
; (SSE3 / SSSE3 runs) or one vhaddps (AVX / AVX2 runs).
define <4 x float> @hadd_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test1:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_ps_test1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
%vecext1 = extractelement <4 x float> %A, i32 1
%add = fadd float %vecext, %vecext1
%vecinit = insertelement <4 x float> undef, float %add, i32 0
%vecext2 = extractelement <4 x float> %A, i32 2
%vecext3 = extractelement <4 x float> %A, i32 3
%add4 = fadd float %vecext2, %vecext3
%vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 1
%vecext6 = extractelement <4 x float> %B, i32 0
%vecext7 = extractelement <4 x float> %B, i32 1
%add8 = fadd float %vecext6, %vecext7
%vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 2
%vecext10 = extractelement <4 x float> %B, i32 2
%vecext11 = extractelement <4 x float> %B, i32 3
%add12 = fadd float %vecext10, %vecext11
%vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 3
ret <4 x float> %vecinit13
}
; Horizontal float add, lanes built out of order.
; Produces the same vector as a pairwise add of A then B
; ({ A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3] }), but the insertelement
; chain fills the lanes in the order 1, 0, 3, 2.
; The assertions below still expect a single haddps / vhaddps.
define <4 x float> @hadd_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hadd_ps_test2:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_ps_test2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; Lane 1 (A[2]+A[3]) is inserted before lane 0 (A[0]+A[1]).
%vecext = extractelement <4 x float> %A, i32 2
%vecext1 = extractelement <4 x float> %A, i32 3
%add = fadd float %vecext, %vecext1
%vecinit = insertelement <4 x float> undef, float %add, i32 1
%vecext2 = extractelement <4 x float> %A, i32 0
%vecext3 = extractelement <4 x float> %A, i32 1
%add4 = fadd float %vecext2, %vecext3
%vecinit5 = insertelement <4 x float> %vecinit, float %add4, i32 0
; Lane 3 (B[2]+B[3]) is inserted before lane 2 (B[0]+B[1]).
%vecext6 = extractelement <4 x float> %B, i32 2
%vecext7 = extractelement <4 x float> %B, i32 3
%add8 = fadd float %vecext6, %vecext7
%vecinit9 = insertelement <4 x float> %vecinit5, float %add8, i32 3
%vecext10 = extractelement <4 x float> %B, i32 0
%vecext11 = extractelement <4 x float> %B, i32 1
%add12 = fadd float %vecext10, %vecext11
%vecinit13 = insertelement <4 x float> %vecinit9, float %add12, i32 2
ret <4 x float> %vecinit13
}
; Horizontal float subtract, lanes built in ascending order.
; Result = { A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3] }, assembled one lane
; at a time with extractelement / fsub / insertelement.
; The assertions below expect one hsubps (SSE3 / SSSE3 runs) or one vhsubps
; (AVX / AVX2 runs).
define <4 x float> @hsub_ps_test1(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test1:
; SSE: # %bb.0:
; SSE-NEXT: hsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_ps_test1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x float> %A, i32 0
%vecext1 = extractelement <4 x float> %A, i32 1
%sub = fsub float %vecext, %vecext1
%vecinit = insertelement <4 x float> undef, float %sub, i32 0
%vecext2 = extractelement <4 x float> %A, i32 2
%vecext3 = extractelement <4 x float> %A, i32 3
%sub4 = fsub float %vecext2, %vecext3
%vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 1
%vecext6 = extractelement <4 x float> %B, i32 0
%vecext7 = extractelement <4 x float> %B, i32 1
%sub8 = fsub float %vecext6, %vecext7
%vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 2
%vecext10 = extractelement <4 x float> %B, i32 2
%vecext11 = extractelement <4 x float> %B, i32 3
%sub12 = fsub float %vecext10, %vecext11
%vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 3
ret <4 x float> %vecinit13
}
; Horizontal float subtract, lanes built out of order.
; Produces { A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3] }, but the
; insertelement chain fills the lanes in the order 1, 0, 3, 2.
; Each fsub keeps the even element as the left operand.
; The assertions below still expect a single hsubps / vhsubps.
define <4 x float> @hsub_ps_test2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: hsub_ps_test2:
; SSE: # %bb.0:
; SSE-NEXT: hsubps %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_ps_test2:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; Lane 1 (A[2]-A[3]) is inserted before lane 0 (A[0]-A[1]).
%vecext = extractelement <4 x float> %A, i32 2
%vecext1 = extractelement <4 x float> %A, i32 3
%sub = fsub float %vecext, %vecext1
%vecinit = insertelement <4 x float> undef, float %sub, i32 1
%vecext2 = extractelement <4 x float> %A, i32 0
%vecext3 = extractelement <4 x float> %A, i32 1
%sub4 = fsub float %vecext2, %vecext3
%vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
; Lane 3 (B[2]-B[3]) is inserted before lane 2 (B[0]-B[1]).
%vecext6 = extractelement <4 x float> %B, i32 2
%vecext7 = extractelement <4 x float> %B, i32 3
%sub8 = fsub float %vecext6, %vecext7
%vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
%vecext10 = extractelement <4 x float> %B, i32 0
%vecext11 = extractelement <4 x float> %B, i32 1
%sub12 = fsub float %vecext10, %vecext11
%vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
ret <4 x float> %vecinit13
}
; Horizontal i32 add, lanes built in ascending order.
; Result = { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3] }.
; Expected output differs per run line:
;   * +sse3 without +ssse3 - the expected code is scalar: elements are moved
;     to GPRs (pshufd + movd), added with addl, and repacked with
;     punpckldq / punpcklqdq.
;   * +ssse3 - a single phaddd.
;   * +avx / +avx2 - a single vphaddd.
define <4 x i32> @phadd_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test1:
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %eax, %edi
; SSE3-NEXT: movd %edi, %xmm0
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: movd %ecx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phadd_d_test1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phadd_d_test1:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 0
%vecext1 = extractelement <4 x i32> %A, i32 1
%add = add i32 %vecext, %vecext1
%vecinit = insertelement <4 x i32> undef, i32 %add, i32 0
%vecext2 = extractelement <4 x i32> %A, i32 2
%vecext3 = extractelement <4 x i32> %A, i32 3
%add4 = add i32 %vecext2, %vecext3
%vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 1
%vecext6 = extractelement <4 x i32> %B, i32 0
%vecext7 = extractelement <4 x i32> %B, i32 1
%add8 = add i32 %vecext6, %vecext7
%vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 2
%vecext10 = extractelement <4 x i32> %B, i32 2
%vecext11 = extractelement <4 x i32> %B, i32 3
%add12 = add i32 %vecext10, %vecext11
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 3
ret <4 x i32> %vecinit13
}
; Horizontal i32 add, lanes built out of order and with commuted operands.
; The resulting vector holds the pairwise sums of A in lanes 0-1 and of B in
; lanes 2-3, but:
;   * the insertelement chain fills lanes in the order 1, 0, 3, 2, and
;   * the two additions on B take the odd element first (B[3]+B[2], B[1]+B[0]).
; Expected output per run line: scalar pshufd/movd/addl/punpck sequence with
; +sse3 only, a single phaddd with +ssse3, a single vphaddd with +avx / +avx2.
define <4 x i32> @phadd_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phadd_d_test2:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %esi, %xmm0
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: movd %xmm1, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %ecx, %xmm2
; SSE3-NEXT: movd %edx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phadd_d_test2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phadd_d_test2:
; AVX: # %bb.0:
; AVX-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; Lane 1 (A[2]+A[3]) is inserted before lane 0 (A[0]+A[1]).
%vecext = extractelement <4 x i32> %A, i32 2
%vecext1 = extractelement <4 x i32> %A, i32 3
%add = add i32 %vecext, %vecext1
%vecinit = insertelement <4 x i32> undef, i32 %add, i32 1
%vecext2 = extractelement <4 x i32> %A, i32 0
%vecext3 = extractelement <4 x i32> %A, i32 1
%add4 = add i32 %vecext2, %vecext3
%vecinit5 = insertelement <4 x i32> %vecinit, i32 %add4, i32 0
; B sums use commuted operands: the higher-indexed element is extracted first.
%vecext6 = extractelement <4 x i32> %B, i32 3
%vecext7 = extractelement <4 x i32> %B, i32 2
%add8 = add i32 %vecext6, %vecext7
%vecinit9 = insertelement <4 x i32> %vecinit5, i32 %add8, i32 3
%vecext10 = extractelement <4 x i32> %B, i32 1
%vecext11 = extractelement <4 x i32> %B, i32 0
%add12 = add i32 %vecext10, %vecext11
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %add12, i32 2
ret <4 x i32> %vecinit13
}
; Horizontal i32 subtract, lanes built in ascending order.
; Result = { A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3] }.
; Expected output differs per run line:
;   * +sse3 without +ssse3 - the expected code is scalar: elements are moved
;     to GPRs (pshufd + movd), subtracted with subl, and repacked with
;     punpckldq / punpcklqdq.
;   * +ssse3 - a single phsubd.
;   * +avx / +avx2 - a single vphsubd.
define <4 x i32> @phsub_d_test1(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test1:
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: subl %ecx, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: subl %edx, %ecx
; SSE3-NEXT: movd %xmm1, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: subl %esi, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: subl %edi, %esi
; SSE3-NEXT: movd %esi, %xmm0
; SSE3-NEXT: movd %edx, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %ecx, %xmm2
; SSE3-NEXT: movd %eax, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phsub_d_test1:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsub_d_test1:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <4 x i32> %A, i32 0
%vecext1 = extractelement <4 x i32> %A, i32 1
%sub = sub i32 %vecext, %vecext1
%vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
%vecext2 = extractelement <4 x i32> %A, i32 2
%vecext3 = extractelement <4 x i32> %A, i32 3
%sub4 = sub i32 %vecext2, %vecext3
%vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
%vecext6 = extractelement <4 x i32> %B, i32 0
%vecext7 = extractelement <4 x i32> %B, i32 1
%sub8 = sub i32 %vecext6, %vecext7
%vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
%vecext10 = extractelement <4 x i32> %B, i32 2
%vecext11 = extractelement <4 x i32> %B, i32 3
%sub12 = sub i32 %vecext10, %vecext11
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
ret <4 x i32> %vecinit13
}
; Horizontal i32 subtract, lanes built out of order.
; Produces { A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3] }, but the
; insertelement chain fills the lanes in the order 1, 0, 3, 2.
; Each sub keeps the even element as the left operand.
; Expected output per run line: scalar pshufd/movd/subl/punpck sequence with
; +sse3 only, a single phsubd with +ssse3, a single vphsubd with +avx / +avx2.
define <4 x i32> @phsub_d_test2(<4 x i32> %A, <4 x i32> %B) {
; SSE3-LABEL: phsub_d_test2:
; SSE3: # %bb.0:
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm2, %ecx
; SSE3-NEXT: subl %ecx, %eax
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: subl %edx, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: subl %esi, %edx
; SSE3-NEXT: movd %edx, %xmm0
; SSE3-NEXT: movd %xmm1, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm1, %esi
; SSE3-NEXT: subl %esi, %edx
; SSE3-NEXT: movd %edx, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %eax, %xmm2
; SSE3-NEXT: movd %ecx, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: phsub_d_test2:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phsubd %xmm1, %xmm0
; SSSE3-NEXT: retq
;
; AVX-LABEL: phsub_d_test2:
; AVX: # %bb.0:
; AVX-NEXT: vphsubd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
; Lane 1 (A[2]-A[3]) is inserted before lane 0 (A[0]-A[1]).
%vecext = extractelement <4 x i32> %A, i32 2
%vecext1 = extractelement <4 x i32> %A, i32 3
%sub = sub i32 %vecext, %vecext1
%vecinit = insertelement <4 x i32> undef, i32 %sub, i32 1
%vecext2 = extractelement <4 x i32> %A, i32 0
%vecext3 = extractelement <4 x i32> %A, i32 1
%sub4 = sub i32 %vecext2, %vecext3
%vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 0
; Lane 3 (B[2]-B[3]) is inserted before lane 2 (B[0]-B[1]).
%vecext6 = extractelement <4 x i32> %B, i32 2
%vecext7 = extractelement <4 x i32> %B, i32 3
%sub8 = sub i32 %vecext6, %vecext7
%vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 3
%vecext10 = extractelement <4 x i32> %B, i32 0
%vecext11 = extractelement <4 x i32> %B, i32 1
%sub12 = sub i32 %vecext10, %vecext11
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 2
ret <4 x i32> %vecinit13
}
; Horizontal double add on 128-bit vectors.
; Result = { A[0]+A[1], B[0]+B[1] }, built lane 0 first, then lane 1.
; The assertions below expect a single haddpd (SSE3 / SSSE3 runs) or a single
; vhaddpd (AVX / AVX2 runs).
define <2 x double> @hadd_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test1:
; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_pd_test1:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <2 x double> %A, i32 0
%vecext1 = extractelement <2 x double> %A, i32 1
%add = fadd double %vecext, %vecext1
%vecinit = insertelement <2 x double> undef, double %add, i32 0
%vecext2 = extractelement <2 x double> %B, i32 0
%vecext3 = extractelement <2 x double> %B, i32 1
%add2 = fadd double %vecext2, %vecext3
%vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
ret <2 x double> %vecinit2
}
; Horizontal double add with commuted fadd operands.
; Result = { A[1]+A[0], B[1]+B[0] }: each fadd takes element 1 as its left
; operand and element 0 as its right operand.
; The assertions below still expect a single haddpd / vhaddpd.
define <2 x double> @hadd_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hadd_pd_test2:
; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hadd_pd_test2:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <2 x double> %A, i32 1
%vecext1 = extractelement <2 x double> %A, i32 0
%add = fadd double %vecext, %vecext1
%vecinit = insertelement <2 x double> undef, double %add, i32 0
%vecext2 = extractelement <2 x double> %B, i32 1
%vecext3 = extractelement <2 x double> %B, i32 0
%add2 = fadd double %vecext2, %vecext3
%vecinit2 = insertelement <2 x double> %vecinit, double %add2, i32 1
ret <2 x double> %vecinit2
}
; Horizontal double subtract on 128-bit vectors.
; Result = { A[0]-A[1], B[0]-B[1] }, built lane 0 first, then lane 1.
; The assertions below expect a single hsubpd (SSE3 / SSSE3 runs) or a single
; vhsubpd (AVX / AVX2 runs).
define <2 x double> @hsub_pd_test1(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test1:
; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_pd_test1:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <2 x double> %A, i32 0
%vecext1 = extractelement <2 x double> %A, i32 1
%sub = fsub double %vecext, %vecext1
%vecinit = insertelement <2 x double> undef, double %sub, i32 0
%vecext2 = extractelement <2 x double> %B, i32 0
%vecext3 = extractelement <2 x double> %B, i32 1
%sub2 = fsub double %vecext2, %vecext3
%vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 1
ret <2 x double> %vecinit2
}
; Horizontal double subtract, lanes built out of order.
; Result = { A[0]-A[1], B[0]-B[1] }, but lane 1 (from B) is inserted first and
; lane 0 (from A) second.
; The assertions below still expect a single hsubpd / vhsubpd.
define <2 x double> @hsub_pd_test2(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: hsub_pd_test2:
; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: hsub_pd_test2:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%vecext = extractelement <2 x double> %B, i32 0
%vecext1 = extractelement <2 x double> %B, i32 1
%sub = fsub double %vecext, %vecext1
%vecinit = insertelement <2 x double> undef, double %sub, i32 1
%vecext2 = extractelement <2 x double> %A, i32 0
%vecext3 = extractelement <2 x double> %A, i32 1
%sub2 = fsub double %vecext2, %vecext3
%vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
ret <2 x double> %vecinit2
}
; Horizontal double add on 256-bit vectors.
; Result = { A[0]+A[1], A[2]+A[3], B[0]+B[1], B[2]+B[3] }: the low half of the
; result comes entirely from A and the high half entirely from B.
; Expected output per run line:
;   * SSE3 / SSSE3 runs - two haddpd, one per 128-bit half of the result,
;     followed by a movapd that places the second half in xmm1.
;   * AVX / AVX2 runs - the upper 128 bits of each input are extracted
;     (vextractf128), a 128-bit vhaddpd is applied to each input's two halves,
;     and the halves are rejoined with vinsertf128.
define <4 x double> @avx_vhadd_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhadd_pd_test:
; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm1, %xmm0
; SSE-NEXT: haddpd %xmm3, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhadd_pd_test:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vhaddpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vhaddpd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <4 x double> %A, i32 0
%vecext1 = extractelement <4 x double> %A, i32 1
%add = fadd double %vecext, %vecext1
%vecinit = insertelement <4 x double> undef, double %add, i32 0
%vecext2 = extractelement <4 x double> %A, i32 2
%vecext3 = extractelement <4 x double> %A, i32 3
%add4 = fadd double %vecext2, %vecext3
%vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
%vecext6 = extractelement <4 x double> %B, i32 0
%vecext7 = extractelement <4 x double> %B, i32 1
%add8 = fadd double %vecext6, %vecext7
%vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
%vecext10 = extractelement <4 x double> %B, i32 2
%vecext11 = extractelement <4 x double> %B, i32 3
%add12 = fadd double %vecext10, %vecext11
%vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
ret <4 x double> %vecinit13
}
; Horizontal double subtract on 256-bit vectors.
; Result = { A[0]-A[1], A[2]-A[3], B[0]-B[1], B[2]-B[3] }: the low half of the
; result comes entirely from A and the high half entirely from B.
; Expected output per run line:
;   * SSE3 / SSSE3 runs - two hsubpd, one per 128-bit half of the result,
;     followed by a movapd that places the second half in xmm1.
;   * AVX / AVX2 runs - the upper 128 bits of each input are extracted
;     (vextractf128), a 128-bit vhsubpd is applied to each input's two halves,
;     and the halves are rejoined with vinsertf128.
define <4 x double> @avx_vhsub_pd_test(<4 x double> %A, <4 x double> %B) {
; SSE-LABEL: avx_vhsub_pd_test:
; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm1, %xmm0
; SSE-NEXT: hsubpd %xmm3, %xmm2
; SSE-NEXT: movapd %xmm2, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhsub_pd_test:
; AVX: # %bb.0:
; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX-NEXT: vhsubpd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX-NEXT: vhsubpd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <4 x double> %A, i32 0
%vecext1 = extractelement <4 x double> %A, i32 1
%sub = fsub double %vecext, %vecext1
%vecinit = insertelement <4 x double> undef, double %sub, i32 0
%vecext2 = extractelement <4 x double> %A, i32 2
%vecext3 = extractelement <4 x double> %A, i32 3
%sub4 = fsub double %vecext2, %vecext3
%vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
%vecext6 = extractelement <4 x double> %B, i32 0
%vecext7 = extractelement <4 x double> %B, i32 1
%sub8 = fsub double %vecext6, %vecext7
%vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
%vecext10 = extractelement <4 x double> %B, i32 2
%vecext11 = extractelement <4 x double> %B, i32 3
%sub12 = fsub double %vecext10, %vecext11
%vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
ret <4 x double> %vecinit13
}
; Horizontal i32 add on 256-bit vectors.
; Result lanes 0-3 are the pairwise sums of A (A[0]+A[1], A[2]+A[3],
; A[4]+A[5], A[6]+A[7]); lanes 4-7 are the pairwise sums of B in the same
; pattern. Lanes are inserted in ascending order.
; Expected output per run line:
;   * +sse3 without +ssse3 - the expected code is scalar: pshufd/movd to GPRs,
;     addl, then movd/punpckldq/punpcklqdq to rebuild both 128-bit halves.
;   * +ssse3 - two phaddd plus a movdqa that places the second half in xmm1.
;   * +avx   - vextractf128 on each input, two 128-bit vphaddd, vinsertf128.
;   * +avx2  - same shape as +avx but with vextracti128 / vinserti128.
define <8 x i32> @avx2_vphadd_d_test(<8 x i32> %A, <8 x i32> %B) {
; SSE3-LABEL: avx2_vphadd_d_test:
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm4, %r8d
; SSE3-NEXT: addl %ecx, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm4, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %r9d
; SSE3-NEXT: addl %edx, %r9d
; SSE3-NEXT: movd %xmm1, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %edx, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %edx, %edi
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %r10d
; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movd %xmm3, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %r11d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: addl %r11d, %eax
; SSE3-NEXT: movd %edi, %xmm0
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %r9d, %xmm2
; SSE3-NEXT: movd %r8d, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT: movd %ecx, %xmm3
; SSE3-NEXT: movd %r10d, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_vphadd_d_test:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm1, %xmm0
; SSSE3-NEXT: phaddd %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_vphadd_d_test:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vphaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_vphadd_d_test:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vphaddd %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vphaddd %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
; Lanes 0-3: pairwise sums over all eight elements of A.
%vecext = extractelement <8 x i32> %A, i32 0
%vecext1 = extractelement <8 x i32> %A, i32 1
%add = add i32 %vecext, %vecext1
%vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
%vecext2 = extractelement <8 x i32> %A, i32 2
%vecext3 = extractelement <8 x i32> %A, i32 3
%add4 = add i32 %vecext2, %vecext3
%vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
%vecext6 = extractelement <8 x i32> %A, i32 4
%vecext7 = extractelement <8 x i32> %A, i32 5
%add8 = add i32 %vecext6, %vecext7
%vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
%vecext10 = extractelement <8 x i32> %A, i32 6
%vecext11 = extractelement <8 x i32> %A, i32 7
%add12 = add i32 %vecext10, %vecext11
%vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
; Lanes 4-7: pairwise sums over all eight elements of B.
%vecext14 = extractelement <8 x i32> %B, i32 0
%vecext15 = extractelement <8 x i32> %B, i32 1
%add16 = add i32 %vecext14, %vecext15
%vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
%vecext18 = extractelement <8 x i32> %B, i32 2
%vecext19 = extractelement <8 x i32> %B, i32 3
%add20 = add i32 %vecext18, %vecext19
%vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
%vecext22 = extractelement <8 x i32> %B, i32 4
%vecext23 = extractelement <8 x i32> %B, i32 5
%add24 = add i32 %vecext22, %vecext23
%vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
%vecext26 = extractelement <8 x i32> %B, i32 6
%vecext27 = extractelement <8 x i32> %B, i32 7
%add28 = add i32 %vecext26, %vecext27
%vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
ret <8 x i32> %vecinit29
}
define <16 x i16> @avx2_vphadd_w_test(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_vphadd_w_test:
; SSE3: # %bb.0:
; SSE3-NEXT: pushq %rbp
; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: pushq %r15
; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: pushq %r14
; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: pushq %r13
; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: pushq %r12
; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: .cfi_def_cfa_offset 56
; SSE3-NEXT: .cfi_offset %rbx, -56
; SSE3-NEXT: .cfi_offset %r12, -48
; SSE3-NEXT: .cfi_offset %r13, -40
; SSE3-NEXT: .cfi_offset %r14, -32
; SSE3-NEXT: .cfi_offset %r15, -24
; SSE3-NEXT: .cfi_offset %rbp, -16
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pextrw $1, %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT: pextrw $2, %xmm0, %eax
; SSE3-NEXT: pextrw $3, %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT: pextrw $4, %xmm0, %eax
; SSE3-NEXT: pextrw $5, %xmm0, %r11d
; SSE3-NEXT: addl %eax, %r11d
; SSE3-NEXT: pextrw $6, %xmm0, %eax
; SSE3-NEXT: pextrw $7, %xmm0, %r15d
; SSE3-NEXT: addl %eax, %r15d
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pextrw $1, %xmm1, %r13d
; SSE3-NEXT: addl %eax, %r13d
; SSE3-NEXT: pextrw $2, %xmm1, %eax
; SSE3-NEXT: pextrw $3, %xmm1, %ebx
; SSE3-NEXT: addl %eax, %ebx
; SSE3-NEXT: pextrw $4, %xmm1, %eax
; SSE3-NEXT: pextrw $5, %xmm1, %r8d
; SSE3-NEXT: addl %eax, %r8d
; SSE3-NEXT: pextrw $6, %xmm1, %eax
; SSE3-NEXT: pextrw $7, %xmm1, %esi
; SSE3-NEXT: addl %eax, %esi
; SSE3-NEXT: movd %xmm2, %eax
; SSE3-NEXT: pextrw $1, %xmm2, %r10d
; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pextrw $2, %xmm2, %eax
; SSE3-NEXT: pextrw $3, %xmm2, %r14d
; SSE3-NEXT: addl %eax, %r14d
; SSE3-NEXT: pextrw $4, %xmm2, %eax
; SSE3-NEXT: pextrw $5, %xmm2, %r12d
; SSE3-NEXT: addl %eax, %r12d
; SSE3-NEXT: pextrw $6, %xmm2, %eax
; SSE3-NEXT: pextrw $7, %xmm2, %r9d
; SSE3-NEXT: addl %eax, %r9d
; SSE3-NEXT: movd %xmm3, %eax
; SSE3-NEXT: pextrw $1, %xmm3, %ebp
; SSE3-NEXT: addl %eax, %ebp
; SSE3-NEXT: pextrw $2, %xmm3, %edx
; SSE3-NEXT: pextrw $3, %xmm3, %edi
; SSE3-NEXT: addl %edx, %edi
; SSE3-NEXT: pextrw $4, %xmm3, %edx
; SSE3-NEXT: pextrw $5, %xmm3, %ecx
; SSE3-NEXT: addl %edx, %ecx
; SSE3-NEXT: pextrw $6, %xmm3, %edx
; SSE3-NEXT: pextrw $7, %xmm3, %eax
; SSE3-NEXT: addl %edx, %eax
; SSE3-NEXT: movd %esi, %xmm8
; SSE3-NEXT: movd %r8d, %xmm3
; SSE3-NEXT: movd %ebx, %xmm9
; SSE3-NEXT: movd %r13d, %xmm4
; SSE3-NEXT: movd %r15d, %xmm10
; SSE3-NEXT: movd %r11d, %xmm7
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm11 # 4-byte Folded Reload
; SSE3-NEXT: # xmm11 = mem[0],zero,zero,zero
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm0 # 4-byte Folded Reload
; SSE3-NEXT: # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT: movd %eax, %xmm12
; SSE3-NEXT: movd %ecx, %xmm6
; SSE3-NEXT: movd %edi, %xmm13
; SSE3-NEXT: movd %ebp, %xmm5
; SSE3-NEXT: movd %r9d, %xmm14
; SSE3-NEXT: movd %r12d, %xmm2
; SSE3-NEXT: movd %r14d, %xmm15
; SSE3-NEXT: movd %r10d, %xmm1
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: popq %rbx
; [blame annotation] "Correct dwarf unwind information in function epilogue" - patch by Violeta Vukobrat, https://reviews.llvm.org/D42848, llvm-svn 330706, 2018-04-24 18:32:08 +08:00
; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: popq %r12
; [blame annotation] "Correct dwarf unwind information in function epilogue" - patch by Violeta Vukobrat, https://reviews.llvm.org/D42848, llvm-svn 330706, 2018-04-24 18:32:08 +08:00
; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: popq %r13
; [blame annotation] "Correct dwarf unwind information in function epilogue" - patch by Violeta Vukobrat, https://reviews.llvm.org/D42848, llvm-svn 330706, 2018-04-24 18:32:08 +08:00
; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: popq %r14
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: popq %r15
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: popq %rbp
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; SSE3-NEXT: .cfi_def_cfa_offset 8
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_vphadd_w_test:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm1, %xmm0
; SSSE3-NEXT: phaddw %xmm3, %xmm2
; SSSE3-NEXT: movdqa %xmm2, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_vphadd_w_test:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vphaddw %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vphaddw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_vphadd_w_test:
; AVX2: # %bb.0:
; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2-NEXT: vphaddw %xmm2, %xmm0, %xmm0
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%vecext = extractelement <16 x i16> %a, i32 0
%vecext1 = extractelement <16 x i16> %a, i32 1
%add = add i16 %vecext, %vecext1
%vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
%vecext4 = extractelement <16 x i16> %a, i32 2
%vecext6 = extractelement <16 x i16> %a, i32 3
%add8 = add i16 %vecext4, %vecext6
%vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
%vecext11 = extractelement <16 x i16> %a, i32 4
%vecext13 = extractelement <16 x i16> %a, i32 5
%add15 = add i16 %vecext11, %vecext13
%vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
%vecext18 = extractelement <16 x i16> %a, i32 6
%vecext20 = extractelement <16 x i16> %a, i32 7
%add22 = add i16 %vecext18, %vecext20
%vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
%vecext25 = extractelement <16 x i16> %a, i32 8
%vecext27 = extractelement <16 x i16> %a, i32 9
%add29 = add i16 %vecext25, %vecext27
%vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 4
%vecext32 = extractelement <16 x i16> %a, i32 10
%vecext34 = extractelement <16 x i16> %a, i32 11
%add36 = add i16 %vecext32, %vecext34
%vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 5
%vecext39 = extractelement <16 x i16> %a, i32 12
%vecext41 = extractelement <16 x i16> %a, i32 13
%add43 = add i16 %vecext39, %vecext41
%vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 6
%vecext46 = extractelement <16 x i16> %a, i32 14
%vecext48 = extractelement <16 x i16> %a, i32 15
%add50 = add i16 %vecext46, %vecext48
%vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 7
%vecext53 = extractelement <16 x i16> %b, i32 0
%vecext55 = extractelement <16 x i16> %b, i32 1
%add57 = add i16 %vecext53, %vecext55
%vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 8
%vecext60 = extractelement <16 x i16> %b, i32 2
%vecext62 = extractelement <16 x i16> %b, i32 3
%add64 = add i16 %vecext60, %vecext62
%vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 9
%vecext67 = extractelement <16 x i16> %b, i32 4
%vecext69 = extractelement <16 x i16> %b, i32 5
%add71 = add i16 %vecext67, %vecext69
%vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 10
%vecext74 = extractelement <16 x i16> %b, i32 6
%vecext76 = extractelement <16 x i16> %b, i32 7
%add78 = add i16 %vecext74, %vecext76
%vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 11
%vecext81 = extractelement <16 x i16> %b, i32 8
%vecext83 = extractelement <16 x i16> %b, i32 9
%add85 = add i16 %vecext81, %vecext83
%vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
%vecext88 = extractelement <16 x i16> %b, i32 10
%vecext90 = extractelement <16 x i16> %b, i32 11
%add92 = add i16 %vecext88, %vecext90
%vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
%vecext95 = extractelement <16 x i16> %b, i32 12
%vecext97 = extractelement <16 x i16> %b, i32 13
%add99 = add i16 %vecext95, %vecext97
%vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
%vecext102 = extractelement <16 x i16> %b, i32 14
%vecext104 = extractelement <16 x i16> %b, i32 15
%add106 = add i16 %vecext102, %vecext104
%vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
ret <16 x i16> %vecinit108
}
; Verify that we don't select horizontal subs in the following functions.
; not_a_hsub_1 is a negative test for integer horizontal-subtract matching on
; <4 x i32>. The insertelement chain below builds these result lanes
;   lane 0 = A[0] - A[1]
;   lane 1 = A[2] - A[3]
;   lane 2 = B[1] - B[0]   (higher index minus lower index)
;   lane 3 = B[3] - B[2]   (higher index minus lower index)
; The two B lanes subtract in the opposite order from the two A lanes. The
; assertions expect scalar extract / subl / insert code for every run line and
; contain no packed horizontal-subtract instruction.
define <4 x i32> @not_a_hsub_1(<4 x i32> %A, <4 x i32> %B) {
; SSE-LABEL: not_a_hsub_1:
; SSE: # %bb.0:
; SSE-NEXT: movd %xmm0, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
; SSE-NEXT: movd %xmm2, %ecx
; SSE-NEXT: subl %ecx, %eax
; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
; SSE-NEXT: movd %xmm2, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE-NEXT: movd %xmm0, %edx
; SSE-NEXT: subl %edx, %ecx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE-NEXT: movd %xmm0, %edx
; SSE-NEXT: movd %xmm1, %esi
; SSE-NEXT: subl %esi, %edx
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE-NEXT: movd %xmm0, %esi
; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE-NEXT: movd %xmm0, %edi
; SSE-NEXT: subl %edi, %esi
; SSE-NEXT: movd %esi, %xmm0
; SSE-NEXT: movd %edx, %xmm1
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT: movd %ecx, %xmm2
; SSE-NEXT: movd %eax, %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovd %xmm0, %eax
; AVX-NEXT: vpextrd $1, %xmm0, %ecx
; AVX-NEXT: subl %ecx, %eax
; AVX-NEXT: vpextrd $2, %xmm0, %ecx
; AVX-NEXT: vpextrd $3, %xmm0, %edx
; AVX-NEXT: subl %edx, %ecx
; AVX-NEXT: vpextrd $1, %xmm1, %edx
; AVX-NEXT: vmovd %xmm1, %esi
; AVX-NEXT: subl %esi, %edx
; AVX-NEXT: vpextrd $3, %xmm1, %esi
; AVX-NEXT: vpextrd $2, %xmm1, %edi
; AVX-NEXT: subl %edi, %esi
; AVX-NEXT: vmovd %eax, %xmm0
; AVX-NEXT: vpinsrd $1, %ecx, %xmm0, %xmm0
; AVX-NEXT: vpinsrd $2, %edx, %xmm0, %xmm0
; AVX-NEXT: vpinsrd $3, %esi, %xmm0, %xmm0
; AVX-NEXT: retq
; Lanes 0 and 1 take the A pairs as (even index) - (odd index).
%vecext = extractelement <4 x i32> %A, i32 0
%vecext1 = extractelement <4 x i32> %A, i32 1
%sub = sub i32 %vecext, %vecext1
%vecinit = insertelement <4 x i32> undef, i32 %sub, i32 0
%vecext2 = extractelement <4 x i32> %A, i32 2
%vecext3 = extractelement <4 x i32> %A, i32 3
%sub4 = sub i32 %vecext2, %vecext3
%vecinit5 = insertelement <4 x i32> %vecinit, i32 %sub4, i32 1
; Lanes 2 and 3 take the B pairs with the operands swapped, i.e.
; (odd index) - (even index).
%vecext6 = extractelement <4 x i32> %B, i32 1
%vecext7 = extractelement <4 x i32> %B, i32 0
%sub8 = sub i32 %vecext6, %vecext7
%vecinit9 = insertelement <4 x i32> %vecinit5, i32 %sub8, i32 2
%vecext10 = extractelement <4 x i32> %B, i32 3
%vecext11 = extractelement <4 x i32> %B, i32 2
%sub12 = sub i32 %vecext10, %vecext11
%vecinit13 = insertelement <4 x i32> %vecinit9, i32 %sub12, i32 3
ret <4 x i32> %vecinit13
}
; not_a_hsub_2 is a negative test for floating-point horizontal-subtract
; matching on <4 x float>. The lanes are inserted out of order (1, 0, 3, 2)
; and end up as
;   lane 0 = A[0] - A[1]
;   lane 1 = A[2] - A[3]
;   lane 2 = B[0] - B[1]
;   lane 3 = B[3] - B[2]   (higher index minus lower index)
; Only lane 3 has its operands swapped relative to the other three. The
; assertions expect four scalar subss / vsubss operations followed by
; shuffles or inserts, and no packed horizontal-subtract instruction.
define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) {
; SSE-LABEL: not_a_hsub_2:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT: movaps %xmm0, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm0[2,3]
; SSE-NEXT: subss %xmm3, %xmm2
; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; SSE-NEXT: subss %xmm3, %xmm0
; SSE-NEXT: movaps %xmm1, %xmm3
; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,1],xmm1[2,3]
; SSE-NEXT: movaps %xmm1, %xmm4
; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm1[1],xmm4[1]
; SSE-NEXT: subss %xmm4, %xmm3
; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSE-NEXT: subss %xmm4, %xmm1
; SSE-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_2:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm0[3,1,2,3]
; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2
; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3]
; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0]
; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
; AVX-NEXT: retq
; A[2] - A[3] is written to lane 1 first; lane 0 follows.
%vecext = extractelement <4 x float> %A, i32 2
%vecext1 = extractelement <4 x float> %A, i32 3
%sub = fsub float %vecext, %vecext1
%vecinit = insertelement <4 x float> undef, float %sub, i32 1
%vecext2 = extractelement <4 x float> %A, i32 0
%vecext3 = extractelement <4 x float> %A, i32 1
%sub4 = fsub float %vecext2, %vecext3
%vecinit5 = insertelement <4 x float> %vecinit, float %sub4, i32 0
; Operands swapped here, so %sub8 = B[3] - B[2], written to lane 3.
%vecext6 = extractelement <4 x float> %B, i32 3
%vecext7 = extractelement <4 x float> %B, i32 2
%sub8 = fsub float %vecext6, %vecext7
%vecinit9 = insertelement <4 x float> %vecinit5, float %sub8, i32 3
%vecext10 = extractelement <4 x float> %B, i32 0
%vecext11 = extractelement <4 x float> %B, i32 1
%sub12 = fsub float %vecext10, %vecext11
%vecinit13 = insertelement <4 x float> %vecinit9, float %sub12, i32 2
ret <4 x float> %vecinit13
}
; not_a_hsub_3 is a negative test for floating-point horizontal-subtract
; matching on <2 x double>. The result is
;   lane 0 = A[1] - A[0]   (higher index minus lower index)
;   lane 1 = B[0] - B[1]
; The two lanes subtract in opposite orders. The assertions expect two scalar
; subsd / vsubsd operations joined by an unpcklpd / vunpcklpd, and no packed
; horizontal-subtract instruction.
define <2 x double> @not_a_hsub_3(<2 x double> %A, <2 x double> %B) {
; SSE-LABEL: not_a_hsub_3:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm1, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm1[1],xmm2[1]
; SSE-NEXT: subsd %xmm2, %xmm1
; SSE-NEXT: movaps %xmm0, %xmm2
; SSE-NEXT: movhlps {{.*#+}} xmm2 = xmm0[1],xmm2[1]
; SSE-NEXT: subsd %xmm0, %xmm2
; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
; SSE-NEXT: movapd %xmm2, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: not_a_hsub_3:
; AVX: # %bb.0:
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX-NEXT: vsubsd %xmm2, %xmm1, %xmm1
; AVX-NEXT: vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
; AVX-NEXT: vsubsd %xmm0, %xmm2, %xmm0
; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: retq
; B[0] - B[1] is written to lane 1.
%vecext = extractelement <2 x double> %B, i32 0
%vecext1 = extractelement <2 x double> %B, i32 1
%sub = fsub double %vecext, %vecext1
%vecinit = insertelement <2 x double> undef, double %sub, i32 1
; Operands swapped here, so %sub2 = A[1] - A[0], written to lane 0.
%vecext2 = extractelement <2 x double> %A, i32 1
%vecext3 = extractelement <2 x double> %A, i32 0
%sub2 = fsub double %vecext2, %vecext3
%vecinit2 = insertelement <2 x double> %vecinit, double %sub2, i32 0
ret <2 x double> %vecinit2
}
; Test AVX horizontal add/sub of packed single/double precision
; floating point values from 256-bit vectors.
; avx_vhadd_ps is a positive test. It builds an <8 x float> from adjacent-pair
; sums, interleaving the two inputs per 128-bit half
;   lanes 0-1 = a[0]+a[1], a[2]+a[3]
;   lanes 2-3 = b[0]+b[1], b[2]+b[3]
;   lanes 4-5 = a[4]+a[5], a[6]+a[7]
;   lanes 6-7 = b[4]+b[5], b[6]+b[7]
; The assertions expect one 256-bit vhaddps for the AVX run lines and two
; 128-bit haddps instructions for the SSE run lines.
define <8 x float> @avx_vhadd_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhadd_ps:
; SSE: # %bb.0:
; SSE-NEXT: haddps %xmm2, %xmm0
; SSE-NEXT: haddps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhadd_ps:
; AVX: # %bb.0:
; AVX-NEXT: vhaddps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%add = fadd float %vecext, %vecext1
%vecinit = insertelement <8 x float> undef, float %add, i32 0
%vecext2 = extractelement <8 x float> %a, i32 2
%vecext3 = extractelement <8 x float> %a, i32 3
%add4 = fadd float %vecext2, %vecext3
%vecinit5 = insertelement <8 x float> %vecinit, float %add4, i32 1
%vecext6 = extractelement <8 x float> %b, i32 0
%vecext7 = extractelement <8 x float> %b, i32 1
%add8 = fadd float %vecext6, %vecext7
%vecinit9 = insertelement <8 x float> %vecinit5, float %add8, i32 2
%vecext10 = extractelement <8 x float> %b, i32 2
%vecext11 = extractelement <8 x float> %b, i32 3
%add12 = fadd float %vecext10, %vecext11
%vecinit13 = insertelement <8 x float> %vecinit9, float %add12, i32 3
%vecext14 = extractelement <8 x float> %a, i32 4
%vecext15 = extractelement <8 x float> %a, i32 5
%add16 = fadd float %vecext14, %vecext15
%vecinit17 = insertelement <8 x float> %vecinit13, float %add16, i32 4
%vecext18 = extractelement <8 x float> %a, i32 6
%vecext19 = extractelement <8 x float> %a, i32 7
%add20 = fadd float %vecext18, %vecext19
%vecinit21 = insertelement <8 x float> %vecinit17, float %add20, i32 5
%vecext22 = extractelement <8 x float> %b, i32 4
%vecext23 = extractelement <8 x float> %b, i32 5
%add24 = fadd float %vecext22, %vecext23
%vecinit25 = insertelement <8 x float> %vecinit21, float %add24, i32 6
%vecext26 = extractelement <8 x float> %b, i32 6
%vecext27 = extractelement <8 x float> %b, i32 7
%add28 = fadd float %vecext26, %vecext27
%vecinit29 = insertelement <8 x float> %vecinit25, float %add28, i32 7
ret <8 x float> %vecinit29
}
; avx_vhsub_ps is a positive test. It builds an <8 x float> from adjacent-pair
; differences, always (even index) - (odd index), interleaving the two inputs
; per 128-bit half
;   lanes 0-1 = a[0]-a[1], a[2]-a[3]
;   lanes 2-3 = b[0]-b[1], b[2]-b[3]
;   lanes 4-5 = a[4]-a[5], a[6]-a[7]
;   lanes 6-7 = b[4]-b[5], b[6]-b[7]
; The assertions expect one 256-bit vhsubps for the AVX run lines and two
; 128-bit hsubps instructions for the SSE run lines.
define <8 x float> @avx_vhsub_ps(<8 x float> %a, <8 x float> %b) {
; SSE-LABEL: avx_vhsub_ps:
; SSE: # %bb.0:
; SSE-NEXT: hsubps %xmm2, %xmm0
; SSE-NEXT: hsubps %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_vhsub_ps:
; AVX: # %bb.0:
; AVX-NEXT: vhsubps %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <8 x float> %a, i32 0
%vecext1 = extractelement <8 x float> %a, i32 1
%sub = fsub float %vecext, %vecext1
%vecinit = insertelement <8 x float> undef, float %sub, i32 0
%vecext2 = extractelement <8 x float> %a, i32 2
%vecext3 = extractelement <8 x float> %a, i32 3
%sub4 = fsub float %vecext2, %vecext3
%vecinit5 = insertelement <8 x float> %vecinit, float %sub4, i32 1
%vecext6 = extractelement <8 x float> %b, i32 0
%vecext7 = extractelement <8 x float> %b, i32 1
%sub8 = fsub float %vecext6, %vecext7
%vecinit9 = insertelement <8 x float> %vecinit5, float %sub8, i32 2
%vecext10 = extractelement <8 x float> %b, i32 2
%vecext11 = extractelement <8 x float> %b, i32 3
%sub12 = fsub float %vecext10, %vecext11
%vecinit13 = insertelement <8 x float> %vecinit9, float %sub12, i32 3
%vecext14 = extractelement <8 x float> %a, i32 4
%vecext15 = extractelement <8 x float> %a, i32 5
%sub16 = fsub float %vecext14, %vecext15
%vecinit17 = insertelement <8 x float> %vecinit13, float %sub16, i32 4
%vecext18 = extractelement <8 x float> %a, i32 6
%vecext19 = extractelement <8 x float> %a, i32 7
%sub20 = fsub float %vecext18, %vecext19
%vecinit21 = insertelement <8 x float> %vecinit17, float %sub20, i32 5
%vecext22 = extractelement <8 x float> %b, i32 4
%vecext23 = extractelement <8 x float> %b, i32 5
%sub24 = fsub float %vecext22, %vecext23
%vecinit25 = insertelement <8 x float> %vecinit21, float %sub24, i32 6
%vecext26 = extractelement <8 x float> %b, i32 6
%vecext27 = extractelement <8 x float> %b, i32 7
%sub28 = fsub float %vecext26, %vecext27
%vecinit29 = insertelement <8 x float> %vecinit25, float %sub28, i32 7
ret <8 x float> %vecinit29
}
; avx_hadd_pd is a positive test. It builds a <4 x double> from adjacent-pair
; sums, alternating between the two inputs
;   lane 0 = a[0]+a[1]
;   lane 1 = b[0]+b[1]
;   lane 2 = a[2]+a[3]
;   lane 3 = b[2]+b[3]
; The assertions expect one 256-bit vhaddpd for the AVX run lines and two
; 128-bit haddpd instructions for the SSE run lines.
define <4 x double> @avx_hadd_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hadd_pd:
; SSE: # %bb.0:
; SSE-NEXT: haddpd %xmm2, %xmm0
; SSE-NEXT: haddpd %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_hadd_pd:
; AVX: # %bb.0:
; AVX-NEXT: vhaddpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <4 x double> %a, i32 0
%vecext1 = extractelement <4 x double> %a, i32 1
%add = fadd double %vecext, %vecext1
%vecinit = insertelement <4 x double> undef, double %add, i32 0
%vecext2 = extractelement <4 x double> %b, i32 0
%vecext3 = extractelement <4 x double> %b, i32 1
%add4 = fadd double %vecext2, %vecext3
%vecinit5 = insertelement <4 x double> %vecinit, double %add4, i32 1
%vecext6 = extractelement <4 x double> %a, i32 2
%vecext7 = extractelement <4 x double> %a, i32 3
%add8 = fadd double %vecext6, %vecext7
%vecinit9 = insertelement <4 x double> %vecinit5, double %add8, i32 2
%vecext10 = extractelement <4 x double> %b, i32 2
%vecext11 = extractelement <4 x double> %b, i32 3
%add12 = fadd double %vecext10, %vecext11
%vecinit13 = insertelement <4 x double> %vecinit9, double %add12, i32 3
ret <4 x double> %vecinit13
}
; avx_hsub_pd is a positive test. It builds a <4 x double> from adjacent-pair
; differences, always (even index) - (odd index), alternating between the two
; inputs
;   lane 0 = a[0]-a[1]
;   lane 1 = b[0]-b[1]
;   lane 2 = a[2]-a[3]
;   lane 3 = b[2]-b[3]
; The assertions expect one 256-bit vhsubpd for the AVX run lines and two
; 128-bit hsubpd instructions for the SSE run lines.
define <4 x double> @avx_hsub_pd(<4 x double> %a, <4 x double> %b) {
; SSE-LABEL: avx_hsub_pd:
; SSE: # %bb.0:
; SSE-NEXT: hsubpd %xmm2, %xmm0
; SSE-NEXT: hsubpd %xmm3, %xmm1
; SSE-NEXT: retq
;
; AVX-LABEL: avx_hsub_pd:
; AVX: # %bb.0:
; AVX-NEXT: vhsubpd %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
%vecext = extractelement <4 x double> %a, i32 0
%vecext1 = extractelement <4 x double> %a, i32 1
%sub = fsub double %vecext, %vecext1
%vecinit = insertelement <4 x double> undef, double %sub, i32 0
%vecext2 = extractelement <4 x double> %b, i32 0
%vecext3 = extractelement <4 x double> %b, i32 1
%sub4 = fsub double %vecext2, %vecext3
%vecinit5 = insertelement <4 x double> %vecinit, double %sub4, i32 1
%vecext6 = extractelement <4 x double> %a, i32 2
%vecext7 = extractelement <4 x double> %a, i32 3
%sub8 = fsub double %vecext6, %vecext7
%vecinit9 = insertelement <4 x double> %vecinit5, double %sub8, i32 2
%vecext10 = extractelement <4 x double> %b, i32 2
%vecext11 = extractelement <4 x double> %b, i32 3
%sub12 = fsub double %vecext10, %vecext11
%vecinit13 = insertelement <4 x double> %vecinit9, double %sub12, i32 3
ret <4 x double> %vecinit13
}
; Test AVX2 horizontal add of packed integer values from 256-bit vectors.
; avx2_hadd_d is a positive test. It builds an <8 x i32> from adjacent-pair
; sums, interleaving the two inputs per 128-bit half
;   lanes 0-1 = a[0]+a[1], a[2]+a[3]
;   lanes 2-3 = b[0]+b[1], b[2]+b[3]
;   lanes 4-5 = a[4]+a[5], a[6]+a[7]
;   lanes 6-7 = b[4]+b[5], b[6]+b[7]
; The expected code differs for each of the four run lines.
;   * SSE3 without SSSE3 is fully scalarized into movd / pshufd / addl and
;     rebuilt with punpckldq / punpcklqdq.
;   * SSSE3 uses two 128-bit phaddd instructions.
;   * AVX1 extracts the upper halves, uses two 128-bit vphaddd instructions,
;     and recombines them with vinsertf128.
;   * AVX2 uses a single 256-bit vphaddd.
define <8 x i32> @avx2_hadd_d(<8 x i32> %a, <8 x i32> %b) {
; SSE3-LABEL: avx2_hadd_d:
; SSE3: # %bb.0:
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[1,1,2,3]
; SSE3-NEXT: movd %xmm4, %r8d
; SSE3-NEXT: addl %ecx, %r8d
; SSE3-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
; SSE3-NEXT: movd %xmm4, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %r9d
; SSE3-NEXT: addl %edx, %r9d
; SSE3-NEXT: movd %xmm2, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %esi
; SSE3-NEXT: addl %edx, %esi
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm2[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %edi
; SSE3-NEXT: addl %edx, %edi
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %r10d
; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm1[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movd %xmm3, %eax
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[1,1,2,3]
; SSE3-NEXT: movd %xmm0, %edx
; SSE3-NEXT: addl %eax, %edx
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
; SSE3-NEXT: movd %xmm0, %r11d
; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[3,1,2,3]
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: addl %r11d, %eax
; SSE3-NEXT: movd %edi, %xmm0
; SSE3-NEXT: movd %esi, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT: movd %r9d, %xmm2
; SSE3-NEXT: movd %r8d, %xmm0
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT: movd %eax, %xmm1
; SSE3-NEXT: movd %edx, %xmm2
; SSE3-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT: movd %ecx, %xmm3
; SSE3-NEXT: movd %r10d, %xmm1
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_hadd_d:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddd %xmm2, %xmm0
; SSSE3-NEXT: phaddd %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_hadd_d:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vphaddd %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_hadd_d:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddd %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%vecext = extractelement <8 x i32> %a, i32 0
%vecext1 = extractelement <8 x i32> %a, i32 1
%add = add i32 %vecext, %vecext1
%vecinit = insertelement <8 x i32> undef, i32 %add, i32 0
%vecext2 = extractelement <8 x i32> %a, i32 2
%vecext3 = extractelement <8 x i32> %a, i32 3
%add4 = add i32 %vecext2, %vecext3
%vecinit5 = insertelement <8 x i32> %vecinit, i32 %add4, i32 1
%vecext6 = extractelement <8 x i32> %b, i32 0
%vecext7 = extractelement <8 x i32> %b, i32 1
%add8 = add i32 %vecext6, %vecext7
%vecinit9 = insertelement <8 x i32> %vecinit5, i32 %add8, i32 2
%vecext10 = extractelement <8 x i32> %b, i32 2
%vecext11 = extractelement <8 x i32> %b, i32 3
%add12 = add i32 %vecext10, %vecext11
%vecinit13 = insertelement <8 x i32> %vecinit9, i32 %add12, i32 3
%vecext14 = extractelement <8 x i32> %a, i32 4
%vecext15 = extractelement <8 x i32> %a, i32 5
%add16 = add i32 %vecext14, %vecext15
%vecinit17 = insertelement <8 x i32> %vecinit13, i32 %add16, i32 4
%vecext18 = extractelement <8 x i32> %a, i32 6
%vecext19 = extractelement <8 x i32> %a, i32 7
%add20 = add i32 %vecext18, %vecext19
%vecinit21 = insertelement <8 x i32> %vecinit17, i32 %add20, i32 5
%vecext22 = extractelement <8 x i32> %b, i32 4
%vecext23 = extractelement <8 x i32> %b, i32 5
%add24 = add i32 %vecext22, %vecext23
%vecinit25 = insertelement <8 x i32> %vecinit21, i32 %add24, i32 6
%vecext26 = extractelement <8 x i32> %b, i32 6
%vecext27 = extractelement <8 x i32> %b, i32 7
%add28 = add i32 %vecext26, %vecext27
%vecinit29 = insertelement <8 x i32> %vecinit25, i32 %add28, i32 7
ret <8 x i32> %vecinit29
}
define <16 x i16> @avx2_hadd_w(<16 x i16> %a, <16 x i16> %b) {
; SSE3-LABEL: avx2_hadd_w:
; SSE3: # %bb.0:
; SSE3-NEXT: pushq %rbp
; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: pushq %r15
; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: pushq %r14
; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: pushq %r13
; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: pushq %r12
; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: pushq %rbx
; SSE3-NEXT: .cfi_def_cfa_offset 56
; SSE3-NEXT: .cfi_offset %rbx, -56
; SSE3-NEXT: .cfi_offset %r12, -48
; SSE3-NEXT: .cfi_offset %r13, -40
; SSE3-NEXT: .cfi_offset %r14, -32
; SSE3-NEXT: .cfi_offset %r15, -24
; SSE3-NEXT: .cfi_offset %rbp, -16
; SSE3-NEXT: movd %xmm0, %eax
; SSE3-NEXT: pextrw $1, %xmm0, %r10d
; SSE3-NEXT: addl %eax, %r10d
; SSE3-NEXT: pextrw $2, %xmm0, %eax
; SSE3-NEXT: pextrw $3, %xmm0, %r11d
; SSE3-NEXT: addl %eax, %r11d
; SSE3-NEXT: pextrw $4, %xmm0, %eax
; SSE3-NEXT: pextrw $5, %xmm0, %r12d
; SSE3-NEXT: addl %eax, %r12d
; SSE3-NEXT: pextrw $6, %xmm0, %eax
; SSE3-NEXT: pextrw $7, %xmm0, %r13d
; SSE3-NEXT: addl %eax, %r13d
; SSE3-NEXT: movd %xmm1, %eax
; SSE3-NEXT: pextrw $1, %xmm1, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT: pextrw $2, %xmm1, %eax
; SSE3-NEXT: pextrw $3, %xmm1, %ecx
; SSE3-NEXT: addl %eax, %ecx
; SSE3-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) # 4-byte Spill
; SSE3-NEXT: pextrw $4, %xmm1, %eax
; SSE3-NEXT: pextrw $5, %xmm1, %r14d
; SSE3-NEXT: addl %eax, %r14d
; SSE3-NEXT: pextrw $6, %xmm1, %esi
; SSE3-NEXT: pextrw $7, %xmm1, %r15d
; SSE3-NEXT: addl %esi, %r15d
; SSE3-NEXT: movd %xmm2, %esi
; SSE3-NEXT: pextrw $1, %xmm2, %ebp
; SSE3-NEXT: addl %esi, %ebp
; SSE3-NEXT: pextrw $2, %xmm2, %esi
; SSE3-NEXT: pextrw $3, %xmm2, %edi
; SSE3-NEXT: addl %esi, %edi
; SSE3-NEXT: pextrw $4, %xmm2, %esi
; SSE3-NEXT: pextrw $5, %xmm2, %eax
; SSE3-NEXT: addl %esi, %eax
; SSE3-NEXT: pextrw $6, %xmm2, %esi
; SSE3-NEXT: pextrw $7, %xmm2, %ecx
; SSE3-NEXT: addl %esi, %ecx
; SSE3-NEXT: movd %xmm3, %ebx
; SSE3-NEXT: pextrw $1, %xmm3, %r9d
; SSE3-NEXT: addl %ebx, %r9d
; SSE3-NEXT: pextrw $2, %xmm3, %edx
; SSE3-NEXT: pextrw $3, %xmm3, %ebx
; SSE3-NEXT: addl %edx, %ebx
; SSE3-NEXT: pextrw $4, %xmm3, %edx
; SSE3-NEXT: pextrw $5, %xmm3, %esi
; SSE3-NEXT: addl %edx, %esi
; SSE3-NEXT: pextrw $6, %xmm3, %r8d
; SSE3-NEXT: pextrw $7, %xmm3, %edx
; SSE3-NEXT: addl %r8d, %edx
; SSE3-NEXT: movd %ecx, %xmm8
; SSE3-NEXT: movd %eax, %xmm3
; SSE3-NEXT: movd %edi, %xmm9
; SSE3-NEXT: movd %ebp, %xmm4
; SSE3-NEXT: movd %r13d, %xmm10
; SSE3-NEXT: movd %r12d, %xmm7
; SSE3-NEXT: movd %r11d, %xmm11
; SSE3-NEXT: movd %r10d, %xmm0
; SSE3-NEXT: movd %edx, %xmm12
; SSE3-NEXT: movd %esi, %xmm6
; SSE3-NEXT: movd %ebx, %xmm13
; SSE3-NEXT: movd %r9d, %xmm5
; SSE3-NEXT: movd %r15d, %xmm14
; SSE3-NEXT: movd %r14d, %xmm2
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm15 # 4-byte Folded Reload
; SSE3-NEXT: # xmm15 = mem[0],zero,zero,zero
; SSE3-NEXT: movd -{{[0-9]+}}(%rsp), %xmm1 # 4-byte Folded Reload
; SSE3-NEXT: # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm9[0],xmm4[1],xmm9[1],xmm4[2],xmm9[2],xmm4[3],xmm9[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm13[0],xmm5[1],xmm13[1],xmm5[2],xmm13[2],xmm5[3],xmm13[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm14[0],xmm2[1],xmm14[1],xmm2[2],xmm14[2],xmm2[3],xmm14[3]
; SSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
; SSE3-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE3-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT: popq %rbx
Correct dwarf unwind information in function epilogue This patch aims to provide correct dwarf unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate cfa offset and cfa register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about cfa offset and cfa register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine cfa offset and register are valid at its entry and exit * verifies that outgoing cfa offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong cfa offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; SSE3-NEXT: .cfi_def_cfa_offset 48
; SSE3-NEXT: popq %r12
Correct DWARF unwind information in function epilogue. This patch aims to provide correct DWARF unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate CFA offset and CFA register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about CFA offset and CFA register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine which CFA offset and register are valid at its entry and exit * verifies that outgoing CFA offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA. Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong CFA offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; SSE3-NEXT: .cfi_def_cfa_offset 40
; SSE3-NEXT: popq %r13
Correct DWARF unwind information in function epilogue. This patch aims to provide correct DWARF unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate CFA offset and CFA register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about CFA offset and CFA register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine which CFA offset and register are valid at its entry and exit * verifies that outgoing CFA offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA. Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong CFA offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; SSE3-NEXT: .cfi_def_cfa_offset 32
; SSE3-NEXT: popq %r14
Correct DWARF unwind information in function epilogue. This patch aims to provide correct DWARF unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate CFA offset and CFA register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about CFA offset and CFA register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine which CFA offset and register are valid at its entry and exit * verifies that outgoing CFA offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA. Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong CFA offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; SSE3-NEXT: .cfi_def_cfa_offset 24
; SSE3-NEXT: popq %r15
Correct DWARF unwind information in function epilogue. This patch aims to provide correct DWARF unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate CFA offset and CFA register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about CFA offset and CFA register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine which CFA offset and register are valid at its entry and exit * verifies that outgoing CFA offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA. Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong CFA offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; SSE3-NEXT: .cfi_def_cfa_offset 16
; SSE3-NEXT: popq %rbp
Correct DWARF unwind information in function epilogue. This patch aims to provide correct DWARF unwind information in function epilogue for X86. It consists of two parts. The first part inserts CFI instructions that set appropriate CFA offset and CFA register in emitEpilogue() in X86FrameLowering. This part is X86 specific. The second part is platform independent and ensures that: * CFI instructions do not affect code generation (they are not counted as instructions when tail duplicating or tail merging) * Unwind information remains correct when a function is modified by different passes. This is done in a late pass by analyzing information about CFA offset and CFA register in BBs and inserting additional CFI directives where necessary. Added CFIInstrInserter pass: * analyzes each basic block to determine which CFA offset and register are valid at its entry and exit * verifies that outgoing CFA offset and register of predecessor blocks match incoming values of their successors * inserts additional CFI directives at basic block beginning to correct the rule for calculating CFA. Having CFI instructions in function epilogue can cause incorrect CFA calculation rule for some basic blocks. This can happen if, due to basic block reordering, or the existence of multiple epilogue blocks, some of the blocks have wrong CFA offset and register values set by the epilogue block above them. CFIInstrInserter is currently run only on X86, but can be used by any target that implements support for adding CFI instructions in epilogue. Patch by Violeta Vukobrat. Differential Revision: https://reviews.llvm.org/D42848 llvm-svn: 330706
2018-04-24 18:32:08 +08:00
; SSE3-NEXT: .cfi_def_cfa_offset 8
; SSE3-NEXT: retq
;
; SSSE3-LABEL: avx2_hadd_w:
; SSSE3: # %bb.0:
; SSSE3-NEXT: phaddw %xmm2, %xmm0
; SSSE3-NEXT: phaddw %xmm3, %xmm1
; SSSE3-NEXT: retq
;
; AVX1-LABEL: avx2_hadd_w:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vphaddw %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vphaddw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: avx2_hadd_w:
; AVX2: # %bb.0:
; AVX2-NEXT: vphaddw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
%vecext = extractelement <16 x i16> %a, i32 0
%vecext1 = extractelement <16 x i16> %a, i32 1
%add = add i16 %vecext, %vecext1
%vecinit = insertelement <16 x i16> undef, i16 %add, i32 0
%vecext4 = extractelement <16 x i16> %a, i32 2
%vecext6 = extractelement <16 x i16> %a, i32 3
%add8 = add i16 %vecext4, %vecext6
%vecinit10 = insertelement <16 x i16> %vecinit, i16 %add8, i32 1
%vecext11 = extractelement <16 x i16> %a, i32 4
%vecext13 = extractelement <16 x i16> %a, i32 5
%add15 = add i16 %vecext11, %vecext13
%vecinit17 = insertelement <16 x i16> %vecinit10, i16 %add15, i32 2
%vecext18 = extractelement <16 x i16> %a, i32 6
%vecext20 = extractelement <16 x i16> %a, i32 7
%add22 = add i16 %vecext18, %vecext20
%vecinit24 = insertelement <16 x i16> %vecinit17, i16 %add22, i32 3
%vecext25 = extractelement <16 x i16> %a, i32 8
%vecext27 = extractelement <16 x i16> %a, i32 9
%add29 = add i16 %vecext25, %vecext27
%vecinit31 = insertelement <16 x i16> %vecinit24, i16 %add29, i32 8
%vecext32 = extractelement <16 x i16> %a, i32 10
%vecext34 = extractelement <16 x i16> %a, i32 11
%add36 = add i16 %vecext32, %vecext34
%vecinit38 = insertelement <16 x i16> %vecinit31, i16 %add36, i32 9
%vecext39 = extractelement <16 x i16> %a, i32 12
%vecext41 = extractelement <16 x i16> %a, i32 13
%add43 = add i16 %vecext39, %vecext41
%vecinit45 = insertelement <16 x i16> %vecinit38, i16 %add43, i32 10
%vecext46 = extractelement <16 x i16> %a, i32 14
%vecext48 = extractelement <16 x i16> %a, i32 15
%add50 = add i16 %vecext46, %vecext48
%vecinit52 = insertelement <16 x i16> %vecinit45, i16 %add50, i32 11
%vecext53 = extractelement <16 x i16> %b, i32 0
%vecext55 = extractelement <16 x i16> %b, i32 1
%add57 = add i16 %vecext53, %vecext55
%vecinit59 = insertelement <16 x i16> %vecinit52, i16 %add57, i32 4
%vecext60 = extractelement <16 x i16> %b, i32 2
%vecext62 = extractelement <16 x i16> %b, i32 3
%add64 = add i16 %vecext60, %vecext62
%vecinit66 = insertelement <16 x i16> %vecinit59, i16 %add64, i32 5
%vecext67 = extractelement <16 x i16> %b, i32 4
%vecext69 = extractelement <16 x i16> %b, i32 5
%add71 = add i16 %vecext67, %vecext69
%vecinit73 = insertelement <16 x i16> %vecinit66, i16 %add71, i32 6
%vecext74 = extractelement <16 x i16> %b, i32 6
%vecext76 = extractelement <16 x i16> %b, i32 7
%add78 = add i16 %vecext74, %vecext76
%vecinit80 = insertelement <16 x i16> %vecinit73, i16 %add78, i32 7
%vecext81 = extractelement <16 x i16> %b, i32 8
%vecext83 = extractelement <16 x i16> %b, i32 9
%add85 = add i16 %vecext81, %vecext83
%vecinit87 = insertelement <16 x i16> %vecinit80, i16 %add85, i32 12
%vecext88 = extractelement <16 x i16> %b, i32 10
%vecext90 = extractelement <16 x i16> %b, i32 11
%add92 = add i16 %vecext88, %vecext90
%vecinit94 = insertelement <16 x i16> %vecinit87, i16 %add92, i32 13
%vecext95 = extractelement <16 x i16> %b, i32 12
%vecext97 = extractelement <16 x i16> %b, i32 13
%add99 = add i16 %vecext95, %vecext97
%vecinit101 = insertelement <16 x i16> %vecinit94, i16 %add99, i32 14
%vecext102 = extractelement <16 x i16> %b, i32 14
%vecext104 = extractelement <16 x i16> %b, i32 15
%add106 = add i16 %vecext102, %vecext104
%vecinit108 = insertelement <16 x i16> %vecinit101, i16 %add106, i32 15
ret <16 x i16> %vecinit108
}