; llvm-project/llvm/test/CodeGen/X86/sse3-avx-addsub.ll

; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s -check-prefix=SSE -check-prefix=CHECK
; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s -check-prefix=AVX -check-prefix=CHECK

; Test ADDSUB ISel patterns.

; Functions below are obtained from the following source:
;
; typedef double double2 __attribute__((ext_vector_type(2)));
; typedef double double4 __attribute__((ext_vector_type(4)));
; typedef float float4 __attribute__((ext_vector_type(4)));
; typedef float float8 __attribute__((ext_vector_type(8)));
;
; float4 test1(float4 A, float4 B) {
;   float4 X = A - B;
;   float4 Y = A + B;
;   return (float4){X[0], Y[1], X[2], Y[3]};
; }
;
; float8 test2(float8 A, float8 B) {
;   float8 X = A - B;
;   float8 Y = A + B;
;   return (float8){X[0], Y[1], X[2], Y[3], X[4], Y[5], X[6], Y[7]};
; }
;
; double4 test3(double4 A, double4 B) {
;   double4 X = A - B;
;   double4 Y = A + B;
;   return (double4){X[0], Y[1], X[2], Y[3]};
; }
;
; double2 test4(double2 A, double2 B) {
;   double2 X = A - B;
;   double2 Y = A + B;
;   return (double2){X[0], Y[1]};
; }

; test1 (register operands, <4 x float>): computes A - B and A + B, then
; interleaves them so even lanes come from the difference and odd lanes from
; the sum. The fsub is emitted before the fadd. The check lines that follow
; expect this to become a single addsubps / vaddsubps.
define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {
  %diff = fsub <4 x float> %A, %B
  %sum = fadd <4 x float> %A, %B
  ; Mask indices 0-3 select lanes of %diff, indices 4-7 select lanes of %sum.
  %blend = shufflevector <4 x float> %diff, <4 x float> %sum, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %blend
}
; CHECK-LABEL: test1
; SSE: addsubps
; AVX: vaddsubps
; CHECK-NEXT: ret


; test2 (register operands, <8 x float>): same sub/add interleave as test1 but
; on a 256-bit float vector. The fsub is emitted before the fadd. The check
; lines that follow expect two addsubps on the core2 run and exactly one
; vaddsubps on the corei7-avx run.
define <8 x float> @test2(<8 x float> %A, <8 x float> %B) {
  %diff = fsub <8 x float> %A, %B
  %sum = fadd <8 x float> %A, %B
  ; Mask indices 0-7 select lanes of %diff, indices 8-15 select lanes of %sum.
  %blend = shufflevector <8 x float> %diff, <8 x float> %sum, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %blend
}
; CHECK-LABEL: test2
; SSE: addsubps
; SSE-NEXT: addsubps
; AVX: vaddsubps
; AVX-NOT: vaddsubps
; CHECK: ret


; test3 (register operands, <4 x double>): even lanes take A - B, odd lanes
; take A + B. The fsub is emitted before the fadd. The check lines that follow
; expect two addsubpd on the core2 run and exactly one vaddsubpd on the
; corei7-avx run.
define <4 x double> @test3(<4 x double> %A, <4 x double> %B) {
  %diff = fsub <4 x double> %A, %B
  %sum = fadd <4 x double> %A, %B
  ; Mask indices 0-3 select lanes of %diff, indices 4-7 select lanes of %sum.
  %blend = shufflevector <4 x double> %diff, <4 x double> %sum, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %blend
}
; CHECK-LABEL: test3
; SSE: addsubpd
; SSE: addsubpd
; AVX: vaddsubpd
; AVX-NOT: vaddsubpd
; CHECK: ret


; test4 (register operands, <2 x double>): lane 0 takes A - B, lane 1 takes
; A + B. Unlike test1-test3, the fadd is emitted before the fsub here; that
; instruction order is kept as-is. The check lines that follow expect a single
; addsubpd / vaddsubpd.
;
; The function carries no attribute-group reference: no `attributes #0` group
; is defined in the visible file, and none of the other test functions uses
; one, so the former dangling `#0` was dropped.
define <2 x double> @test4(<2 x double> %A, <2 x double> %B) {
  %sum = fadd <2 x double> %A, %B
  %diff = fsub <2 x double> %A, %B
  ; Mask indices 0-1 select lanes of %diff, indices 2-3 select lanes of %sum.
  %blend = shufflevector <2 x double> %diff, <2 x double> %sum, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %blend
}
; CHECK-LABEL: test4
; SSE: addsubpd
; AVX: vaddsubpd
; CHECK-NEXT: ret


; test1b (<4 x float>, second operand read through a pointer): like test1, but
; the right-hand operand is loaded from %B and the fadd is emitted before the
; fsub. The check lines that follow expect a single addsubps / vaddsubps.
define <4 x float> @test1b(<4 x float> %A, <4 x float>* %B) {
  %rhs = load <4 x float>, <4 x float>* %B
  %sum = fadd <4 x float> %A, %rhs
  %diff = fsub <4 x float> %A, %rhs
  ; Mask indices 0-3 select lanes of %diff, indices 4-7 select lanes of %sum.
  %blend = shufflevector <4 x float> %diff, <4 x float> %sum, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x float> %blend
}
; CHECK-LABEL: test1b
; SSE: addsubps
; AVX: vaddsubps
; CHECK-NEXT: ret


; test2b (<8 x float>, second operand read through a pointer): like test2, but
; the right-hand operand is loaded from %B and the fadd is emitted before the
; fsub. The check lines that follow expect two addsubps on the core2 run and
; exactly one vaddsubps on the corei7-avx run.
define <8 x float> @test2b(<8 x float> %A, <8 x float>* %B) {
  %rhs = load <8 x float>, <8 x float>* %B
  %sum = fadd <8 x float> %A, %rhs
  %diff = fsub <8 x float> %A, %rhs
  ; Mask indices 0-7 select lanes of %diff, indices 8-15 select lanes of %sum.
  %blend = shufflevector <8 x float> %diff, <8 x float> %sum, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
  ret <8 x float> %blend
}
; CHECK-LABEL: test2b
; SSE: addsubps
; SSE-NEXT: addsubps
; AVX: vaddsubps
; AVX-NOT: vaddsubps
; CHECK: ret


; test3b (<4 x double>, second operand read through a pointer): like test3,
; but the right-hand operand is loaded from %B and the fadd is emitted before
; the fsub. The check lines that follow expect two addsubpd on the core2 run
; and exactly one vaddsubpd on the corei7-avx run.
define <4 x double> @test3b(<4 x double> %A, <4 x double>* %B) {
  %rhs = load <4 x double>, <4 x double>* %B
  %sum = fadd <4 x double> %A, %rhs
  %diff = fsub <4 x double> %A, %rhs
  ; Mask indices 0-3 select lanes of %diff, indices 4-7 select lanes of %sum.
  %blend = shufflevector <4 x double> %diff, <4 x double> %sum, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ret <4 x double> %blend
}
; CHECK-LABEL: test3b
; SSE: addsubpd
; SSE: addsubpd
; AVX: vaddsubpd
; AVX-NOT: vaddsubpd
; CHECK: ret


; test4b (<2 x double>, second operand read through a pointer): like test4,
; but the right-hand operand is loaded from %B and the fsub is emitted before
; the fadd. The check lines that follow expect a single addsubpd / vaddsubpd.
define <2 x double> @test4b(<2 x double> %A, <2 x double>* %B) {
  %rhs = load <2 x double>, <2 x double>* %B
  %diff = fsub <2 x double> %A, %rhs
  %sum = fadd <2 x double> %A, %rhs
  ; Mask indices 0-1 select lanes of %diff, indices 2-3 select lanes of %sum.
  %blend = shufflevector <2 x double> %diff, <2 x double> %sum, <2 x i32> <i32 0, i32 3>
  ret <2 x double> %blend
}
; CHECK-LABEL: test4b
; SSE: addsubpd
; AVX: vaddsubpd
; CHECK-NEXT: ret
[x86] Add the beginnings of a proper DAG combine to match ADDSUBPS and ADDSUBPD nodes out of blends of adds and subs. This allows us to actually form these instructions with SSE3 rather than only forming them when we had both SSE3 for the ADDSUB instructions and SSE4.1 for the blend instructions. ;] Kind-of important. I've adjusted the CPU requirements on one of the tests to demonstrate this kicking in nicely for an SSE3 cpu configuration. llvm-svn: 217848 2014-09-16 08:15:20 +08:00			`; RUN: llc < %s -march=x86-64 -mcpu=core2 \| FileCheck %s -check-prefix=SSE -check-prefix=CHECK`
[X86] Add ISel patterns to select SSE3/AVX ADDSUB instructions. This patch adds ISel patterns to select SSE3/AVX ADDSUB instructions from a sequence of "vadd + vsub + blend". Example: /// typedef float float4 __attribute__((ext_vector_type(4))); float4 foo(float4 A, float4 B) { float4 X = A - B; float4 Y = A + B; return (float4){X[0], Y[1], X[2], Y[3]}; } /// Before this patch, (with flag -mcpu=corei7) llc produced the following assembly sequence: movaps %xmm0, %xmm2 addps %xmm1, %xmm2 subps %xmm1, %xmm0 blendps $10, %xmm2, %xmm0 With this patch, we now get a single addsubps %xmm1, %xmm0 llvm-svn: 211427 2014-06-21 09:31:15 +08:00			`; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx \| FileCheck %s -check-prefix=AVX -check-prefix=CHECK`

			`; Test ADDSUB ISel patterns.`

[X86] Improve the selection of SSE3/AVX addsub instructions. This patch teaches the backend how to canonicalize a shuffle vectors according to the rule: - (shuffle (FADD A, B), (FSUB A, B), Mask) -> (shuffle (FSUB A, -B), (FADD A, -B), Mask) Where 'Mask' is: <0,5,2,7> ;; for v4f32 and v4f64 shuffles. <0,3> ;; for v2f64 shuffles. <0,9,2,11,4,13,6,15> ;; for v8f32 shuffles. In general, ISel only knows how to pattern-match a canonical 'fadd + fsub + blendi' dag node sequence into an ADDSUB instruction. This new rule allows to convert a non-canonical dag sequence into a canonical one that will be matched by a single ADDSUB at ISel stage. The idea of converting a non-canonical ADDSUB into a canonical one by swapping the first two operands of the shuffle, and then negating the second operand of the FADD and FSUB, was originally proposed by Hal Finkel. llvm-svn: 211771 2014-06-26 18:45:21 +08:00			`; Functions below are obtained from the following source:`
[X86] Add ISel patterns to select SSE3/AVX ADDSUB instructions. This patch adds ISel patterns to select SSE3/AVX ADDSUB instructions from a sequence of "vadd + vsub + blend". Example: /// typedef float float4 __attribute__((ext_vector_type(4))); float4 foo(float4 A, float4 B) { float4 X = A - B; float4 Y = A + B; return (float4){X[0], Y[1], X[2], Y[3]}; } /// Before this patch, (with flag -mcpu=corei7) llc produced the following assembly sequence: movaps %xmm0, %xmm2 addps %xmm1, %xmm2 subps %xmm1, %xmm0 blendps $10, %xmm2, %xmm0 With this patch, we now get a single addsubps %xmm1, %xmm0 llvm-svn: 211427 2014-06-21 09:31:15 +08:00			`;`
			`; typedef double double2 __attribute__((ext_vector_type(2)));`
			`; typedef double double4 __attribute__((ext_vector_type(4)));`
			`; typedef float float4 __attribute__((ext_vector_type(4)));`
			`; typedef float float8 __attribute__((ext_vector_type(8)));`
			`;`
			`; float4 test1(float4 A, float4 B) {`
			`; float4 X = A - B;`
			`; float4 Y = A + B;`
			`; return (float4){X[0], Y[1], X[2], Y[3]};`
			`; }`
			`;`
			`; float8 test2(float8 A, float8 B) {`
			`; float8 X = A - B;`
			`; float8 Y = A + B;`
[X86] Improve the selection of SSE3/AVX addsub instructions. This patch teaches the backend how to canonicalize a shuffle vectors according to the rule: - (shuffle (FADD A, B), (FSUB A, B), Mask) -> (shuffle (FSUB A, -B), (FADD A, -B), Mask) Where 'Mask' is: <0,5,2,7> ;; for v4f32 and v4f64 shuffles. <0,3> ;; for v2f64 shuffles. <0,9,2,11,4,13,6,15> ;; for v8f32 shuffles. In general, ISel only knows how to pattern-match a canonical 'fadd + fsub + blendi' dag node sequence into an ADDSUB instruction. This new rule allows to convert a non-canonical dag sequence into a canonical one that will be matched by a single ADDSUB at ISel stage. The idea of converting a non-canonical ADDSUB into a canonical one by swapping the first two operands of the shuffle, and then negating the second operand of the FADD and FSUB, was originally proposed by Hal Finkel. llvm-svn: 211771 2014-06-26 18:45:21 +08:00			`; return (float8){X[0], Y[1], X[2], Y[3], X[4], Y[5], X[6], Y[7]};`
[X86] Add ISel patterns to select SSE3/AVX ADDSUB instructions. This patch adds ISel patterns to select SSE3/AVX ADDSUB instructions from a sequence of "vadd + vsub + blend". Example: /// typedef float float4 __attribute__((ext_vector_type(4))); float4 foo(float4 A, float4 B) { float4 X = A - B; float4 Y = A + B; return (float4){X[0], Y[1], X[2], Y[3]}; } /// Before this patch, (with flag -mcpu=corei7) llc produced the following assembly sequence: movaps %xmm0, %xmm2 addps %xmm1, %xmm2 subps %xmm1, %xmm0 blendps $10, %xmm2, %xmm0 With this patch, we now get a single addsubps %xmm1, %xmm0 llvm-svn: 211427 2014-06-21 09:31:15 +08:00			`; }`
			`;`
			`; double4 test3(double4 A, double4 B) {`
			`; double4 X = A - B;`
			`; double4 Y = A + B;`
			`; return (double4){X[0], Y[1], X[2], Y[3]};`
			`; }`
			`;`
			`; double2 test4(double2 A, double2 B) {`
			`; double2 X = A - B;`
			`; double2 Y = A + B;`
			`; return (double2){X[0], Y[1]};`
			`; }`

			`define <4 x float> @test1(<4 x float> %A, <4 x float> %B) {`
			`%sub = fsub <4 x float> %A, %B`
			`%add = fadd <4 x float> %A, %B`
			`%vecinit6 = shufflevector <4 x float> %sub, <4 x float> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>`
			`ret <4 x float> %vecinit6`
			`}`
			`; CHECK-LABEL: test1`
			`; SSE: addsubps`
			`; AVX: vaddsubps`
			`; CHECK-NEXT: ret`


			`define <8 x float> @test2(<8 x float> %A, <8 x float> %B) {`
			`%sub = fsub <8 x float> %A, %B`
			`%add = fadd <8 x float> %A, %B`
			`%vecinit14 = shufflevector <8 x float> %sub, <8 x float> %add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>`
			`ret <8 x float> %vecinit14`
			`}`
			`; CHECK-LABEL: test2`
			`; SSE: addsubps`
			`; SSE-NEXT: addsubps`
			`; AVX: vaddsubps`
			`; AVX-NOT: vaddsubps`
			`; CHECK: ret`


			`define <4 x double> @test3(<4 x double> %A, <4 x double> %B) {`
			`%sub = fsub <4 x double> %A, %B`
			`%add = fadd <4 x double> %A, %B`
			`%vecinit6 = shufflevector <4 x double> %sub, <4 x double> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>`
			`ret <4 x double> %vecinit6`
			`}`
			`; CHECK-LABEL: test3`
			`; SSE: addsubpd`
			`; SSE: addsubpd`
			`; AVX: vaddsubpd`
			`; AVX-NOT: vaddsubpd`
			`; CHECK: ret`


			`define <2 x double> @test4(<2 x double> %A, <2 x double> %B) #0 {`
			`%add = fadd <2 x double> %A, %B`
			`%sub = fsub <2 x double> %A, %B`
			`%vecinit2 = shufflevector <2 x double> %sub, <2 x double> %add, <2 x i32> <i32 0, i32 3>`
			`ret <2 x double> %vecinit2`
			`}`
			`; CHECK-LABEL: test4`
			`; SSE: addsubpd`
			`; AVX: vaddsubpd`
			`; CHECK-NEXT: ret`


			`define <4 x float> @test1b(<4 x float> %A, <4 x float>* %B) {`
[opaque pointer type] Add textual IR support for explicit type parameter to load instruction Essentially the same as the GEP change in r230786. A similar migration script can be used to update test cases, though a few more test case improvements/changes were required this time around: (r229269-r229278) import fileinput import sys import re pat = re.compile(r"((?:=\|:\|^)\sload (?:atomic )?(?:volatile )?(.?))(\| addrspace\(\d+\) )\($\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$)") for line in sys.stdin: sys.stdout.write(re.sub(pat, r"\1, \2\3*\4", line)) Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7649 llvm-svn: 230794 2015-02-28 05:17:42 +08:00			`%1 = load <4 x float>, <4 x float>* %B`
[X86] Add ISel patterns to select SSE3/AVX ADDSUB instructions. This patch adds ISel patterns to select SSE3/AVX ADDSUB instructions from a sequence of "vadd + vsub + blend". Example: /// typedef float float4 __attribute__((ext_vector_type(4))); float4 foo(float4 A, float4 B) { float4 X = A - B; float4 Y = A + B; return (float4){X[0], Y[1], X[2], Y[3]}; } /// Before this patch, (with flag -mcpu=corei7) llc produced the following assembly sequence: movaps %xmm0, %xmm2 addps %xmm1, %xmm2 subps %xmm1, %xmm0 blendps $10, %xmm2, %xmm0 With this patch, we now get a single addsubps %xmm1, %xmm0 llvm-svn: 211427 2014-06-21 09:31:15 +08:00			`%add = fadd <4 x float> %A, %1`
			`%sub = fsub <4 x float> %A, %1`
			`%vecinit6 = shufflevector <4 x float> %sub, <4 x float> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>`
			`ret <4 x float> %vecinit6`
			`}`
			`; CHECK-LABEL: test1b`
			`; SSE: addsubps`
			`; AVX: vaddsubps`
			`; CHECK-NEXT: ret`


			`define <8 x float> @test2b(<8 x float> %A, <8 x float>* %B) {`
[opaque pointer type] Add textual IR support for explicit type parameter to load instruction Essentially the same as the GEP change in r230786. A similar migration script can be used to update test cases, though a few more test case improvements/changes were required this time around: (r229269-r229278) import fileinput import sys import re pat = re.compile(r"((?:=\|:\|^)\sload (?:atomic )?(?:volatile )?(.?))(\| addrspace\(\d+\) )\($\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$)") for line in sys.stdin: sys.stdout.write(re.sub(pat, r"\1, \2\3*\4", line)) Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7649 llvm-svn: 230794 2015-02-28 05:17:42 +08:00			`%1 = load <8 x float>, <8 x float>* %B`
[X86] Add ISel patterns to select SSE3/AVX ADDSUB instructions. This patch adds ISel patterns to select SSE3/AVX ADDSUB instructions from a sequence of "vadd + vsub + blend". Example: /// typedef float float4 __attribute__((ext_vector_type(4))); float4 foo(float4 A, float4 B) { float4 X = A - B; float4 Y = A + B; return (float4){X[0], Y[1], X[2], Y[3]}; } /// Before this patch, (with flag -mcpu=corei7) llc produced the following assembly sequence: movaps %xmm0, %xmm2 addps %xmm1, %xmm2 subps %xmm1, %xmm0 blendps $10, %xmm2, %xmm0 With this patch, we now get a single addsubps %xmm1, %xmm0 llvm-svn: 211427 2014-06-21 09:31:15 +08:00			`%add = fadd <8 x float> %A, %1`
			`%sub = fsub <8 x float> %A, %1`
			`%vecinit14 = shufflevector <8 x float> %sub, <8 x float> %add, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>`
			`ret <8 x float> %vecinit14`
			`}`
			`; CHECK-LABEL: test2b`
			`; SSE: addsubps`
			`; SSE-NEXT: addsubps`
			`; AVX: vaddsubps`
			`; AVX-NOT: vaddsubps`
			`; CHECK: ret`


			`define <4 x double> @test3b(<4 x double> %A, <4 x double>* %B) {`
[opaque pointer type] Add textual IR support for explicit type parameter to load instruction Essentially the same as the GEP change in r230786. A similar migration script can be used to update test cases, though a few more test case improvements/changes were required this time around: (r229269-r229278) import fileinput import sys import re pat = re.compile(r"((?:=\|:\|^)\sload (?:atomic )?(?:volatile )?(.?))(\| addrspace\(\d+\) )\($\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$)") for line in sys.stdin: sys.stdout.write(re.sub(pat, r"\1, \2\3*\4", line)) Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7649 llvm-svn: 230794 2015-02-28 05:17:42 +08:00			`%1 = load <4 x double>, <4 x double>* %B`
[X86] Add ISel patterns to select SSE3/AVX ADDSUB instructions. This patch adds ISel patterns to select SSE3/AVX ADDSUB instructions from a sequence of "vadd + vsub + blend". Example: /// typedef float float4 __attribute__((ext_vector_type(4))); float4 foo(float4 A, float4 B) { float4 X = A - B; float4 Y = A + B; return (float4){X[0], Y[1], X[2], Y[3]}; } /// Before this patch, (with flag -mcpu=corei7) llc produced the following assembly sequence: movaps %xmm0, %xmm2 addps %xmm1, %xmm2 subps %xmm1, %xmm0 blendps $10, %xmm2, %xmm0 With this patch, we now get a single addsubps %xmm1, %xmm0 llvm-svn: 211427 2014-06-21 09:31:15 +08:00			`%add = fadd <4 x double> %A, %1`
			`%sub = fsub <4 x double> %A, %1`
			`%vecinit6 = shufflevector <4 x double> %sub, <4 x double> %add, <4 x i32> <i32 0, i32 5, i32 2, i32 7>`
			`ret <4 x double> %vecinit6`
			`}`
			`; CHECK-LABEL: test3b`
			`; SSE: addsubpd`
			`; SSE: addsubpd`
			`; AVX: vaddsubpd`
			`; AVX-NOT: vaddsubpd`
			`; CHECK: ret`


			`define <2 x double> @test4b(<2 x double> %A, <2 x double>* %B) {`
[opaque pointer type] Add textual IR support for explicit type parameter to load instruction Essentially the same as the GEP change in r230786. A similar migration script can be used to update test cases, though a few more test case improvements/changes were required this time around: (r229269-r229278) import fileinput import sys import re pat = re.compile(r"((?:=\|:\|^)\sload (?:atomic )?(?:volatile )?(.?))(\| addrspace\(\d+\) )\($\| (?:%\|@\|null\|undef\|blockaddress\|getelementptr\|addrspacecast\|bitcast\|inttoptr\|\[\[[a-zA-Z]\|\{\{).$)") for line in sys.stdin: sys.stdout.write(re.sub(pat, r"\1, \2\3*\4", line)) Reviewers: rafael, dexonsmith, grosser Differential Revision: http://reviews.llvm.org/D7649 llvm-svn: 230794 2015-02-28 05:17:42 +08:00			`%1 = load <2 x double>, <2 x double>* %B`
[X86] Add ISel patterns to select SSE3/AVX ADDSUB instructions. This patch adds ISel patterns to select SSE3/AVX ADDSUB instructions from a sequence of "vadd + vsub + blend". Example: /// typedef float float4 __attribute__((ext_vector_type(4))); float4 foo(float4 A, float4 B) { float4 X = A - B; float4 Y = A + B; return (float4){X[0], Y[1], X[2], Y[3]}; } /// Before this patch, (with flag -mcpu=corei7) llc produced the following assembly sequence: movaps %xmm0, %xmm2 addps %xmm1, %xmm2 subps %xmm1, %xmm0 blendps $10, %xmm2, %xmm0 With this patch, we now get a single addsubps %xmm1, %xmm0 llvm-svn: 211427 2014-06-21 09:31:15 +08:00			`%sub = fsub <2 x double> %A, %1`
			`%add = fadd <2 x double> %A, %1`
			`%vecinit2 = shufflevector <2 x double> %sub, <2 x double> %add, <2 x i32> <i32 0, i32 3>`
			`ret <2 x double> %vecinit2`
			`}`
			`; CHECK-LABEL: test4b`
			`; SSE: addsubpd`
			`; AVX: vaddsubpd`
			`; CHECK-NEXT: ret`