[X86][SLM] Add SLM arithmetic vectorization tests

As discussed on D33983, as SLM has so many custom costs its worth testing as well.

llvm-svn: 305151
This commit is contained in:
Simon Pilgrim 2017-06-10 19:16:09 +00:00
parent b644814c28
commit 448c2290f1
4 changed files with 333 additions and 37 deletions

View File

@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
@ -38,6 +39,25 @@ define void @add_v8i64() {
; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SSE-NEXT: ret void
;
; SLM-LABEL: @add_v8i64(
; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = add <2 x i64> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = add <2 x i64> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: ret void
;
; AVX-LABEL: @add_v8i64(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
@ -111,6 +131,25 @@ define void @add_v16i32() {
; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; SLM-LABEL: @add_v16i32(
; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP9:%.*]] = add <4 x i32> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = add <4 x i32> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = add <4 x i32> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = add <4 x i32> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: ret void
;
; AVX-LABEL: @add_v16i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
@ -216,6 +255,25 @@ define void @add_v32i16() {
; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; SLM-LABEL: @add_v32i16(
; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP9:%.*]] = add <8 x i16> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = add <8 x i16> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = add <8 x i16> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = add <8 x i16> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: ret void
;
; AVX-LABEL: @add_v32i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2

View File

@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=skx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
@ -69,13 +70,32 @@ define <2 x double> @buildvector_mul_2f64(<2 x double> %a, <2 x double> %b) {
}
define <2 x double> @buildvector_div_2f64(<2 x double> %a, <2 x double> %b) {
; CHECK-LABEL: @buildvector_div_2f64(
; CHECK-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; CHECK-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; CHECK-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
; CHECK-NEXT: ret <2 x double> [[R1]]
; SSE-LABEL: @buildvector_div_2f64(
; SSE-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]]
; SSE-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; SSE-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
; SSE-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; SSE-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
; SSE-NEXT: ret <2 x double> [[R1]]
;
; SLM-LABEL: @buildvector_div_2f64(
; SLM-NEXT: [[A0:%.*]] = extractelement <2 x double> [[A:%.*]], i32 0
; SLM-NEXT: [[A1:%.*]] = extractelement <2 x double> [[A]], i32 1
; SLM-NEXT: [[B0:%.*]] = extractelement <2 x double> [[B:%.*]], i32 0
; SLM-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B]], i32 1
; SLM-NEXT: [[C0:%.*]] = fdiv double [[A0]], [[B0]]
; SLM-NEXT: [[C1:%.*]] = fdiv double [[A1]], [[B1]]
; SLM-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[C0]], i32 0
; SLM-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[C1]], i32 1
; SLM-NEXT: ret <2 x double> [[R1]]
;
; AVX-LABEL: @buildvector_div_2f64(
; AVX-NEXT: [[TMP1:%.*]] = fdiv <2 x double> [[A:%.*]], [[B:%.*]]
; AVX-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; AVX-NEXT: [[R0:%.*]] = insertelement <2 x double> undef, double [[TMP2]], i32 0
; AVX-NEXT: [[TMP3:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; AVX-NEXT: [[R1:%.*]] = insertelement <2 x double> [[R0]], double [[TMP3]], i32 1
; AVX-NEXT: ret <2 x double> [[R1]]
;
%a0 = extractelement <2 x double> %a, i32 0
%a1 = extractelement <2 x double> %a, i32 1
@ -317,17 +337,48 @@ define <4 x double> @buildvector_mul_4f64(<4 x double> %a, <4 x double> %b) {
}
define <4 x double> @buildvector_div_4f64(<4 x double> %a, <4 x double> %b) {
; CHECK-LABEL: @buildvector_div_4f64(
; CHECK-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
; CHECK-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
; CHECK-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
; CHECK-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
; CHECK-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
; CHECK-NEXT: ret <4 x double> [[R3]]
; SSE-LABEL: @buildvector_div_4f64(
; SSE-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]]
; SSE-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
; SSE-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
; SSE-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
; SSE-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
; SSE-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
; SSE-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
; SSE-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
; SSE-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
; SSE-NEXT: ret <4 x double> [[R3]]
;
; SLM-LABEL: @buildvector_div_4f64(
; SLM-NEXT: [[A0:%.*]] = extractelement <4 x double> [[A:%.*]], i32 0
; SLM-NEXT: [[A1:%.*]] = extractelement <4 x double> [[A]], i32 1
; SLM-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A]], i32 2
; SLM-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i32 3
; SLM-NEXT: [[B0:%.*]] = extractelement <4 x double> [[B:%.*]], i32 0
; SLM-NEXT: [[B1:%.*]] = extractelement <4 x double> [[B]], i32 1
; SLM-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B]], i32 2
; SLM-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i32 3
; SLM-NEXT: [[C0:%.*]] = fdiv double [[A0]], [[B0]]
; SLM-NEXT: [[C1:%.*]] = fdiv double [[A1]], [[B1]]
; SLM-NEXT: [[C2:%.*]] = fdiv double [[A2]], [[B2]]
; SLM-NEXT: [[C3:%.*]] = fdiv double [[A3]], [[B3]]
; SLM-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[C0]], i32 0
; SLM-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[C1]], i32 1
; SLM-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[C2]], i32 2
; SLM-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[C3]], i32 3
; SLM-NEXT: ret <4 x double> [[R3]]
;
; AVX-LABEL: @buildvector_div_4f64(
; AVX-NEXT: [[TMP1:%.*]] = fdiv <4 x double> [[A:%.*]], [[B:%.*]]
; AVX-NEXT: [[TMP2:%.*]] = extractelement <4 x double> [[TMP1]], i32 0
; AVX-NEXT: [[R0:%.*]] = insertelement <4 x double> undef, double [[TMP2]], i32 0
; AVX-NEXT: [[TMP3:%.*]] = extractelement <4 x double> [[TMP1]], i32 1
; AVX-NEXT: [[R1:%.*]] = insertelement <4 x double> [[R0]], double [[TMP3]], i32 1
; AVX-NEXT: [[TMP4:%.*]] = extractelement <4 x double> [[TMP1]], i32 2
; AVX-NEXT: [[R2:%.*]] = insertelement <4 x double> [[R1]], double [[TMP4]], i32 2
; AVX-NEXT: [[TMP5:%.*]] = extractelement <4 x double> [[TMP1]], i32 3
; AVX-NEXT: [[R3:%.*]] = insertelement <4 x double> [[R2]], double [[TMP5]], i32 3
; AVX-NEXT: ret <4 x double> [[R3]]
;
%a0 = extractelement <4 x double> %a, i32 0
%a1 = extractelement <4 x double> %a, i32 1
@ -745,25 +796,80 @@ define <8 x double> @buildvector_mul_8f64(<8 x double> %a, <8 x double> %b) {
}
define <8 x double> @buildvector_div_8f64(<8 x double> %a, <8 x double> %b) {
; CHECK-LABEL: @buildvector_div_8f64(
; CHECK-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
; CHECK-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
; CHECK-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
; CHECK-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
; CHECK-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
; CHECK-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
; CHECK-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
; CHECK-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
; CHECK-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
; CHECK-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
; CHECK-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
; CHECK-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
; CHECK-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
; CHECK-NEXT: ret <8 x double> [[R7]]
; SSE-LABEL: @buildvector_div_8f64(
; SSE-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
; SSE-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
; SSE-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
; SSE-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
; SSE-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
; SSE-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
; SSE-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
; SSE-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
; SSE-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
; SSE-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
; SSE-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
; SSE-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
; SSE-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
; SSE-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
; SSE-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
; SSE-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
; SSE-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
; SSE-NEXT: ret <8 x double> [[R7]]
;
; SLM-LABEL: @buildvector_div_8f64(
; SLM-NEXT: [[A0:%.*]] = extractelement <8 x double> [[A:%.*]], i32 0
; SLM-NEXT: [[A1:%.*]] = extractelement <8 x double> [[A]], i32 1
; SLM-NEXT: [[A2:%.*]] = extractelement <8 x double> [[A]], i32 2
; SLM-NEXT: [[A3:%.*]] = extractelement <8 x double> [[A]], i32 3
; SLM-NEXT: [[A4:%.*]] = extractelement <8 x double> [[A]], i32 4
; SLM-NEXT: [[A5:%.*]] = extractelement <8 x double> [[A]], i32 5
; SLM-NEXT: [[A6:%.*]] = extractelement <8 x double> [[A]], i32 6
; SLM-NEXT: [[A7:%.*]] = extractelement <8 x double> [[A]], i32 7
; SLM-NEXT: [[B0:%.*]] = extractelement <8 x double> [[B:%.*]], i32 0
; SLM-NEXT: [[B1:%.*]] = extractelement <8 x double> [[B]], i32 1
; SLM-NEXT: [[B2:%.*]] = extractelement <8 x double> [[B]], i32 2
; SLM-NEXT: [[B3:%.*]] = extractelement <8 x double> [[B]], i32 3
; SLM-NEXT: [[B4:%.*]] = extractelement <8 x double> [[B]], i32 4
; SLM-NEXT: [[B5:%.*]] = extractelement <8 x double> [[B]], i32 5
; SLM-NEXT: [[B6:%.*]] = extractelement <8 x double> [[B]], i32 6
; SLM-NEXT: [[B7:%.*]] = extractelement <8 x double> [[B]], i32 7
; SLM-NEXT: [[C0:%.*]] = fdiv double [[A0]], [[B0]]
; SLM-NEXT: [[C1:%.*]] = fdiv double [[A1]], [[B1]]
; SLM-NEXT: [[C2:%.*]] = fdiv double [[A2]], [[B2]]
; SLM-NEXT: [[C3:%.*]] = fdiv double [[A3]], [[B3]]
; SLM-NEXT: [[C4:%.*]] = fdiv double [[A4]], [[B4]]
; SLM-NEXT: [[C5:%.*]] = fdiv double [[A5]], [[B5]]
; SLM-NEXT: [[C6:%.*]] = fdiv double [[A6]], [[B6]]
; SLM-NEXT: [[C7:%.*]] = fdiv double [[A7]], [[B7]]
; SLM-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[C0]], i32 0
; SLM-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[C1]], i32 1
; SLM-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[C2]], i32 2
; SLM-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[C3]], i32 3
; SLM-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[C4]], i32 4
; SLM-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[C5]], i32 5
; SLM-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[C6]], i32 6
; SLM-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[C7]], i32 7
; SLM-NEXT: ret <8 x double> [[R7]]
;
; AVX-LABEL: @buildvector_div_8f64(
; AVX-NEXT: [[TMP1:%.*]] = fdiv <8 x double> [[A:%.*]], [[B:%.*]]
; AVX-NEXT: [[TMP2:%.*]] = extractelement <8 x double> [[TMP1]], i32 0
; AVX-NEXT: [[R0:%.*]] = insertelement <8 x double> undef, double [[TMP2]], i32 0
; AVX-NEXT: [[TMP3:%.*]] = extractelement <8 x double> [[TMP1]], i32 1
; AVX-NEXT: [[R1:%.*]] = insertelement <8 x double> [[R0]], double [[TMP3]], i32 1
; AVX-NEXT: [[TMP4:%.*]] = extractelement <8 x double> [[TMP1]], i32 2
; AVX-NEXT: [[R2:%.*]] = insertelement <8 x double> [[R1]], double [[TMP4]], i32 2
; AVX-NEXT: [[TMP5:%.*]] = extractelement <8 x double> [[TMP1]], i32 3
; AVX-NEXT: [[R3:%.*]] = insertelement <8 x double> [[R2]], double [[TMP5]], i32 3
; AVX-NEXT: [[TMP6:%.*]] = extractelement <8 x double> [[TMP1]], i32 4
; AVX-NEXT: [[R4:%.*]] = insertelement <8 x double> [[R3]], double [[TMP6]], i32 4
; AVX-NEXT: [[TMP7:%.*]] = extractelement <8 x double> [[TMP1]], i32 5
; AVX-NEXT: [[R5:%.*]] = insertelement <8 x double> [[R4]], double [[TMP7]], i32 5
; AVX-NEXT: [[TMP8:%.*]] = extractelement <8 x double> [[TMP1]], i32 6
; AVX-NEXT: [[R6:%.*]] = insertelement <8 x double> [[R5]], double [[TMP8]], i32 6
; AVX-NEXT: [[TMP9:%.*]] = extractelement <8 x double> [[TMP1]], i32 7
; AVX-NEXT: [[R7:%.*]] = insertelement <8 x double> [[R6]], double [[TMP9]], i32 7
; AVX-NEXT: ret <8 x double> [[R7]]
;
%a0 = extractelement <8 x double> %a, i32 0
%a1 = extractelement <8 x double> %a, i32 1

View File

@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
@ -54,6 +55,41 @@ define void @mul_v8i64() {
; SSE-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; SSE-NEXT: ret void
;
; SLM-LABEL: @mul_v8i64(
; SLM-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
; SLM-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
; SLM-NEXT: [[A2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2), align 8
; SLM-NEXT: [[A3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 3), align 8
; SLM-NEXT: [[A4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4), align 8
; SLM-NEXT: [[A5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 5), align 8
; SLM-NEXT: [[A6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6), align 8
; SLM-NEXT: [[A7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 7), align 8
; SLM-NEXT: [[B0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 0), align 8
; SLM-NEXT: [[B1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 1), align 8
; SLM-NEXT: [[B2:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2), align 8
; SLM-NEXT: [[B3:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 3), align 8
; SLM-NEXT: [[B4:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4), align 8
; SLM-NEXT: [[B5:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 5), align 8
; SLM-NEXT: [[B6:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6), align 8
; SLM-NEXT: [[B7:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 7), align 8
; SLM-NEXT: [[R0:%.*]] = mul i64 [[A0]], [[B0]]
; SLM-NEXT: [[R1:%.*]] = mul i64 [[A1]], [[B1]]
; SLM-NEXT: [[R2:%.*]] = mul i64 [[A2]], [[B2]]
; SLM-NEXT: [[R3:%.*]] = mul i64 [[A3]], [[B3]]
; SLM-NEXT: [[R4:%.*]] = mul i64 [[A4]], [[B4]]
; SLM-NEXT: [[R5:%.*]] = mul i64 [[A5]], [[B5]]
; SLM-NEXT: [[R6:%.*]] = mul i64 [[A6]], [[B6]]
; SLM-NEXT: [[R7:%.*]] = mul i64 [[A7]], [[B7]]
; SLM-NEXT: store i64 [[R0]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 0), align 8
; SLM-NEXT: store i64 [[R1]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 1), align 8
; SLM-NEXT: store i64 [[R2]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2), align 8
; SLM-NEXT: store i64 [[R3]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 3), align 8
; SLM-NEXT: store i64 [[R4]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4), align 8
; SLM-NEXT: store i64 [[R5]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 5), align 8
; SLM-NEXT: store i64 [[R6]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6), align 8
; SLM-NEXT: store i64 [[R7]], i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 7), align 8
; SLM-NEXT: ret void
;
; AVX1-LABEL: @mul_v8i64(
; AVX1-NEXT: [[A0:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 0), align 8
; AVX1-NEXT: [[A1:%.*]] = load i64, i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 1), align 8
@ -162,6 +198,25 @@ define void @mul_v16i32() {
; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; SLM-LABEL: @mul_v16i32(
; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP9:%.*]] = mul <4 x i32> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = mul <4 x i32> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = mul <4 x i32> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = mul <4 x i32> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: ret void
;
; AVX-LABEL: @mul_v16i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
@ -267,6 +322,25 @@ define void @mul_v32i16() {
; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; SLM-LABEL: @mul_v32i16(
; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP9:%.*]] = mul <8 x i16> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = mul <8 x i16> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = mul <8 x i16> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: ret void
;
; AVX-LABEL: @mul_v32i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2

View File

@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -mtriple=x86_64-unknown -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=slm -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=SLM
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=corei7-avx -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=core-avx2 -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX2
; RUN: opt < %s -mtriple=x86_64-unknown -mcpu=knl -basicaa -slp-vectorizer -S | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
@ -38,6 +39,25 @@ define void @sub_v8i64() {
; SSE-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SSE-NEXT: ret void
;
; SLM-LABEL: @sub_v8i64(
; SLM-NEXT: [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @a64 to <2 x i64>*), align 8
; SLM-NEXT: [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP3:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP4:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP5:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([8 x i64]* @b64 to <2 x i64>*), align 8
; SLM-NEXT: [[TMP6:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP7:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP8:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @b64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: [[TMP9:%.*]] = sub <2 x i64> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = sub <2 x i64> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = sub <2 x i64> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = sub <2 x i64> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <2 x i64> [[TMP9]], <2 x i64>* bitcast ([8 x i64]* @c64 to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP10]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 2) to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP11]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 4) to <2 x i64>*), align 8
; SLM-NEXT: store <2 x i64> [[TMP12]], <2 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @c64, i32 0, i64 6) to <2 x i64>*), align 8
; SLM-NEXT: ret void
;
; AVX-LABEL: @sub_v8i64(
; AVX-NEXT: [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([8 x i64]* @a64 to <4 x i64>*), align 8
; AVX-NEXT: [[TMP2:%.*]] = load <4 x i64>, <4 x i64>* bitcast (i64* getelementptr inbounds ([8 x i64], [8 x i64]* @a64, i32 0, i64 4) to <4 x i64>*), align 8
@ -111,6 +131,25 @@ define void @sub_v16i32() {
; SSE-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SSE-NEXT: ret void
;
; SLM-LABEL: @sub_v16i32(
; SLM-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @a32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP4:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([16 x i32]* @b32 to <4 x i32>*), align 4
; SLM-NEXT: [[TMP6:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP7:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP8:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @b32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = sub <4 x i32> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = sub <4 x i32> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = sub <4 x i32> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <4 x i32> [[TMP9]], <4 x i32>* bitcast ([16 x i32]* @c32 to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP10]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 4) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP11]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 8) to <4 x i32>*), align 4
; SLM-NEXT: store <4 x i32> [[TMP12]], <4 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @c32, i32 0, i64 12) to <4 x i32>*), align 4
; SLM-NEXT: ret void
;
; AVX-LABEL: @sub_v16i32(
; AVX-NEXT: [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([16 x i32]* @a32 to <8 x i32>*), align 4
; AVX-NEXT: [[TMP2:%.*]] = load <8 x i32>, <8 x i32>* bitcast (i32* getelementptr inbounds ([16 x i32], [16 x i32]* @a32, i32 0, i64 8) to <8 x i32>*), align 4
@ -216,6 +255,25 @@ define void @sub_v32i16() {
; SSE-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SSE-NEXT: ret void
;
; SLM-LABEL: @sub_v32i16(
; SLM-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @a16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP3:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP4:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP5:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([32 x i16]* @b16 to <8 x i16>*), align 2
; SLM-NEXT: [[TMP6:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP7:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP8:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @b16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: [[TMP9:%.*]] = sub <8 x i16> [[TMP1]], [[TMP5]]
; SLM-NEXT: [[TMP10:%.*]] = sub <8 x i16> [[TMP2]], [[TMP6]]
; SLM-NEXT: [[TMP11:%.*]] = sub <8 x i16> [[TMP3]], [[TMP7]]
; SLM-NEXT: [[TMP12:%.*]] = sub <8 x i16> [[TMP4]], [[TMP8]]
; SLM-NEXT: store <8 x i16> [[TMP9]], <8 x i16>* bitcast ([32 x i16]* @c16 to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP10]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 8) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP11]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 16) to <8 x i16>*), align 2
; SLM-NEXT: store <8 x i16> [[TMP12]], <8 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @c16, i32 0, i64 24) to <8 x i16>*), align 2
; SLM-NEXT: ret void
;
; AVX-LABEL: @sub_v32i16(
; AVX-NEXT: [[TMP1:%.*]] = load <16 x i16>, <16 x i16>* bitcast ([32 x i16]* @a16 to <16 x i16>*), align 2
; AVX-NEXT: [[TMP2:%.*]] = load <16 x i16>, <16 x i16>* bitcast (i16* getelementptr inbounds ([32 x i16], [32 x i16]* @a16, i32 0, i64 16) to <16 x i16>*), align 2