[X86][AVX] Tidyup prefixes and regenerate interleaved tests

Share common AVX prefix and split off AVX2OR512 prefix instead

llvm-svn: 346399
This commit is contained in:
Simon Pilgrim 2018-11-08 12:14:10 +00:00
parent 0d79aaf1a7
commit 1ef4af5278
1 changed files with 103 additions and 184 deletions

View File

@ -1,26 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=AVX --check-prefix=AVX512
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX2
; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX512
define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
; AVX1-LABEL: load_factorf64_4:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovupd (%rdi), %ymm0
; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
; AVX1-NEXT: vmovupd 96(%rdi), %ymm3
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX1-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vaddpd %ymm2, %ymm4, %ymm2
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX1-NEXT: vaddpd %ymm0, %ymm2, %ymm0
; AVX1-NEXT: retq
;
; AVX-LABEL: load_factorf64_4:
; AVX: # %bb.0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
@ -49,21 +32,6 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
}
define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
; AVX1-LABEL: load_factorf64_2:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovupd (%rdi), %ymm0
; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
; AVX1-NEXT: vmovupd 64(%rdi), %ymm2
; AVX1-NEXT: vmovupd 96(%rdi), %ymm3
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX1-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX1-NEXT: vmulpd %ymm0, %ymm4, %ymm0
; AVX1-NEXT: retq
;
; AVX-LABEL: load_factorf64_2:
; AVX: # %bb.0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
@ -86,16 +54,6 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
}
define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
; AVX1-LABEL: load_factorf64_1:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovupd (%rdi), %ymm0
; AVX1-NEXT: vmovupd 32(%rdi), %ymm1
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[0,1],mem[0,1]
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[0,1],mem[0,1]
; AVX1-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX1-NEXT: vmulpd %ymm0, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX-LABEL: load_factorf64_1:
; AVX: # %bb.0:
; AVX-NEXT: vmovupd (%rdi), %ymm0
@ -140,24 +98,24 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX-LABEL: load_factori64_4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqu (%rdi), %ymm0
; AVX-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX-NEXT: vmovdqu 64(%rdi), %ymm2
; AVX-NEXT: vmovdqu 96(%rdi), %ymm3
; AVX-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
; AVX-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX-NEXT: vpaddq %ymm3, %ymm4, %ymm3
; AVX-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX-NEXT: retq
; AVX2OR512-LABEL: load_factori64_4:
; AVX2OR512: # %bb.0:
; AVX2OR512-NEXT: vmovdqu (%rdi), %ymm0
; AVX2OR512-NEXT: vmovdqu 32(%rdi), %ymm1
; AVX2OR512-NEXT: vmovdqu 64(%rdi), %ymm2
; AVX2OR512-NEXT: vmovdqu 96(%rdi), %ymm3
; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm4 = ymm0[0,1],ymm2[0,1]
; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm1[0,1],ymm3[0,1]
; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
; AVX2OR512-NEXT: vperm2i128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
; AVX2OR512-NEXT: vpaddq %ymm3, %ymm4, %ymm3
; AVX2OR512-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
; AVX2OR512-NEXT: vpaddq %ymm0, %ymm3, %ymm0
; AVX2OR512-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2OR512-NEXT: retq
%wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
%strided.v0 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
%strided.v1 = shufflevector <16 x i64> %wide.vec, <16 x i64> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
@ -459,33 +417,33 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(<32 x i8>* %ptr) {
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX-LABEL: interleaved_load_vf8_i8_stride4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqu (%rdi), %ymm0
; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
; AVX-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
; AVX-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX-NEXT: vpaddw %xmm1, %xmm4, %xmm1
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX2OR512-LABEL: interleaved_load_vf8_i8_stride4:
; AVX2OR512: # %bb.0:
; AVX2OR512-NEXT: vmovdqu (%rdi), %ymm0
; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
; AVX2OR512-NEXT: vextracti128 $1, %ymm0, %xmm2
; AVX2OR512-NEXT: vpshufb %xmm1, %xmm2, %xmm3
; AVX2OR512-NEXT: vpshufb %xmm1, %xmm0, %xmm1
; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm1[0],xmm3[0]
; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm5 = [1,1,3,3,5,5,7,7,7,7,3,3,6,6,7,7]
; AVX2OR512-NEXT: vpshufb %xmm5, %xmm3, %xmm3
; AVX2OR512-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
; AVX2OR512-NEXT: vpaddw %xmm1, %xmm4, %xmm1
; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm3 = [6,7,2,3,14,15,10,11,14,15,10,11,12,13,14,15]
; AVX2OR512-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2OR512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7]
; AVX2OR512-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX2OR512-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[1,0,3,2,4,5,6,7]
; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0]
; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm4 = [3,3,1,1,7,7,5,5,1,1,5,5,0,0,1,1]
; AVX2OR512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2OR512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2OR512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; AVX2OR512-NEXT: vpaddw %xmm3, %xmm0, %xmm0
; AVX2OR512-NEXT: vpmullw %xmm0, %xmm1, %xmm0
; AVX2OR512-NEXT: vzeroupper
; AVX2OR512-NEXT: retq
%wide.vec = load <32 x i8>, <32 x i8>* %ptr, align 16
%v1 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
%v2 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
@ -981,21 +939,21 @@ define void @interleaved_store_vf8_i8_stride4(<8 x i8> %x1, <8 x i8> %x2, <8 x i
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX-LABEL: interleaved_store_vf8_i8_stride4:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vpshufb %xmm4, %xmm3, %xmm1
; AVX-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX-NEXT: vmovdqa %ymm0, (%rdi)
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX2OR512-LABEL: interleaved_store_vf8_i8_stride4:
; AVX2OR512: # %bb.0:
; AVX2OR512-NEXT: vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX2OR512-NEXT: vpshufb %xmm4, %xmm1, %xmm1
; AVX2OR512-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2OR512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512-NEXT: vpshufb %xmm4, %xmm3, %xmm1
; AVX2OR512-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2OR512-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; AVX2OR512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2OR512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; AVX2OR512-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0
; AVX2OR512-NEXT: vmovdqa %ymm0, (%rdi)
; AVX2OR512-NEXT: vzeroupper
; AVX2OR512-NEXT: retq
%v1 = shufflevector <8 x i8> %x1, <8 x i8> %x2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%v2 = shufflevector <8 x i8> %x3, <8 x i8> %x4, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%interleaved.vec = shufflevector <16 x i8> %v1, <16 x i8> %v2, <32 x i32> <i32 0,i32 8,i32 16,i32 24,i32 1,i32 9,i32 17,i32 25,i32 2,i32 10,i32 18,i32 26,i32 3,i32 11,i32 19,i32 27,i32 4,i32 12,i32 20,i32 28,i32 5,i32 13,i32 21,i32 29,i32 6,i32 14,i32 22,i32 30,i32 7,i32 15,i32 23,i32 31>
@ -1050,29 +1008,29 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX-LABEL: interleaved_load_vf32_i8_stride3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
; AVX-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
; AVX-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
; AVX-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
; AVX-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
; AVX-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
; AVX-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
; AVX-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
; AVX-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX-NEXT: retq
; AVX2OR512-LABEL: interleaved_load_vf32_i8_stride3:
; AVX2OR512: # %bb.0:
; AVX2OR512-NEXT: vmovdqa (%rdi), %xmm0
; AVX2OR512-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX2OR512-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX2OR512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
; AVX2OR512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
; AVX2OR512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13,0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
; AVX2OR512-NEXT: vpshufb %ymm3, %ymm0, %ymm0
; AVX2OR512-NEXT: vpshufb %ymm3, %ymm1, %ymm1
; AVX2OR512-NEXT: vpshufb %ymm3, %ymm2, %ymm2
; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm3 = ymm2[11,12,13,14,15],ymm0[0,1,2,3,4,5,6,7,8,9,10],ymm2[27,28,29,30,31],ymm0[16,17,18,19,20,21,22,23,24,25,26]
; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm1 = ymm1[11,12,13,14,15],ymm2[0,1,2,3,4,5,6,7,8,9,10],ymm1[27,28,29,30,31],ymm2[16,17,18,19,20,21,22,23,24,25,26]
; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm2 = ymm3[11,12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10],ymm3[27,28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26]
; AVX2OR512-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
; AVX2OR512-NEXT: vpaddb %ymm2, %ymm1, %ymm1
; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0
; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
; AVX2OR512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX2OR512-NEXT: retq
%wide.vec = load <96 x i8>, <96 x i8>* %ptr
%v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
%v2 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22,i32 25,i32 28,i32 31,i32 34,i32 37,i32 40,i32 43,i32 46,i32 49,i32 52,i32 55,i32 58,i32 61,i32 64,i32 67,i32 70,i32 73,i32 76,i32 79,i32 82,i32 85,i32 88,i32 91,i32 94>
@ -1083,28 +1041,6 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(<96 x i8>* %ptr){
}
define <16 x i8> @interleaved_load_vf16_i8_stride3(<48 x i8>* %ptr){
; AVX1-LABEL: interleaved_load_vf16_i8_stride3:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa (%rdi), %xmm0
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm2
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13]
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm2[11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX1-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vpaddb %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,xmm0[11,12,13,14,15],zero,zero,zero,zero,zero
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[5,6,7,8,9,10],zero,zero,zero,zero,zero,xmm3[0,1,2,3,4]
; AVX1-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vpaddb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX-LABEL: interleaved_load_vf16_i8_stride3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
@ -1154,23 +1090,23 @@ define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX-LABEL: interleaved_load_vf8_i8_stride3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %ymm0
; AVX-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddw %xmm0, %xmm3, %xmm0
; AVX-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX-NEXT: vzeroupper
; AVX-NEXT: retq
; AVX2OR512-LABEL: interleaved_load_vf8_i8_stride3:
; AVX2OR512: # %bb.0:
; AVX2OR512-NEXT: vmovdqa (%rdi), %ymm0
; AVX2OR512-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,2,u,5,u]
; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,u,3,u,6,u,9,u,12,u,15,u],zero,xmm0[u],zero,xmm0[u]
; AVX2OR512-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,0,u,3,u,6,u]
; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,u,4,u,7,u,10,u,13,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
; AVX2OR512-NEXT: vpor %xmm3, %xmm4, %xmm3
; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u],zero,xmm1[u,1,u,4,u,7,u]
; AVX2OR512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,5,u,8,u,11,u,14,u],zero,xmm0[u],zero,xmm0[u],zero,xmm0[u]
; AVX2OR512-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2OR512-NEXT: vpaddw %xmm0, %xmm3, %xmm0
; AVX2OR512-NEXT: vpaddw %xmm0, %xmm2, %xmm0
; AVX2OR512-NEXT: vzeroupper
; AVX2OR512-NEXT: retq
%wide.vec = load <24 x i8>, <24 x i8>* %ptr
%v1 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21>
%v2 = shufflevector <24 x i8> %wide.vec, <24 x i8> undef,<8 x i32> <i32 1,i32 4,i32 7,i32 10,i32 13,i32 16,i32 19,i32 22>
@ -1181,23 +1117,6 @@ define <8 x i8> @interleaved_load_vf8_i8_stride3(<24 x i8>* %ptr){
}
define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <24 x i8>* %p) {
; AVX1-LABEL: interleaved_store_vf8_i8_stride3:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm1
; AVX1-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
; AVX1-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero
; AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u]
; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, 16(%rdi)
; AVX1-NEXT: vmovdqu %xmm2, (%rdi)
; AVX1-NEXT: retq
;
; AVX-LABEL: interleaved_store_vf8_i8_stride3:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>