Add support for 256-bit versions of the VPERMIL instruction. This is a new
instruction introduced in AVX, which can operate on 128-bit and 256-bit vectors.
It considers a 256-bit vector as two independent 128-bit lanes. It can permute
any 32-bit or 64-bit elements inside a lane, and restricts the second lane to
have the same permutation as the first one. With the improved splat support
introduced earlier today, adding codegen for this instruction enables more
efficient 256-bit code:
Instead of:
vextractf128 $0, %ymm0, %xmm0
punpcklbw %xmm0, %xmm0
punpckhbw %xmm0, %xmm0
vinsertf128 $0, %xmm0, %ymm0, %ymm1
vinsertf128 $1, %xmm0, %ymm1, %ymm0
vextractf128 $1, %ymm0, %xmm1
shufps $1, %xmm1, %xmm1
movss %xmm1, 28(%rsp)
movss %xmm1, 24(%rsp)
movss %xmm1, 20(%rsp)
movss %xmm1, 16(%rsp)
vextractf128 $0, %ymm0, %xmm0
shufps $1, %xmm0, %xmm0
movss %xmm0, 12(%rsp)
movss %xmm0, 8(%rsp)
movss %xmm0, 4(%rsp)
movss %xmm0, (%rsp)
vmovaps (%rsp), %ymm0
We get:
vextractf128 $0, %ymm0, %xmm0
punpcklbw %xmm0, %xmm0
punpckhbw %xmm0, %xmm0
vinsertf128 $0, %xmm0, %ymm0, %ymm1
vinsertf128 $1, %xmm0, %ymm1, %ymm0
vpermilps $85, %ymm0, %ymm0
llvm-svn: 135662
2011-07-21 09:55:47 +08:00
|
|
|
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
|
|
|
|
|
|
|
|
|
[x86] Teach the target shuffle mask extraction to recognize unary forms
of normally binary shuffle instructions like PUNPCKL and MOVLHPS.
This detects cases where a single register is used for both operands
making the shuffle behave in a unary way. We detect this and adjust the
mask to use the unary form which allows the existing DAG combine for
shuffle instructions to actually work at all.
As a consequence, this uncovered a number of obvious bugs in the
existing DAG combine which are fixed. It also now canonicalizes several
shuffles even with the existing lowering. These typically are trying to
match the shuffle to the domain of the input where before we only really
modeled them with the floating point variants. All of the cases which
change to an integer shuffle here have something in the integer domain, so
there are no more or fewer domain crosses here AFAICT. Technically, it
might be better to go from a GPR directly to the floating point domain,
but detecting floating point *outputs* despite integer inputs is a lot
more code and seems unlikely to be worthwhile in practice. If folks are
seeing domain-crossing regressions here though, let me know and I can
hack something up to fix it.
Also as a consequence, a bunch of previously missed opportunities to form
pshufb are now caught. Notably, splats of i8s now form pshufb.
Interestingly, this improves the existing splat lowering too. We go from
3 instructions to 1. Yes, we may tie up a register, but it seems very
likely to be worth it, especially if splatting the 0th byte (the
common case) as then we can use a zeroed register as the mask.
llvm-svn: 214625
2014-08-02 18:27:38 +08:00
|
|
|
; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
|
Add support for 256-bit versions of the VPERMIL instruction. This is a new
instruction introduced in AVX, which can operate on 128-bit and 256-bit vectors.
It considers a 256-bit vector as two independent 128-bit lanes. It can permute
any 32-bit or 64-bit elements inside a lane, and restricts the second lane to
have the same permutation as the first one. With the improved splat support
introduced earlier today, adding codegen for this instruction enables more
efficient 256-bit code:
Instead of:
vextractf128 $0, %ymm0, %xmm0
punpcklbw %xmm0, %xmm0
punpckhbw %xmm0, %xmm0
vinsertf128 $0, %xmm0, %ymm0, %ymm1
vinsertf128 $1, %xmm0, %ymm1, %ymm0
vextractf128 $1, %ymm0, %xmm1
shufps $1, %xmm1, %xmm1
movss %xmm1, 28(%rsp)
movss %xmm1, 24(%rsp)
movss %xmm1, 20(%rsp)
movss %xmm1, 16(%rsp)
vextractf128 $0, %ymm0, %xmm0
shufps $1, %xmm0, %xmm0
movss %xmm0, 12(%rsp)
movss %xmm0, 8(%rsp)
movss %xmm0, 4(%rsp)
movss %xmm0, (%rsp)
vmovaps (%rsp), %ymm0
We get:
vextractf128 $0, %ymm0, %xmm0
punpcklbw %xmm0, %xmm0
punpckhbw %xmm0, %xmm0
vinsertf128 $0, %xmm0, %ymm0, %ymm1
vinsertf128 $1, %xmm0, %ymm1, %ymm0
vpermilps $85, %ymm0, %ymm0
llvm-svn: 135662
2011-07-21 09:55:47 +08:00
|
|
|
; CHECK-NEXT: vinsertf128 $1
|
|
|
|
; funcA: byte splat on a 256-bit vector. Every index in the shuffle mask is 5
; and the second shuffle operand is undef, so all 32 lanes of the <32 x i8>
; result are copies of element 5 of %a.
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
|
|
|
|
entry:
|
|
|
|
%shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
|
|
|
|
ret <32 x i8> %shuffle
|
|
|
|
}
|
|
|
|
|
2011-08-23 12:36:33 +08:00
|
|
|
; CHECK: vpunpckhwd %xmm
|
2013-01-26 19:44:21 +08:00
|
|
|
; CHECK-NEXT: vpshufd $85
|
2011-07-21 10:24:08 +08:00
|
|
|
; CHECK-NEXT: vinsertf128 $1
|
|
|
|
; funcB: 16-bit element splat on a 256-bit vector. Every index in the shuffle
; mask is 5 and the second shuffle operand is undef, so all 16 lanes of the
; <16 x i16> result are copies of element 5 of %a.
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
|
|
|
|
entry:
|
|
|
|
%shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
|
|
|
|
ret <16 x i16> %shuffle
|
|
|
|
}
|
|
|
|
|
2013-10-08 13:53:50 +08:00
|
|
|
; CHECK: vmovq
|
2014-09-15 18:35:41 +08:00
|
|
|
; CHECK-NEXT: vmovlhps %xmm
|
2011-07-26 07:05:25 +08:00
|
|
|
; CHECK-NEXT: vinsertf128 $1
|
|
|
|
; funcC: splat of a scalar i64 argument. Starting from an undef <4 x i64>,
; the same value %q is inserted into lanes 0, 1, 2 and 3 in turn, so the
; returned vector holds %q in every lane.
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
|
|
|
|
entry:
|
|
|
|
%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
|
|
|
|
%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
|
|
|
|
%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
|
|
|
|
%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
|
|
|
|
ret <4 x i64> %vecinit6.i
|
|
|
|
}
|
|
|
|
|
2014-09-15 06:41:37 +08:00
|
|
|
; CHECK: vunpcklpd %xmm
|
2011-08-24 06:06:37 +08:00
|
|
|
; CHECK-NEXT: vinsertf128 $1
|
2011-07-26 07:05:25 +08:00
|
|
|
; funcD: splat of a scalar double argument. Starting from an undef
; <4 x double>, the same value %q is inserted into lanes 0, 1, 2 and 3 in
; turn, so the returned vector holds %q in every lane.
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
|
|
|
|
entry:
|
|
|
|
%vecinit.i = insertelement <4 x double> undef, double %q, i32 0
|
|
|
|
%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
|
|
|
|
%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
|
|
|
|
%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
|
|
|
|
ret <4 x double> %vecinit6.i
|
|
|
|
}
|
Support this kind of lowering with 256-bit instructions:
shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
To:
shuffle (vload ptr), undef, <1, 1, 1, 1>
Fix PR10494
llvm-svn: 136691
2011-08-03 00:06:18 +08:00
|
|
|
|
2014-07-09 18:06:58 +08:00
|
|
|
; Test that this turns into a broadcast:
|
Support this kind of lowering with 256-bit instructions:
shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
To:
shuffle (vload ptr), undef, <1, 1, 1, 1>
Fix PR10494
llvm-svn: 136691
2011-08-03 00:06:18 +08:00
|
|
|
; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
|
2014-07-09 18:06:58 +08:00
|
|
|
;
|
|
|
|
; CHECK: vbroadcastss
|
2011-08-11 10:49:44 +08:00
|
|
|
; funcE: load one 32-bit value from a local array and place it in lanes 6 and 7
; of an otherwise-undef <8 x i32>, then return it reinterpreted as <8 x float>.
; Control flow: allocas -> for_test505.preheader (self-loop on an undef
; condition) -> for_exit499 -> either load.i1247 (performs the load) or
; directly to the exit block, where a phi selects the loaded vector or undef.
define <8 x float> @funcE() nounwind {
|
Make this kind of lowering to be supported by 256-bit instructions:
shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
To:
shuffle (vload ptr)), undef, <1, 1, 1, 1>
Fix PR10494
llvm-svn: 136691
2011-08-03 00:06:18 +08:00
|
|
|
allocas:
|
|
|
|
%udx495 = alloca [18 x [18 x float]], align 32
|
|
|
|
br label %for_test505.preheader
|
|
|
|
|
|
|
|
for_test505.preheader: ; preds = %for_test505.preheader, %allocas
|
|
|
|
br i1 undef, label %for_exit499, label %for_test505.preheader
|
|
|
|
|
|
|
|
for_exit499: ; preds = %for_test505.preheader
|
|
|
|
br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247
|
|
|
|
|
|
|
|
load.i1247: ; preds = %for_exit499
|
|
|
|
; Address of element [1][1] of the 18x18 float array, reinterpreted as an i32
; pointer so the value is loaded as a 32-bit integer.
%ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1
|
|
|
|
%ptr.i1237 = bitcast float* %ptr1227 to i32*
|
|
|
|
%val.i1238 = load i32* %ptr.i1237, align 4
|
|
|
|
; Only lanes 6 and 7 receive the loaded value; lanes 0-5 stay undef.
%ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6
|
|
|
|
%ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7
|
|
|
|
%phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>
|
|
|
|
br label %__load_and_broadcast_32.exit1249
|
|
|
|
|
|
|
|
__load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_exit499
|
|
|
|
%load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]
|
2011-08-11 10:49:44 +08:00
|
|
|
ret <8 x float> %load_broadcast12281250
|
Make this kind of lowering to be supported by 256-bit instructions:
shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
To:
shuffle (vload ptr)), undef, <1, 1, 1, 1>
Fix PR10494
llvm-svn: 136691
2011-08-03 00:06:18 +08:00
|
|
|
}
|
|
|
|
|
2012-11-27 03:24:31 +08:00
|
|
|
; CHECK: vpshufd $0
|
|
|
|
; CHECK-NEXT: vinsertf128 $1
|
2011-08-17 10:29:15 +08:00
|
|
|
; funcF: insert the scalar argument %val into lanes 6 and 7 of an undef
; <8 x i32> (lanes 0-5 stay undef) and return the result bitcast to
; <8 x float>.
define <8 x float> @funcF(i32 %val) nounwind {
|
2011-08-11 10:49:41 +08:00
|
|
|
%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
|
|
|
|
%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
|
|
|
|
%tmp = bitcast <8 x i32> %ret7 to <8 x float>
|
|
|
|
ret <8 x float> %tmp
|
|
|
|
}
|
|
|
|
|
2012-12-08 03:01:13 +08:00
|
|
|
; CHECK: vpshufd $0
|
2012-11-27 03:24:31 +08:00
|
|
|
; CHECK-NEXT: vinsertf128 $1
|
2011-08-11 10:49:44 +08:00
|
|
|
; funcG: float splat on a 256-bit vector. Every index in the shuffle mask is 0
; and the second shuffle operand is undef, so all 8 lanes of the <8 x float>
; result are copies of element 0 of %a.
define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
|
|
|
|
entry:
|
|
|
|
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
|
|
|
|
ret <8 x float> %shuffle
|
|
|
|
}
|
|
|
|
|
|
|
|
; CHECK: vextractf128 $1
|
2012-12-08 03:01:13 +08:00
|
|
|
; CHECK-NEXT: vpshufd
|
2012-11-27 03:24:31 +08:00
|
|
|
; CHECK-NEXT: vinsertf128 $1
|
2011-08-11 10:49:44 +08:00
|
|
|
; funcH: float splat on a 256-bit vector. Every index in the shuffle mask is 5
; and the second shuffle operand is undef, so all 8 lanes of the <8 x float>
; result are copies of element 5 of %a. Index 5 lies in the upper four
; elements of the 8-element source.
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
|
|
|
|
entry:
|
|
|
|
%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
|
|
|
|
ret <8 x float> %shuffle
|
|
|
|
}
|
|
|
|
|