llvm-project/llvm/test/CodeGen/X86/avx-splat.ll

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s


; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; CHECK-NEXT: vinsertf128 $1
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <32 x i8> %shuffle
}

; CHECK: vpunpckhwd %xmm
; CHECK-NEXT: vpshufd $85
; CHECK-NEXT: vinsertf128 $1
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i16> %shuffle
}

; CHECK: vmovq
; CHECK-NEXT: vpunpcklqdq %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
entry:
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
  ret <4 x i64> %vecinit6.i
}

; CHECK: vmovlhps %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
entry:
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
  ret <4 x double> %vecinit6.i
}

; Test this turns into a broadcast:
;   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
;   
; CHECK: vbroadcastss
define <8 x float> @funcE() nounwind {
allocas:
  %udx495 = alloca [18 x [18 x float]], align 32
  br label %for_test505.preheader

for_test505.preheader:                            ; preds = %for_test505.preheader, %allocas
  br i1 undef, label %for_exit499, label %for_test505.preheader

for_exit499:                                      ; preds = %for_test505.preheader
  br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247

load.i1247:                                       ; preds = %for_exit499
  %ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1
  %ptr.i1237 = bitcast float* %ptr1227 to i32*
  %val.i1238 = load i32* %ptr.i1237, align 4
  %ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6
  %ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7
  %phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>
  br label %__load_and_broadcast_32.exit1249

__load_and_broadcast_32.exit1249:                 ; preds = %load.i1247, %for_exit499
  %load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]
  ret <8 x float> %load_broadcast12281250
}

; CHECK: vpshufd $0
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcF(i32 %val) nounwind {
  %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
  %ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
  %tmp = bitcast <8 x i32> %ret7 to <8 x float>
  ret <8 x float> %tmp
}

; CHECK: vpshufd  $0
; CHECK-NEXT: vinsertf128  $1
define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x float> %shuffle
}

; CHECK: vextractf128  $1
; CHECK-NEXT: vpshufd
; CHECK-NEXT: vinsertf128  $1
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x float> %shuffle
}
Add support for 256-bit versions of VPERMIL instruction. This is a new instruction introduced in AVX, which can operate on 128 and 256-bit vectors. It considers a 256-bit vector as two independent 128-bit lanes. It can permute any 32 or 64 elements inside a lane, and restricts the second lane to have the same permutation of the first one. With the improved splat support introduced early today, adding codegen for this instruction enable more efficient 256-bit code: Instead of: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vextractf128 $1, %ymm0, %xmm1 shufps $1, %xmm1, %xmm1 movss %xmm1, 28(%rsp) movss %xmm1, 24(%rsp) movss %xmm1, 20(%rsp) movss %xmm1, 16(%rsp) vextractf128 $0, %ymm0, %xmm0 shufps $1, %xmm0, %xmm0 movss %xmm0, 12(%rsp) movss %xmm0, 8(%rsp) movss %xmm0, 4(%rsp) movss %xmm0, (%rsp) vmovaps (%rsp), %ymm0 We get: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vpermilps $85, %ymm0, %ymm0 llvm-svn: 135662 2011-07-21 09:55:47 +08:00			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx \| FileCheck %s`


[x86] Teach the target shuffle mask extraction to recognize unary forms of normally binary shuffle instructions like PUNPCKL and MOVLHPS. This detects cases where a single register is used for both operands making the shuffle behave in a unary way. We detect this and adjust the mask to use the unary form which allows the existing DAG combine for shuffle instructions to actually work at all. As a consequence, this uncovered a number of obvious bugs in the existing DAG combine which are fixed. It also now canonicalizes several shuffles even with the existing lowering. These typically are trying to match the shuffle to the domain of the input where before we only really modeled them with the floating point variants. All of the cases which change to an integer shuffle here have something in the integer domain, so there are no more or fewer domain crosses here AFAICT. Technically, it might be better to go from a GPR directly to the floating point domain, but detecting floating point outputs despite integer inputs is a lot more code and seems unlikely to be worthwhile in practice. If folks are seeing domain-crossing regressions here though, let me know and I can hack something up to fix it. Also as a consequence, a bunch of missed opportunities to form pshufb now can be formed. Notably, splats of i8s now form pshufb. Interestingly, this improves the existing splat lowering too. We go from 3 instructions to 1. Yes, we may tie up a register, but it seems very likely to be worth it, especially if splatting the 0th byte (the common case) as then we can use a zeroed register as the mask. llvm-svn: 214625 2014-08-02 18:27:38 +08:00			`; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]`
Add support for 256-bit versions of VPERMIL instruction. This is a new instruction introduced in AVX, which can operate on 128 and 256-bit vectors. It considers a 256-bit vector as two independent 128-bit lanes. It can permute any 32 or 64 elements inside a lane, and restricts the second lane to have the same permutation of the first one. With the improved splat support introduced early today, adding codegen for this instruction enable more efficient 256-bit code: Instead of: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vextractf128 $1, %ymm0, %xmm1 shufps $1, %xmm1, %xmm1 movss %xmm1, 28(%rsp) movss %xmm1, 24(%rsp) movss %xmm1, 20(%rsp) movss %xmm1, 16(%rsp) vextractf128 $0, %ymm0, %xmm0 shufps $1, %xmm0, %xmm0 movss %xmm0, 12(%rsp) movss %xmm0, 8(%rsp) movss %xmm0, 4(%rsp) movss %xmm0, (%rsp) vmovaps (%rsp), %ymm0 We get: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vpermilps $85, %ymm0, %ymm0 llvm-svn: 135662 2011-07-21 09:55:47 +08:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>`
			`ret <32 x i8> %shuffle`
			`}`

Add support for breaking 256-bit v16i16 and v32i8 VSETCC into two 128-bit ones, avoiding sclarization. Add vex form of pcmpeqq and pcmpgtq. Fixes more cases for PR10712. llvm-svn: 138321 2011-08-23 12:36:33 +08:00			`; CHECK: vpunpckhwd %xmm`
X86: Do splat promotion later, so the optimizer can chew on it first. This catches many cases where we can emit a more efficient shuffle for a specific mask or when the mask contains undefs. Once the splat is lowered to unpacks we can't do that anymore. There is a possibility of moving the promotion after pshufb matching, but I'm not sure if pshufb with a mask loaded from memory is faster than 3 shuffles, so I avoided that for now. llvm-svn: 173569 2013-01-26 19:44:21 +08:00			`; CHECK-NEXT: vpshufd $85`
- Register v16i16 as valid VR256 register class - Add more bitcasts for v16i16 - Since 135661 and 135662 already added the splat logic, just add one more splat test for v16i16 llvm-svn: 135663 2011-07-21 10:24:08 +08:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>`
			`ret <16 x i16> %shuffle`
			`}`

Remove some instructions that existed to provide aliases to the assembler. Can be done with InstAlias instead. Unfortunately, this was causing printer to use 'vmovq' or 'vmovd' based on what was parsed. To cleanup the inconsistencies convert all 'vmovd' with 64-bit registers to 'vmovq', but provide an alias so that 'vmovd' will still parse. llvm-svn: 192171 2013-10-08 13:53:50 +08:00			`; CHECK: vmovq`
[x86] Teach the target shuffle mask extraction to recognize unary forms of normally binary shuffle instructions like PUNPCKL and MOVLHPS. This detects cases where a single register is used for both operands making the shuffle behave in a unary way. We detect this and adjust the mask to use the unary form which allows the existing DAG combine for shuffle instructions to actually work at all. As a consequence, this uncovered a number of obvious bugs in the existing DAG combine which are fixed. It also now canonicalizes several shuffles even with the existing lowering. These typically are trying to match the shuffle to the domain of the input where before we only really modeled them with the floating point variants. All of the cases which change to an integer shuffle here have something in the integer domain, so there are no more or fewer domain crosses here AFAICT. Technically, it might be better to go from a GPR directly to the floating point domain, but detecting floating point outputs despite integer inputs is a lot more code and seems unlikely to be worthwhile in practice. If folks are seeing domain-crossing regressions here though, let me know and I can hack something up to fix it. Also as a consequence, a bunch of missed opportunities to form pshufb now can be formed. Notably, splats of i8s now form pshufb. Interestingly, this improves the existing splat lowering too. We go from 3 instructions to 1. Yes, we may tie up a register, but it seems very likely to be worth it, especially if splatting the 0th byte (the common case) as then we can use a zeroed register as the mask. llvm-svn: 214625 2014-08-02 18:27:38 +08:00			`; CHECK-NEXT: vpunpcklqdq %xmm`
- Handle special scalar_to_vector case: splats. Using a native 128-bit shuffle before inserting on a 256-bit vector. - Add AVX versions of movd/movq instructions - Introduce a few COPY patterns to match insert_subvector instructions. This turns a trivial insert_subvector instruction into a register copy, coalescing the xmm into a ymm and avoid emiting on more instruction. llvm-svn: 136002 2011-07-26 07:05:25 +08:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {`
			`entry:`
			`%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0`
			`%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1`
			`%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2`
			`%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3`
			`ret <4 x i64> %vecinit6.i`
			`}`

[x86] Add a much more powerful framework for combining x86 shuffle instructions in the legalized DAG, and leverage it to combine long sequences of instructions to PSHUFB. Eventually, the other x86-instruction-specific shuffle combines will probably all be driven out of this routine. But the real motivation is to detect after we have fully legalized and optimized a shuffle to the minimal number of x86 instructions whether it is profitable to replace the chain with a fully generic PSHUFB instruction even though doing so requires either a load from a constant pool or tying up a register with the mask. While the Intel manuals claim it should be used when it replaces 5 or more instructions (!!!!) my experience is that it is actually very fast on modern chips, and so I've gon with a much more aggressive model of replacing any sequence of 3 or more instructions. I've also taught it to do some basic canonicalization to special-purpose instructions which have smaller encodings than their generic counterparts. There are still quite a few FIXMEs here, and I've not yet implemented support for lowering blends with PSHUFB (where its power really shines due to being able to zero out lanes), but this starts implementing real PSHUFB support even when using the new, fancy shuffle lowering. =] llvm-svn: 214042 2014-07-27 09:15:58 +08:00			`; CHECK: vmovlhps %xmm`
Fix a nasty bug where a v4i64 was being wrong emitted with 32-bit permutations. Also tidy up some patterns and make them close to their instruction definition! llvm-svn: 138392 2011-08-24 06:06:37 +08:00			`; CHECK-NEXT: vinsertf128 $1`
- Handle special scalar_to_vector case: splats. Using a native 128-bit shuffle before inserting on a 256-bit vector. - Add AVX versions of movd/movq instructions - Introduce a few COPY patterns to match insert_subvector instructions. This turns a trivial insert_subvector instruction into a register copy, coalescing the xmm into a ymm and avoid emiting on more instruction. llvm-svn: 136002 2011-07-26 07:05:25 +08:00			`define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {`
			`entry:`
			`%vecinit.i = insertelement <4 x double> undef, double %q, i32 0`
			`%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1`
			`%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2`
			`%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3`
			`ret <4 x double> %vecinit6.i`
			`}`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 llvm-svn: 136691 2011-08-03 00:06:18 +08:00
[x86] Re-apply a variant of the x86 side of r212324 now that the rest has settled without incident, removing the x86-specific and overly strict 'isVectorSplat' routine in favor of generic and more powerful splat detection. The primary motivation and result of this is that the x86 backend can now see through splats which contain undef elements. This is essential if we are using a widening form of legalization and I've updated a test case to also run in that mode as before this change the generated code for the test case was completely scalarized. This version of the patch much more carefully handles the undef lanes. - We aren't overly conservative about them in the shift lowering (where we will never use the splat itself). - One place where the splat would have been re-used by the existing code now explicitly constructs a new constant splat that will be safe. - The broadcast lowering is much more reasonable with undefs by doing a correct check of whether the splat is the only user of a loaded value, checking that the splat actually crosses multiple lanes before using a broadcast, and handling broadcasts of non-constant splats. As a consequence of the last bullet, the weird usage of vpshufd instead of vbroadcast is gone, and we actually can lower an AVX splat with vbroadcastss where before we emitted a really strange pattern of a vector load and a manual splat across the vector. llvm-svn: 212602 2014-07-09 18:06:58 +08:00			`; Test this turns into a broadcast:`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 llvm-svn: 136691 2011-08-03 00:06:18 +08:00			`; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>`
[x86] Re-apply a variant of the x86 side of r212324 now that the rest has settled without incident, removing the x86-specific and overly strict 'isVectorSplat' routine in favor of generic and more powerful splat detection. The primary motivation and result of this is that the x86 backend can now see through splats which contain undef elements. This is essential if we are using a widening form of legalization and I've updated a test case to also run in that mode as before this change the generated code for the test case was completely scalarized. This version of the patch much more carefully handles the undef lanes. - We aren't overly conservative about them in the shift lowering (where we will never use the splat itself). - One place where the splat would have been re-used by the existing code now explicitly constructs a new constant splat that will be safe. - The broadcast lowering is much more reasonable with undefs by doing a correct check of whether the splat is the only user of a loaded value, checking that the splat actually crosses multiple lanes before using a broadcast, and handling broadcasts of non-constant splats. As a consequence of the last bullet, the weird usage of vpshufd instead of vbroadcast is gone, and we actually can lower an AVX splat with vbroadcastss where before we emitted a really strange pattern of a vector load and a manual splat across the vector. llvm-svn: 212602 2014-07-09 18:06:58 +08:00			`;`
			`; CHECK: vbroadcastss`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 llvm-svn: 137296 2011-08-11 10:49:44 +08:00			`define <8 x float> @funcE() nounwind {`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 llvm-svn: 136691 2011-08-03 00:06:18 +08:00			`allocas:`
			`%udx495 = alloca [18 x [18 x float]], align 32`
			`br label %for_test505.preheader`

			`for_test505.preheader: ; preds = %for_test505.preheader, %allocas`
			`br i1 undef, label %for_exit499, label %for_test505.preheader`

			`for_exit499: ; preds = %for_test505.preheader`
			`br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247`

			`load.i1247: ; preds = %for_exit499`
			`%ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1`
			`%ptr.i1237 = bitcast float* %ptr1227 to i32*`
			`%val.i1238 = load i32* %ptr.i1237, align 4`
			`%ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6`
			`%ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7`
			`%phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>`
			`br label %__load_and_broadcast_32.exit1249`

			`__load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_exit499`
			`%load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 llvm-svn: 137296 2011-08-11 10:49:44 +08:00			`ret <8 x float> %load_broadcast12281250`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 llvm-svn: 136691 2011-08-03 00:06:18 +08:00			`}`

Normalize splat 256bit vectors with 8 elements. llvm-svn: 168600 2012-11-27 03:24:31 +08:00			`; CHECK: vpshufd $0`
			`; CHECK-NEXT: vinsertf128 $1`
Update test to not use the scalar type to splat from a load llvm-svn: 137809 2011-08-17 10:29:15 +08:00			`define <8 x float> @funcF(i32 %val) nounwind {`
Use the splat index to generate the desired shuffle. Otherwise we could only get undefs and the vector shuffle becomes an undef, generating wrong code. llvm-svn: 137295 2011-08-11 10:49:41 +08:00			`%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6`
			`%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7`
			`%tmp = bitcast <8 x i32> %ret7 to <8 x float>`
			`ret <8 x float> %tmp`
			`}`

X86: Prefer using VPSHUFD over VPERMIL because it has better throughput. llvm-svn: 169624 2012-12-08 03:01:13 +08:00			`; CHECK: vpshufd $0`
Normalize splat 256bit vectors with 8 elements. llvm-svn: 168600 2012-11-27 03:24:31 +08:00			`; CHECK-NEXT: vinsertf128 $1`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 llvm-svn: 137296 2011-08-11 10:49:44 +08:00			`define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>`
			`ret <8 x float> %shuffle`
			`}`

			`; CHECK: vextractf128 $1`
X86: Prefer using VPSHUFD over VPERMIL because it has better throughput. llvm-svn: 169624 2012-12-08 03:01:13 +08:00			`; CHECK-NEXT: vpshufd`
Normalize splat 256bit vectors with 8 elements. llvm-svn: 168600 2012-11-27 03:24:31 +08:00			`; CHECK-NEXT: vinsertf128 $1`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 llvm-svn: 137296 2011-08-11 10:49:44 +08:00			`define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>`
			`ret <8 x float> %shuffle`
			`}`