llvm-project/llvm/test/CodeGen/X86/avx-splat.ll

; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s


; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]
; CHECK-NEXT: vinsertf128 $1
define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <32 x i8> %shuffle
}

; CHECK: vpunpckhwd %xmm
; CHECK-NEXT: vpshufd $85
; CHECK-NEXT: vinsertf128 $1
define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i16> %shuffle
}

; CHECK: vmovq
; CHECK-NEXT: vmovlhps %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {
entry:
  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
  ret <4 x i64> %vecinit6.i
}

; CHECK: vunpcklpd %xmm
; CHECK-NEXT: vinsertf128 $1
define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {
entry:
  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
  ret <4 x double> %vecinit6.i
}

; Test this turns into a broadcast:
;   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
;   
; CHECK: vbroadcastss
define <8 x float> @funcE() nounwind {
allocas:
  %udx495 = alloca [18 x [18 x float]], align 32
  br label %for_test505.preheader

for_test505.preheader:                            ; preds = %for_test505.preheader, %allocas
  br i1 undef, label %for_exit499, label %for_test505.preheader

for_exit499:                                      ; preds = %for_test505.preheader
  br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247

load.i1247:                                       ; preds = %for_exit499
  %ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1
  %ptr.i1237 = bitcast float* %ptr1227 to i32*
  %val.i1238 = load i32* %ptr.i1237, align 4
  %ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6
  %ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7
  %phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>
  br label %__load_and_broadcast_32.exit1249

__load_and_broadcast_32.exit1249:                 ; preds = %load.i1247, %for_exit499
  %load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]
  ret <8 x float> %load_broadcast12281250
}

; CHECK: vpshufd $0
; CHECK-NEXT: vinsertf128 $1
define <8 x float> @funcF(i32 %val) nounwind {
  %ret6 = insertelement <8 x i32> undef, i32 %val, i32 6
  %ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7
  %tmp = bitcast <8 x i32> %ret7 to <8 x float>
  ret <8 x float> %tmp
}

; CHECK: vpshufd  $0
; CHECK-NEXT: vinsertf128  $1
define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
  ret <8 x float> %shuffle
}

; CHECK: vextractf128  $1
; CHECK-NEXT: vpshufd
; CHECK-NEXT: vinsertf128  $1
define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {
entry:
  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <8 x float> %shuffle
}
Add support for 256-bit versions of VPERMIL instruction. This is a new instruction introduced in AVX, which can operate on 128 and 256-bit vectors. It considers a 256-bit vector as two independent 128-bit lanes. It can permute any 32 or 64 elements inside a lane, and restricts the second lane to have the same permutation of the first one. With the improved splat support introduced early today, adding codegen for this instruction enable more efficient 256-bit code: Instead of: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vextractf128 $1, %ymm0, %xmm1 shufps $1, %xmm1, %xmm1 movss %xmm1, 28(%rsp) movss %xmm1, 24(%rsp) movss %xmm1, 20(%rsp) movss %xmm1, 16(%rsp) vextractf128 $0, %ymm0, %xmm0 shufps $1, %xmm0, %xmm0 movss %xmm0, 12(%rsp) movss %xmm0, 8(%rsp) movss %xmm0, 4(%rsp) movss %xmm0, (%rsp) vmovaps (%rsp), %ymm0 We get: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vpermilps $85, %ymm0, %ymm0 llvm-svn: 135662 2011-07-21 09:55:47 +08:00			`; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx \| FileCheck %s`


[x86] Teach the target shuffle mask extraction to recognize unary forms of normally binary shuffle instructions like PUNPCKL and MOVLHPS. This detects cases where a single register is used for both operands making the shuffle behave in a unary way. We detect this and adjust the mask to use the unary form which allows the existing DAG combine for shuffle instructions to actually work at all. As a consequence, this uncovered a number of obvious bugs in the existing DAG combine which are fixed. It also now canonicalizes several shuffles even with the existing lowering. These typically are trying to match the shuffle to the domain of the input where before we only really modeled them with the floating point variants. All of the cases which change to an integer shuffle here have something in the integer domain, so there are no more or fewer domain crosses here AFAICT. Technically, it might be better to go from a GPR directly to the floating point domain, but detecting floating point outputs despite integer inputs is a lot more code and seems unlikely to be worthwhile in practice. If folks are seeing domain-crossing regressions here though, let me know and I can hack something up to fix it. Also as a consequence, a bunch of missed opportunities to form pshufb now can be formed. Notably, splats of i8s now form pshufb. Interestingly, this improves the existing splat lowering too. We go from 3 instructions to 1. Yes, we may tie up a register, but it seems very likely to be worth it, especially if splatting the 0th byte (the common case) as then we can use a zeroed register as the mask. llvm-svn: 214625 2014-08-02 18:27:38 +08:00			`; CHECK: vpshufb {{.*}} ## xmm0 = xmm0[5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5]`
Add support for 256-bit versions of VPERMIL instruction. This is a new instruction introduced in AVX, which can operate on 128 and 256-bit vectors. It considers a 256-bit vector as two independent 128-bit lanes. It can permute any 32 or 64 elements inside a lane, and restricts the second lane to have the same permutation of the first one. With the improved splat support introduced early today, adding codegen for this instruction enable more efficient 256-bit code: Instead of: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vextractf128 $1, %ymm0, %xmm1 shufps $1, %xmm1, %xmm1 movss %xmm1, 28(%rsp) movss %xmm1, 24(%rsp) movss %xmm1, 20(%rsp) movss %xmm1, 16(%rsp) vextractf128 $0, %ymm0, %xmm0 shufps $1, %xmm0, %xmm0 movss %xmm0, 12(%rsp) movss %xmm0, 8(%rsp) movss %xmm0, 4(%rsp) movss %xmm0, (%rsp) vmovaps (%rsp), %ymm0 We get: vextractf128 $0, %ymm0, %xmm0 punpcklbw %xmm0, %xmm0 punpckhbw %xmm0, %xmm0 vinsertf128 $0, %xmm0, %ymm0, %ymm1 vinsertf128 $1, %xmm0, %ymm1, %ymm0 vpermilps $85, %ymm0, %ymm0 llvm-svn: 135662 2011-07-21 09:55:47 +08:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <32 x i8> @funcA(<32 x i8> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>`
			`ret <32 x i8> %shuffle`
			`}`

Add support for breaking 256-bit v16i16 and v32i8 VSETCC into two 128-bit ones, avoiding sclarization. Add vex form of pcmpeqq and pcmpgtq. Fixes more cases for PR10712. llvm-svn: 138321 2011-08-23 12:36:33 +08:00			`; CHECK: vpunpckhwd %xmm`
X86: Do splat promotion later, so the optimizer can chew on it first. This catches many cases where we can emit a more efficient shuffle for a specific mask or when the mask contains undefs. Once the splat is lowered to unpacks we can't do that anymore. There is a possibility of moving the promotion after pshufb matching, but I'm not sure if pshufb with a mask loaded from memory is faster than 3 shuffles, so I avoided that for now. llvm-svn: 173569 2013-01-26 19:44:21 +08:00			`; CHECK-NEXT: vpshufd $85`
- Register v16i16 as valid VR256 register class - Add more bitcasts for v16i16 - Since 135661 and 135662 already added the splat logic, just add one more splat test for v16i16 llvm-svn: 135663 2011-07-21 10:24:08 +08:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <16 x i16> %a, <16 x i16> undef, <16 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>`
			`ret <16 x i16> %shuffle`
			`}`

Remove some instructions that existed to provide aliases to the assembler. Can be done with InstAlias instead. Unfortunately, this was causing printer to use 'vmovq' or 'vmovd' based on what was parsed. To cleanup the inconsistencies convert all 'vmovd' with 64-bit registers to 'vmovq', but provide an alias so that 'vmovd' will still parse. llvm-svn: 192171 2013-10-08 13:53:50 +08:00			`; CHECK: vmovq`
[x86] Undo a flawed transform I added to form UNPCK instructions when AVX is available, and generally tidy up things surrounding UNPCK formation. Originally, I was thinking that the only advantage of PSHUFD over UNPCK instruction variants was its free copy, and otherwise we should use the shorter encoding UNPCK instructions. This isn't right though, there is a larger advantage of being able to fold a load into the operand of a PSHUFD. For UNPCK, the operand must be in a register so it can be the second input. This removes the UNPCK formation in the target-specific DAG combine for v4i32 shuffles. It also lifts the v8 and v16 cases out of the AVX-specific check as they are potentially replacing multiple instructions with a single instruction and so should always be valuable. The floating point checks are simplified accordingly. This also adjusts the formation of PSHUFD instructions to attempt to match the shuffle mask to one which would fit an UNPCK instruction variant. This was originally motivated to allow it to match the UNPCK instructions in the combiner, but clearly won't now. Eventually, we should add a MachineCombiner pass that can form UNPCK instructions post-RA when the operand is known to be in a register and thus there is no loss. llvm-svn: 217755 2014-09-15 18:35:41 +08:00			`; CHECK-NEXT: vmovlhps %xmm`
- Handle special scalar_to_vector case: splats. Using a native 128-bit shuffle before inserting on a 256-bit vector. - Add AVX versions of movd/movq instructions - Introduce a few COPY patterns to match insert_subvector instructions. This turns a trivial insert_subvector instruction into a register copy, coalescing the xmm into a ymm and avoid emiting on more instruction. llvm-svn: 136002 2011-07-26 07:05:25 +08:00			`; CHECK-NEXT: vinsertf128 $1`
			`define <4 x i64> @funcC(i64 %q) nounwind uwtable readnone ssp {`
			`entry:`
			`%vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0`
			`%vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1`
			`%vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2`
			`%vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3`
			`ret <4 x i64> %vecinit6.i`
			`}`

[x86] Teach the vector combiner that picks a canonical shuffle from to support transforming the forms from the new vector shuffle lowering to use 'movddup' when appropriate. A bunch of the cases where we actually form 'movddup' don't actually show up in the test results because something even later than DAG legalization maps them back to 'unpcklpd'. If this shows back up as a performance problem, I'll probably chase it down, but it is at least an encoded size loss. =/ To make this work, also always do this canonicalizing step for floating point vectors where the baseline shuffle instructions don't provide any free copies of their inputs. This also causes us to canonicalize unpck[hl]pd into mov{hl,lh}ps (resp.) which is a nice encoding space win. There is one test which is "regressed" by this: extractelement-load. There, the test case where the optimization it is testing fails, the exact instruction pattern which results is slightly different. This should probably be fixed by having the appropriate extract formed earlier in the DAG, but that would defeat the purpose of the test.... If this test case is critically important for anyone, please let me know and I'll try to work on it. The prior behavior was actually contrary to the comment in the test case and seems likely to have been an accident. llvm-svn: 217738 2014-09-15 06:41:37 +08:00			`; CHECK: vunpcklpd %xmm`
Fix a nasty bug where a v4i64 was being wrong emitted with 32-bit permutations. Also tidy up some patterns and make them close to their instruction definition! llvm-svn: 138392 2011-08-24 06:06:37 +08:00			`; CHECK-NEXT: vinsertf128 $1`
- Handle special scalar_to_vector case: splats. Using a native 128-bit shuffle before inserting on a 256-bit vector. - Add AVX versions of movd/movq instructions - Introduce a few COPY patterns to match insert_subvector instructions. This turns a trivial insert_subvector instruction into a register copy, coalescing the xmm into a ymm and avoid emiting on more instruction. llvm-svn: 136002 2011-07-26 07:05:25 +08:00			`define <4 x double> @funcD(double %q) nounwind uwtable readnone ssp {`
			`entry:`
			`%vecinit.i = insertelement <4 x double> undef, double %q, i32 0`
			`%vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1`
			`%vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2`
			`%vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3`
			`ret <4 x double> %vecinit6.i`
			`}`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 llvm-svn: 136691 2011-08-03 00:06:18 +08:00
[x86] Re-apply a variant of the x86 side of r212324 now that the rest has settled without incident, removing the x86-specific and overly strict 'isVectorSplat' routine in favor of generic and more powerful splat detection. The primary motivation and result of this is that the x86 backend can now see through splats which contain undef elements. This is essential if we are using a widening form of legalization and I've updated a test case to also run in that mode as before this change the generated code for the test case was completely scalarized. This version of the patch much more carefully handles the undef lanes. - We aren't overly conservative about them in the shift lowering (where we will never use the splat itself). - One place where the splat would have been re-used by the existing code now explicitly constructs a new constant splat that will be safe. - The broadcast lowering is much more reasonable with undefs by doing a correct check of whether the splat is the only user of a loaded value, checking that the splat actually crosses multiple lanes before using a broadcast, and handling broadcasts of non-constant splats. As a consequence of the last bullet, the weird usage of vpshufd instead of vbroadcast is gone, and we actually can lower an AVX splat with vbroadcastss where before we emitted a really strange pattern of a vector load and a manual splat across the vector. llvm-svn: 212602 2014-07-09 18:06:58 +08:00			`; Test this turns into a broadcast:`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 llvm-svn: 136691 2011-08-03 00:06:18 +08:00			`; shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>`
[x86] Re-apply a variant of the x86 side of r212324 now that the rest has settled without incident, removing the x86-specific and overly strict 'isVectorSplat' routine in favor of generic and more powerful splat detection. The primary motivation and result of this is that the x86 backend can now see through splats which contain undef elements. This is essential if we are using a widening form of legalization and I've updated a test case to also run in that mode as before this change the generated code for the test case was completely scalarized. This version of the patch much more carefully handles the undef lanes. - We aren't overly conservative about them in the shift lowering (where we will never use the splat itself). - One place where the splat would have been re-used by the existing code now explicitly constructs a new constant splat that will be safe. - The broadcast lowering is much more reasonable with undefs by doing a correct check of whether the splat is the only user of a loaded value, checking that the splat actually crosses multiple lanes before using a broadcast, and handling broadcasts of non-constant splats. As a consequence of the last bullet, the weird usage of vpshufd instead of vbroadcast is gone, and we actually can lower an AVX splat with vbroadcastss where before we emitted a really strange pattern of a vector load and a manual splat across the vector. llvm-svn: 212602 2014-07-09 18:06:58 +08:00			`;`
			`; CHECK: vbroadcastss`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 llvm-svn: 137296 2011-08-11 10:49:44 +08:00			`define <8 x float> @funcE() nounwind {`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 llvm-svn: 136691 2011-08-03 00:06:18 +08:00			`allocas:`
			`%udx495 = alloca [18 x [18 x float]], align 32`
			`br label %for_test505.preheader`

			`for_test505.preheader: ; preds = %for_test505.preheader, %allocas`
			`br i1 undef, label %for_exit499, label %for_test505.preheader`

			`for_exit499: ; preds = %for_test505.preheader`
			`br i1 undef, label %__load_and_broadcast_32.exit1249, label %load.i1247`

			`load.i1247: ; preds = %for_exit499`
			`%ptr1227 = getelementptr [18 x [18 x float]]* %udx495, i64 0, i64 1, i64 1`
			`%ptr.i1237 = bitcast float* %ptr1227 to i32*`
			`%val.i1238 = load i32* %ptr.i1237, align 4`
			`%ret6.i1245 = insertelement <8 x i32> undef, i32 %val.i1238, i32 6`
			`%ret7.i1246 = insertelement <8 x i32> %ret6.i1245, i32 %val.i1238, i32 7`
			`%phitmp = bitcast <8 x i32> %ret7.i1246 to <8 x float>`
			`br label %__load_and_broadcast_32.exit1249`

			`__load_and_broadcast_32.exit1249: ; preds = %load.i1247, %for_exit499`
			`%load_broadcast12281250 = phi <8 x float> [ %phitmp, %load.i1247 ], [ undef, %for_exit499 ]`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 llvm-svn: 137296 2011-08-11 10:49:44 +08:00			`ret <8 x float> %load_broadcast12281250`
Make this kind of lowering to be supported by 256-bit instructions: shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0> To: shuffle (vload ptr)), undef, <1, 1, 1, 1> Fix PR10494 llvm-svn: 136691 2011-08-03 00:06:18 +08:00			`}`

Normalize splat 256bit vectors with 8 elements. llvm-svn: 168600 2012-11-27 03:24:31 +08:00			`; CHECK: vpshufd $0`
			`; CHECK-NEXT: vinsertf128 $1`
Update test to not use the scalar type to splat from a load llvm-svn: 137809 2011-08-17 10:29:15 +08:00			`define <8 x float> @funcF(i32 %val) nounwind {`
Use the splat index to generate the desired shuffle. Otherwise we could only get undefs and the vector shuffle becomes an undef, generating wrong code. llvm-svn: 137295 2011-08-11 10:49:41 +08:00			`%ret6 = insertelement <8 x i32> undef, i32 %val, i32 6`
			`%ret7 = insertelement <8 x i32> %ret6, i32 %val, i32 7`
			`%tmp = bitcast <8 x i32> %ret7 to <8 x float>`
			`ret <8 x float> %tmp`
			`}`

X86: Prefer using VPSHUFD over VPERMIL because it has better throughput. llvm-svn: 169624 2012-12-08 03:01:13 +08:00			`; CHECK: vpshufd $0`
Normalize splat 256bit vectors with 8 elements. llvm-svn: 168600 2012-11-27 03:24:31 +08:00			`; CHECK-NEXT: vinsertf128 $1`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 llvm-svn: 137296 2011-08-11 10:49:44 +08:00			`define <8 x float> @funcG(<8 x float> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>`
			`ret <8 x float> %shuffle`
			`}`

			`; CHECK: vextractf128 $1`
X86: Prefer using VPSHUFD over VPERMIL because it has better throughput. llvm-svn: 169624 2012-12-08 03:01:13 +08:00			`; CHECK-NEXT: vpshufd`
Normalize splat 256bit vectors with 8 elements. llvm-svn: 168600 2012-11-27 03:24:31 +08:00			`; CHECK-NEXT: vinsertf128 $1`
Splats for v8i32/v8f32 can be handled by VPERMILPSY. This was causing infinite recursive calls in legalize. Fix PR10562 llvm-svn: 137296 2011-08-11 10:49:44 +08:00			`define <8 x float> @funcH(<8 x float> %a) nounwind uwtable readnone ssp {`
			`entry:`
			`%shuffle = shufflevector <8 x float> %a, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>`
			`ret <8 x float> %shuffle`
			`}`