[X86] Avoid introducing extra shuffles when lowering packed vector shifts.

When lowering a vector shift node, the backend checks whether the shift count
is a shuffle with a splat mask. If so, it introduces an extra DAG node to
extract the splat value from the shuffle, and that value is then used to build
the shift count of a target-specific shift.

However, if we know that the shift count is a splat shuffle, we can use the
splat index 'I' to extract the I-th element directly from the first shuffle
operand. The advantage is that the splat shuffle may then become dead, since
it is no longer used.

Example:
;;
define <4 x i32> @example(<4 x i32> %a, <4 x i32> %b) {
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %shl = shl <4 x i32> %a, %c
  ret <4 x i32> %shl
}
;;
Before this patch, llc generated the following code (-mattr=+avx):
  vpshufd $0, %xmm1, %xmm1          # xmm1 = xmm1[0,0,0,0]
  vpxor %xmm2, %xmm2
  vpblendw $3, %xmm1, %xmm2, %xmm1  # xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
  vpslld %xmm1, %xmm0, %xmm0
  retq
With this patch, the redundant splat operation is removed:
  vpxor %xmm2, %xmm2
  vpblendw $3, %xmm1, %xmm2, %xmm1  # xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
  vpslld %xmm1, %xmm0, %xmm0
  retq

llvm-svn: 223461
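
The same idea applies when the splat mask selects a lane other than zero; the
function below is a hypothetical illustration (not part of the committed test)
of a case where the lowering can extract element 2 of %b directly instead of
first materializing the <2,2,2,2> splat:
;;
; Hypothetical example: the shift amount is lane 2 of %b splatted to all lanes.
; The splat index tells the lowering which element to extract as the scalar
; count for vpslld, so the splat shuffle itself can become dead.
define <4 x i32> @splat_lane2(<4 x i32> %a, <4 x i32> %b) {
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  %shl = shl <4 x i32> %a, %c
  ret <4 x i32> %shl
}
;;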

[X86] Improved lowering of packed v8i16 vector shifts by non-constant count.

Before this patch, the backend suboptimally expanded the non-constant shift
count of a v8i16 shift into a sequence of two 'movd' plus a 'movzwl'.
With this patch, the backend checks whether the target has SSE4.1; if so, it
lets the shuffle legalizer deal with the expansion of the shift amount.

Example:
;;
define <8 x i16> @test(<8 x i16> %A, <8 x i16> %B) {
  %shamt = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer
  %shl = shl <8 x i16> %A, %shamt
  ret <8 x i16> %shl
}
;;
Before (with -mattr=+avx):
  vmovd %xmm1, %eax
  movzwl %ax, %eax
  vmovd %eax, %xmm1
  vpsllw %xmm1, %xmm0, %xmm0
  retq
Now:
  vpxor %xmm2, %xmm2, %xmm2
  vpblendw $1, %xmm1, %xmm2, %xmm1
  vpsllw %xmm1, %xmm0, %xmm0
  retq

llvm-svn: 223660
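
To see why the blend is enough, note that vpsllw/psllw read the shift count
from the low 64 bits of the count register, so zeroing words 1 through 7 of
the amount leaves zext(B[0]) in those low 64 bits. The function below is a
hypothetical sketch (not one of the committed tests) that builds the blended
amount explicitly in IR:
;;
; Hypothetical sketch: lane 0 is taken from %B and the remaining lanes from a
; zero vector, mirroring the value produced by 'vpblendw $1' with a zeroed
; register.
define <8 x i16> @blended_amount(<8 x i16> %B) {
  %amt = shufflevector <8 x i16> %B, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <8 x i16> %amt
}
;;
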
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s -check-prefix=SSE2
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s -check-prefix=AVX
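
; test1: shl <8 x i16> by a splatted non-constant amount.  Plain SSE2
; zero-extends the amount through a GPR (movd/movzwl/movd), while AVX keeps it
; in the vector domain and masks it with vpblendw.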
define <8 x i16> @test1(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: test1:
; SSE2: # BB#0
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psllw %xmm1, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test1:
; AVX: # BB#0
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit14 = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer
  %shl = shl <8 x i16> %A, %vecinit14
  ret <8 x i16> %shl
}
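
; test2: shl <4 x i32>.  Element 0 of the amount is isolated by blending with
; zero (xorps+movss on SSE2, vpxor+vpblendw on AVX) before pslld.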
define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: test2:
; SSE2: # BB#0
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss %xmm1, %xmm2
; SSE2-NEXT: pslld %xmm2, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test2:
; AVX: # BB#0
; AVX-NEXT: vpxor %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit6 = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
  %shl = shl <4 x i32> %A, %vecinit6
  ret <4 x i32> %shl
}
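
; test3: shl <2 x i64>.  psllq/vpsllq read their count from the low 64 bits of
; the register, i.e. element 0, so the amount register is used directly.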
define <2 x i64> @test3(<2 x i64> %A, <2 x i64> %B) {
; SSE2-LABEL: test3:
; SSE2: # BB#0
; SSE2-NEXT: psllq %xmm1, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test3:
; AVX: # BB#0
; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit2 = shufflevector <2 x i64> %B, <2 x i64> undef, <2 x i32> zeroinitializer
  %shl = shl <2 x i64> %A, %vecinit2
  ret <2 x i64> %shl
}
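
; test4-test8 repeat the same patterns for right shifts: lshr lowers to
; psrlw/psrld/psrlq and ashr to psraw/psrad.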
define <8 x i16> @test4(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: test4:
; SSE2: # BB#0
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test4:
; AVX: # BB#0
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit14 = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer
  %shr = lshr <8 x i16> %A, %vecinit14
  ret <8 x i16> %shr
}

define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: test5:
; SSE2: # BB#0
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss %xmm1, %xmm2
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test5:
; AVX: # BB#0
; AVX-NEXT: vpxor %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit6 = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
  %shr = lshr <4 x i32> %A, %vecinit6
  ret <4 x i32> %shr
}

define <2 x i64> @test6(<2 x i64> %A, <2 x i64> %B) {
; SSE2-LABEL: test6:
; SSE2: # BB#0
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test6:
; AVX: # BB#0
; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit2 = shufflevector <2 x i64> %B, <2 x i64> undef, <2 x i32> zeroinitializer
  %shr = lshr <2 x i64> %A, %vecinit2
  ret <2 x i64> %shr
}

define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: test7:
; SSE2: # BB#0
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psraw %xmm1, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test7:
; AVX: # BB#0
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit14 = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer
  %shr = ashr <8 x i16> %A, %vecinit14
  ret <8 x i16> %shr
}

define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: test8:
; SSE2: # BB#0
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss %xmm1, %xmm2
; SSE2-NEXT: psrad %xmm2, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test8:
; AVX: # BB#0
; AVX-NEXT: vpxor %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit6 = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
  %shr = ashr <4 x i32> %A, %vecinit6
  ret <4 x i32> %shr
}