[X86] Avoid introducing extra shuffles when lowering packed vector shifts.

When lowering a vector shift node, the backend checks whether the shift count
is a shuffle with a splat mask. If so, it introduces an extra DAG node to
extract the splat value from the shuffle, and that value is then used to build
the shift count of a target-specific shift.

However, if we know that the shift count is a splat shuffle, we can use the
splat index 'I' to extract the I-th element directly from the first shuffle
operand. The advantage is that the splat shuffle may then become dead, since
it is no longer used.

Example:
;;
define <4 x i32> @example(<4 x i32> %a, <4 x i32> %b) {
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
  %shl = shl <4 x i32> %a, %c
  ret <4 x i32> %shl
}
;;
Before this patch, llc generated the following code (-mattr=+avx):
  vpshufd $0, %xmm1, %xmm1          # xmm1 = xmm1[0,0,0,0]
  vpxor %xmm2, %xmm2
  vpblendw $3, %xmm1, %xmm2, %xmm1  # xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
  vpslld %xmm1, %xmm0, %xmm0
  retq
With this patch, the redundant splat operation is removed:
  vpxor %xmm2, %xmm2
  vpblendw $3, %xmm1, %xmm2, %xmm1  # xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
  vpslld %xmm1, %xmm0, %xmm0
  retq

llvm-svn: 223461
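
The same idea applies when the splat mask selects a lane other than zero; the
function below is a hypothetical illustration (not part of the committed test)
of a case where the lowering can extract element 2 of %b directly instead of
first materializing the <2,2,2,2> splat:
;;
; Hypothetical example: the shift amount is lane 2 of %b splatted to all lanes.
; The splat index tells the lowering which element to extract as the scalar
; count for vpslld, so the splat shuffle itself can become dead.
define <4 x i32> @splat_lane2(<4 x i32> %a, <4 x i32> %b) {
  %c = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
  %shl = shl <4 x i32> %a, %c
  ret <4 x i32> %shl
}
;;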

[X86] Improved lowering of packed v8i16 vector shifts by non-constant count.

Before this patch, the backend suboptimally expanded the non-constant shift
count of a v8i16 shift into a sequence of two 'movd' plus a 'movzwl'.
With this patch, the backend checks whether the target has SSE4.1; if so, it
lets the shuffle legalizer deal with the expansion of the shift amount.

Example:
;;
define <8 x i16> @test(<8 x i16> %A, <8 x i16> %B) {
  %shamt = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer
  %shl = shl <8 x i16> %A, %shamt
  ret <8 x i16> %shl
}
;;
Before (with -mattr=+avx):
  vmovd %xmm1, %eax
  movzwl %ax, %eax
  vmovd %eax, %xmm1
  vpsllw %xmm1, %xmm0, %xmm0
  retq
Now:
  vpxor %xmm2, %xmm2, %xmm2
  vpblendw $1, %xmm1, %xmm2, %xmm1
  vpsllw %xmm1, %xmm0, %xmm0
  retq

llvm-svn: 223660
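
To see why the blend is enough, note that vpsllw/psllw read the shift count
from the low 64 bits of the count register, so zeroing words 1 through 7 of
the amount leaves zext(B[0]) in those low 64 bits. The function below is a
hypothetical sketch (not one of the committed tests) that builds the blended
amount explicitly in IR:
;;
; Hypothetical sketch: lane 0 is taken from %B and the remaining lanes from a
; zero vector, mirroring the value produced by 'vpblendw $1' with a zeroed
; register.
define <8 x i16> @blended_amount(<8 x i16> %B) {
  %amt = shufflevector <8 x i16> %B, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
  ret <8 x i16> %amt
}
;;
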
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse2 < %s | FileCheck %s -check-prefix=SSE2
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s -check-prefix=AVX
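
; test1: shl <8 x i16> by a splatted non-constant amount.  Plain SSE2
; zero-extends the amount through a GPR (movd/movzwl/movd), while AVX keeps it
; in the vector domain and masks it with vpblendw.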
define <8 x i16> @test1(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: test1:
; SSE2: # BB#0
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psllw %xmm1, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test1:
; AVX: # BB#0
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT: vpsllw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit14 = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer
  %shl = shl <8 x i16> %A, %vecinit14
  ret <8 x i16> %shl
}
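
; test2: shl <4 x i32>.  Element 0 of the amount is isolated by blending with
; zero (xorps+movss on SSE2, vpxor+vpblendw on AVX) before pslld.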
define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: test2:
; SSE2: # BB#0
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss %xmm1, %xmm2
; SSE2-NEXT: pslld %xmm2, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test2:
; AVX: # BB#0
; AVX-NEXT: vpxor %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpslld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit6 = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
  %shl = shl <4 x i32> %A, %vecinit6
  ret <4 x i32> %shl
}
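
; test3: shl <2 x i64>.  psllq/vpsllq read their count from the low 64 bits of
; the register, i.e. element 0, so the amount register is used directly.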
define <2 x i64> @test3(<2 x i64> %A, <2 x i64> %B) {
; SSE2-LABEL: test3:
; SSE2: # BB#0
; SSE2-NEXT: psllq %xmm1, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test3:
; AVX: # BB#0
; AVX-NEXT: vpsllq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit2 = shufflevector <2 x i64> %B, <2 x i64> undef, <2 x i32> zeroinitializer
  %shl = shl <2 x i64> %A, %vecinit2
  ret <2 x i64> %shl
}
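
; test4-test8 repeat the same patterns for right shifts: lshr lowers to
; psrlw/psrld/psrlq and ashr to psraw/psrad.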
define <8 x i16> @test4(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: test4:
; SSE2: # BB#0
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psrlw %xmm1, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test4:
; AVX: # BB#0
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT: vpsrlw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit14 = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer
  %shr = lshr <8 x i16> %A, %vecinit14
  ret <8 x i16> %shr
}

define <4 x i32> @test5(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: test5:
; SSE2: # BB#0
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss %xmm1, %xmm2
; SSE2-NEXT: psrld %xmm2, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test5:
; AVX: # BB#0
; AVX-NEXT: vpxor %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpsrld %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit6 = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
  %shr = lshr <4 x i32> %A, %vecinit6
  ret <4 x i32> %shr
}

define <2 x i64> @test6(<2 x i64> %A, <2 x i64> %B) {
; SSE2-LABEL: test6:
; SSE2: # BB#0
; SSE2-NEXT: psrlq %xmm1, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test6:
; AVX: # BB#0
; AVX-NEXT: vpsrlq %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit2 = shufflevector <2 x i64> %B, <2 x i64> undef, <2 x i32> zeroinitializer
  %shr = lshr <2 x i64> %A, %vecinit2
  ret <2 x i64> %shr
}

define <8 x i16> @test7(<8 x i16> %A, <8 x i16> %B) {
; SSE2-LABEL: test7:
; SSE2: # BB#0
; SSE2-NEXT: movd %xmm1, %eax
; SSE2-NEXT: movzwl %ax, %eax
; SSE2-NEXT: movd %eax, %xmm1
; SSE2-NEXT: psraw %xmm1, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test7:
; AVX: # BB#0
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
; AVX-NEXT: vpsraw %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit14 = shufflevector <8 x i16> %B, <8 x i16> undef, <8 x i32> zeroinitializer
  %shr = ashr <8 x i16> %A, %vecinit14
  ret <8 x i16> %shr
}

define <4 x i32> @test8(<4 x i32> %A, <4 x i32> %B) {
; SSE2-LABEL: test8:
; SSE2: # BB#0
; SSE2-NEXT: xorps %xmm2, %xmm2
; SSE2-NEXT: movss %xmm1, %xmm2
; SSE2-NEXT: psrad %xmm2, %xmm0
; SSE2-NEXT: retq
; AVX-LABEL: test8:
; AVX: # BB#0
; AVX-NEXT: vpxor %xmm2, %xmm2
; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3,4,5,6,7]
; AVX-NEXT: vpsrad %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
entry:
  %vecinit6 = shufflevector <4 x i32> %B, <4 x i32> undef, <4 x i32> zeroinitializer
  %shr = ashr <4 x i32> %A, %vecinit6
  ret <4 x i32> %shr
}