; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32
; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
; shl1: vector shift-left of <4 x i32> by a variable per-lane amount.
; On SSE4.1 this lowers to the multiply trick (pslld/paddd/cvttps2dq/pmulld)
; instead of four scalar shll extract/insert sequences (see llvm-svn r109549).
define <2 x i64> @shl1(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
; X32-LABEL: shl1:
; X32:       # %bb.0: # %entry
; X32-NEXT:    pslld $23, %xmm1
; X32-NEXT:    paddd {{\.LCPI.*}}, %xmm1
; X32-NEXT:    cvttps2dq %xmm1, %xmm1
; X32-NEXT:    pmulld %xmm1, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: shl1:
; X64:       # %bb.0: # %entry
; X64-NEXT:    pslld $23, %xmm1
; X64-NEXT:    paddd {{.*}}(%rip), %xmm1
; X64-NEXT:    cvttps2dq %xmm1, %xmm1
; X64-NEXT:    pmulld %xmm1, %xmm0
; X64-NEXT:    retq
entry:
; Legacy hand-written checks (inert: the RUN lines only use the X32/X64
; prefixes, so plain CHECK lines are not matched by FileCheck).
; CHECK-NOT: shll
; CHECK: pslld
; CHECK: paddd
; CHECK: cvttps2dq
; CHECK: pmulld
  %shl = shl <4 x i32> %r, %a                     ; <<4 x i32>> [#uses=1]
  %tmp2 = bitcast <4 x i32> %shl to <2 x i64>     ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %tmp2
}
; shl2: vector shift-left of <16 x i8> by a variable per-lane amount.
; There is no byte-element shift instruction in SSE, so this lowers to a
; three-round psllw + pblendvb bit-serial sequence (shift by 4, 2, then 1,
; selecting per lane on the top bit of the shift amount).
define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
; X32-LABEL: shl2:
; X32:       # %bb.0: # %entry
; X32-NEXT:    movdqa %xmm0, %xmm2
; X32-NEXT:    psllw $5, %xmm1
; X32-NEXT:    movdqa %xmm0, %xmm3
; X32-NEXT:    psllw $4, %xmm3
; X32-NEXT:    pand {{\.LCPI.*}}, %xmm3
; X32-NEXT:    movdqa %xmm1, %xmm0
; X32-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm3
; X32-NEXT:    psllw $2, %xmm3
; X32-NEXT:    pand {{\.LCPI.*}}, %xmm3
; X32-NEXT:    paddb %xmm1, %xmm1
; X32-NEXT:    movdqa %xmm1, %xmm0
; X32-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm3
; X32-NEXT:    paddb %xmm2, %xmm3
; X32-NEXT:    paddb %xmm1, %xmm1
; X32-NEXT:    movdqa %xmm1, %xmm0
; X32-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; X32-NEXT:    movdqa %xmm2, %xmm0
; X32-NEXT:    retl
;
; X64-LABEL: shl2:
; X64:       # %bb.0: # %entry
; X64-NEXT:    movdqa %xmm0, %xmm2
; X64-NEXT:    psllw $5, %xmm1
; X64-NEXT:    movdqa %xmm0, %xmm3
; X64-NEXT:    psllw $4, %xmm3
; X64-NEXT:    pand {{.*}}(%rip), %xmm3
; X64-NEXT:    movdqa %xmm1, %xmm0
; X64-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm3
; X64-NEXT:    psllw $2, %xmm3
; X64-NEXT:    pand {{.*}}(%rip), %xmm3
; X64-NEXT:    paddb %xmm1, %xmm1
; X64-NEXT:    movdqa %xmm1, %xmm0
; X64-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm3
; X64-NEXT:    paddb %xmm2, %xmm3
; X64-NEXT:    paddb %xmm1, %xmm1
; X64-NEXT:    movdqa %xmm1, %xmm0
; X64-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; X64-NEXT:    movdqa %xmm2, %xmm0
; X64-NEXT:    retq
entry:
; Legacy hand-written checks (inert: the RUN lines only use the X32/X64
; prefixes, so plain CHECK lines are not matched by FileCheck).
; CHECK-NOT: shlb
; CHECK: pblendvb
; CHECK: pblendvb
; CHECK: pblendvb
  %shl = shl <16 x i8> %r, %a                     ; <<16 x i8>> [#uses=1]
  %tmp2 = bitcast <16 x i8> %shl to <2 x i64>     ; <<2 x i64>> [#uses=1]
  ret <2 x i64> %tmp2
}