; Test file: llvm-project/llvm/test/CodeGen/X86/shuffle-extract-subvector.ll
; (This listing was captured from a web blame view; the page chrome that
; preceded the test has been dropped so the file remains valid LLVM IR.)

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
; @f: element-wise interleave of two <4 x half> vectors.
;
; Loads a <4 x half> from %a and another from %b, then builds an <8 x half>
; whose lanes are a0, b0, a1, b1, a2, b2, a3, b3 and stores it to %c.
; The interleave is deliberately written as four 2-lane shufflevectors, each
; followed by extractelement/insertelement pairs.
;
; Several of the expected-assembly lines below were last touched by the f16
; type-legalization change (https://reviews.llvm.org/D73749), which softens
; half to i16 and promotes to f32 only around arithmetic ops. This function
; performs no half arithmetic; the expected output contains only 16-bit
; integer moves and pextrw/movd extractions.
define void @f(<4 x half>* %a, <4 x half>* %b, <8 x half>* %c) {
; CHECK-LABEL: f:
; CHECK: # %bb.0:
; CHECK-NEXT: movzwl (%rdi), %eax
; CHECK-NEXT: movzwl 2(%rdi), %ecx
; CHECK-NEXT: movw %cx, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movw %ax, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movzwl 6(%rdi), %r8d
; CHECK-NEXT: movzwl 4(%rdi), %r11d
; CHECK-NEXT: movq (%rsi), %rsi
; CHECK-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
; CHECK-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0
; CHECK-NEXT: pextrw $1, %xmm0, %r9d
; CHECK-NEXT: movd %xmm0, %r10d
; CHECK-NEXT: movl -{{[0-9]+}}(%rsp), %esi
; CHECK-NEXT: pextrw $3, %xmm0, %eax
; CHECK-NEXT: pextrw $2, %xmm0, %edi
; CHECK-NEXT: movw %r11w, 8(%rdx)
; CHECK-NEXT: movw %cx, 4(%rdx)
; CHECK-NEXT: movw %r8w, 12(%rdx)
; CHECK-NEXT: movw %si, (%rdx)
; CHECK-NEXT: movw %di, 10(%rdx)
; CHECK-NEXT: movw %ax, 14(%rdx)
; CHECK-NEXT: movw %r10w, 2(%rdx)
; CHECK-NEXT: movw %r9w, 6(%rdx)
; CHECK-NEXT: retq
  ; Load both 4-lane inputs.
  %tmp4 = load <4 x half>, <4 x half>* %a
  %tmp5 = load <4 x half>, <4 x half>* %b
  ; Mask indices 0-3 select lanes of %tmp4 and 4-7 select lanes of %tmp5, so
  ; each shuffle below yields the pair (a[i], b[i]) for i = 0..3.
  %tmp7 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 0, i32 4>
  %tmp8 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 1, i32 5>
  %tmp9 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 2, i32 6>
  %tmp10 = shufflevector <4 x half> %tmp4, <4 x half> %tmp5, <2 x i32> <i32 3, i32 7>
  ; Result lanes 0-1 <- (a0, b0).
  %tmp11 = extractelement <2 x half> %tmp7, i32 0
  %tmp12 = insertelement <8 x half> undef, half %tmp11, i32 0
  %tmp13 = extractelement <2 x half> %tmp7, i32 1
  %tmp14 = insertelement <8 x half> %tmp12, half %tmp13, i32 1
  ; Result lanes 2-3 <- (a1, b1).
  %tmp15 = extractelement <2 x half> %tmp8, i32 0
  %tmp16 = insertelement <8 x half> %tmp14, half %tmp15, i32 2
  %tmp17 = extractelement <2 x half> %tmp8, i32 1
  %tmp18 = insertelement <8 x half> %tmp16, half %tmp17, i32 3
  ; Result lanes 4-5 <- (a2, b2).
  %tmp19 = extractelement <2 x half> %tmp9, i32 0
  %tmp20 = insertelement <8 x half> %tmp18, half %tmp19, i32 4
  %tmp21 = extractelement <2 x half> %tmp9, i32 1
  %tmp22 = insertelement <8 x half> %tmp20, half %tmp21, i32 5
  ; Result lanes 6-7 <- (a3, b3).
  %tmp23 = extractelement <2 x half> %tmp10, i32 0
  %tmp24 = insertelement <8 x half> %tmp22, half %tmp23, i32 6
  %tmp25 = extractelement <2 x half> %tmp10, i32 1
  %tmp26 = insertelement <8 x half> %tmp24, half %tmp25, i32 7
  ; Write the fully populated 8-lane vector to the output pointer.
  store <8 x half> %tmp26, <8 x half>* %c
  ret void
}