2015-11-24 05:33:58 +08:00
|
|
|
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
|
2016-02-01 15:56:09 +08:00
|
|
|
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix=AVX
|
|
|
|
; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl < %s | FileCheck %s --check-prefix=KNL-32
|
|
|
|
|
[x86] Fix wrong lowering of vsetcc nodes (PR25080).
Function LowerVSETCC (in X86ISelLowering.cpp) worked under the wrong
assumption that for non-AVX512 targets, the source type and destination type
of a type-legalized setcc node were always the same type.
This assumption was unfortunately incorrect; the type legalizer is not always
able to promote the return type of a setcc to the same type as the first
operand of a setcc.
In the case of a vsetcc node, the legalizer firstly checks if the first input
operand has a legal type. If so, then it promotes the return type of the vsetcc
to that same type. Otherwise, the return type is promoted to the 'next legal
type', which, for vectors of MVT::i1 is always a 128-bit integer vector type.
Example (-mattr=+avx):
%0 = trunc <8 x i32> %a to <8 x i23>
%1 = icmp eq <8 x i23> %0, zeroinitializer
The initial selection dag for the code above is:
v8i1 = setcc t5, t7, seteq:ch
t5: v8i23 = truncate t2
t2: v8i32,ch = CopyFromReg t0, Register:v8i32 %vreg1
t7: v8i32 = build_vector of all zeroes.
The type legalizer would firstly check if 't5' has a legal type. If so, then it
would reuse that same type to promote the return type of the setcc node.
Unfortunately 't5' is of illegal type v8i23, and therefore it cannot be used to
promote the return type of the setcc node. Consequently, the setcc return type
is promoted to v8i16. Later on, 't5' is promoted to v8i32 thus leading to the
following dag node:
v8i16 = setcc t32, t25, seteq:ch
where t32 and t25 are now values of type v8i32.
Before this patch, function LowerVSETCC would have wrongly expanded the setcc
to a single X86ISD::PCMPEQ. Surprisingly, ISel was still able to match an
instruction. In our case, ISel would have matched a VPCMPEQWrr:
t37: v8i16 = X86ISD::VPCMPEQWrr t36, t25
However, t36 and t25 are both VR256, while the result type is instead of class
VR128. This inconsistency ended up causing the insertion of COPY instructions
like this:
%vreg7<def> = COPY %vreg3; VR128:%vreg7 VR256:%vreg3
Which is an invalid full copy (not a sub register copy).
Eventually, the backend would have hit an UNREACHABLE "Cannot emit physreg copy
instruction" in the attempt to expand the malformed pseudo COPY instructions.
This patch fixes the problem by adding the missing logic in LowerVSETCC to handle
the corner case of a setcc with 128-bit return type and 256-bit operand type.
This problem was originally reported by Dimitry as PR25080. It has been latent
for a very long time. I have added the minimal reproducer from that Bugzilla report
as test setcc-lowering.ll.
Differential Revision: http://reviews.llvm.org/D13660
llvm-svn: 250085
2015-10-13 03:22:30 +08:00
|
|
|
|
|
|
|
; Verify that we don't crash during codegen due to a wrong lowering
|
|
|
|
; of a setcc node with illegal operand types and return type.
|
|
|
|
|
|
|
|
; Exercises a vector compare on a non-power-of-two element width: %a is
; truncated to <8 x i23>, every lane is compared for equality with zero, the
; resulting <8 x i1> mask is ORed with a constant mask, and the outcome is
; sign-extended to <8 x i16> and returned.
; The check lines inside the body are autogenerated (see the NOTE at the top
; of the file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
define <8 x i16> @pr25080(<8 x i32> %a) {
|
2016-02-01 15:56:09 +08:00
|
|
|
; AVX-LABEL: pr25080:
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX: # %bb.0: # %entry
|
2016-02-01 15:56:09 +08:00
|
|
|
; AVX-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0
|
|
|
|
; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
|
|
|
|
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
|
|
|
; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
|
|
|
|
; AVX-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
|
2017-10-24 23:38:16 +08:00
|
|
|
; AVX-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
|
2016-02-01 15:56:09 +08:00
|
|
|
; AVX-NEXT: vpor {{.*}}(%rip), %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vpsllw $15, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vpsraw $15, %xmm0, %xmm0
|
|
|
|
; AVX-NEXT: vzeroupper
|
|
|
|
; AVX-NEXT: retq
|
2017-02-12 03:27:15 +08:00
|
|
|
;
|
|
|
|
; KNL-32-LABEL: pr25080:
|
2017-12-05 01:18:51 +08:00
|
|
|
; KNL-32: # %bb.0: # %entry
|
2018-02-01 06:04:26 +08:00
|
|
|
; KNL-32-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
|
2017-11-11 14:19:12 +08:00
|
|
|
; KNL-32-NEXT: vpbroadcastd {{.*#+}} ymm1 = [8388607,8388607,8388607,8388607,8388607,8388607,8388607,8388607]
|
2017-11-06 17:22:38 +08:00
|
|
|
; KNL-32-NEXT: vptestnmd %zmm1, %zmm0, %k0
|
2017-02-12 03:27:15 +08:00
|
|
|
; KNL-32-NEXT: movb $15, %al
|
|
|
|
; KNL-32-NEXT: kmovw %eax, %k1
|
|
|
|
; KNL-32-NEXT: korw %k1, %k0, %k1
|
2017-12-05 14:37:21 +08:00
|
|
|
; KNL-32-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
|
|
|
|
; KNL-32-NEXT: vpmovdw %zmm0, %ymm0
|
2018-02-01 06:04:26 +08:00
|
|
|
; KNL-32-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
|
2017-02-12 03:27:15 +08:00
|
|
|
; KNL-32-NEXT: retl
|
[x86] Fix wrong lowering of vsetcc nodes (PR25080).
Function LowerVSETCC (in X86ISelLowering.cpp) worked under the wrong
assumption that for non-AVX512 targets, the source type and destination type
of a type-legalized setcc node were always the same type.
This assumption was unfortunately incorrect; the type legalizer is not always
able to promote the return type of a setcc to the same type as the first
operand of a setcc.
In the case of a vsetcc node, the legalizer firstly checks if the first input
operand has a legal type. If so, then it promotes the return type of the vsetcc
to that same type. Otherwise, the return type is promoted to the 'next legal
type', which, for vectors of MVT::i1 is always a 128-bit integer vector type.
Example (-mattr=+avx):
%0 = trunc <8 x i32> %a to <8 x i23>
%1 = icmp eq <8 x i23> %0, zeroinitializer
The initial selection dag for the code above is:
v8i1 = setcc t5, t7, seteq:ch
t5: v8i23 = truncate t2
t2: v8i32,ch = CopyFromReg t0, Register:v8i32 %vreg1
t7: v8i32 = build_vector of all zeroes.
The type legalizer would firstly check if 't5' has a legal type. If so, then it
would reuse that same type to promote the return type of the setcc node.
Unfortunately 't5' is of illegal type v8i23, and therefore it cannot be used to
promote the return type of the setcc node. Consequently, the setcc return type
is promoted to v8i16. Later on, 't5' is promoted to v8i32 thus leading to the
following dag node:
v8i16 = setcc t32, t25, seteq:ch
where t32 and t25 are now values of type v8i32.
Before this patch, function LowerVSETCC would have wrongly expanded the setcc
to a single X86ISD::PCMPEQ. Surprisingly, ISel was still able to match an
instruction. In our case, ISel would have matched a VPCMPEQWrr:
t37: v8i16 = X86ISD::VPCMPEQWrr t36, t25
However, t36 and t25 are both VR256, while the result type is instead of class
VR128. This inconsistency ended up causing the insertion of COPY instructions
like this:
%vreg7<def> = COPY %vreg3; VR128:%vreg7 VR256:%vreg3
Which is an invalid full copy (not a sub register copy).
Eventually, the backend would have hit an UNREACHABLE "Cannot emit physreg copy
instruction" in the attempt to expand the malformed pseudo COPY instructions.
This patch fixes the problem adding the missing logic in LowerVSETCC to handle
the corner case of a setcc with 128-bit return type and 256-bit operand type.
This problem was originally reported by Dimitry as PR25080. It has been latent
for a very long time. I have added the minimal reproducible from that bugzilla
as test setcc-lowering.ll.
Differential Revision: http://reviews.llvm.org/D13660
llvm-svn: 250085
2015-10-13 03:22:30 +08:00
|
|
|
entry:
|
|
|
|
; Narrow every lane to 23 bits; the PR25080 commit text embedded above
; describes the resulting v8i23 as an illegal type with -mattr=+avx.
%0 = trunc <8 x i32> %a to <8 x i23>
|
|
|
|
; Lane-wise equality against zero yields the <8 x i1> mask under test.
%1 = icmp eq <8 x i23> %0, zeroinitializer
|
|
|
|
; Lanes 0-3 are forced to true by the constant; lanes 4-7 keep the compare
; result.
%2 = or <8 x i1> %1, <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>
|
|
|
|
; Sign extension turns each true lane into an all-ones i16.
%3 = sext <8 x i1> %2 to <8 x i16>
|
|
|
|
ret <8 x i16> %3
|
|
|
|
}
|
2016-02-01 15:56:09 +08:00
|
|
|
|
2017-05-30 02:27:00 +08:00
|
|
|
; Loop test on a <16 x i1> mask: the scalar result of the signed compare
; (%a < 65536) is splatted to 16 lanes, ANDed with the incoming mask %b,
; reinterpreted as an i16, and the loop repeats until that i16 is zero.
; NOTE(review): the function name suggests this reproduces PR26232 - confirm.
; The check lines inside the body are autogenerated (see the NOTE at the top
; of the file); regenerate them with utils/update_llc_test_checks.py instead
; of editing them by hand.
define void @pr26232(i64 %a, <16 x i1> %b) {
|
2017-02-12 03:27:15 +08:00
|
|
|
; AVX-LABEL: pr26232:
|
Generalize MergeBlockIntoPredecessor. Replace uses of MergeBasicBlockIntoOnlyPred.
Summary:
Two utils methods have essentially the same functionality. This is an attempt to merge them into one.
1. lib/Transforms/Utils/Local.cpp : MergeBasicBlockIntoOnlyPred
2. lib/Transforms/Utils/BasicBlockUtils.cpp : MergeBlockIntoPredecessor
Prior to the patch:
1. MergeBasicBlockIntoOnlyPred
Updates either DomTree or DeferredDominance
Moves all instructions from Pred to BB, deletes Pred
Asserts BB has single predecessor
If address was taken, replace the block address with constant 1 (?)
2. MergeBlockIntoPredecessor
Updates DomTree, LoopInfo and MemoryDependenceResults
Moves all instruction from BB to Pred, deletes BB
Returns if doesn't have a single predecessor
Returns if BB's address was taken
After the patch:
Method 2. MergeBlockIntoPredecessor is attempting to become the new default:
Updates DomTree or DeferredDominance, and LoopInfo and MemoryDependenceResults
Moves all instruction from BB to Pred, deletes BB
Returns if doesn't have a single predecessor
Returns if BB's address was taken
Uses of MergeBasicBlockIntoOnlyPred that need to be replaced:
1. lib/Transforms/Scalar/LoopSimplifyCFG.cpp
Updated in this patch. No challenges.
2. lib/CodeGen/CodeGenPrepare.cpp
Updated in this patch.
i. eliminateFallThrough is straightforward, but I added using a temporary array to avoid the iterator invalidation.
ii. eliminateMostlyEmptyBlock(s) methods also now use a temporary array for blocks
Some interesting aspects:
- Since Pred is not deleted (BB is), the entry block does not need updating.
- The entry block was being updated with the deleted block in eliminateMostlyEmptyBlock. Added assert to make obvious that BB=SinglePred.
- isMergingEmptyBlockProfitable assumes BB is the one to be deleted.
- eliminateMostlyEmptyBlock(BB) does not delete BB on one path, it deletes its unique predecessor instead.
- adding some test owner as subscribers for the interesting tests modified:
test/CodeGen/X86/avx-cmp.ll
test/CodeGen/AMDGPU/nested-loop-conditions.ll
test/CodeGen/AMDGPU/si-annotate-cf.ll
test/CodeGen/X86/hoist-spill.ll
test/CodeGen/X86/2006-11-17-IllegalMove.ll
3. lib/Transforms/Scalar/JumpThreading.cpp
Not covered in this patch. It is the only use case using the DeferredDominance.
I would defer to Brian Rzycki to make this replacement.
Reviewers: chandlerc, spatel, davide, brzycki, bkramer, javed.absar
Subscribers: qcolombet, sanjoy, nemanjai, nhaehnle, jlebar, tpr, kbarton, RKSimon, wmi, arsenm, llvm-commits
Differential Revision: https://reviews.llvm.org/D48202
llvm-svn: 335183
2018-06-21 06:01:04 +08:00
|
|
|
; AVX: # %bb.0: # %allocas
|
2017-05-30 02:27:00 +08:00
|
|
|
; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
|
2017-02-12 03:27:15 +08:00
|
|
|
; AVX-NEXT: .p2align 4, 0x90
|
|
|
|
; AVX-NEXT: .LBB1_1: # %for_loop599
|
|
|
|
; AVX-NEXT: # =>This Inner Loop Header: Depth=1
|
2017-05-30 02:27:00 +08:00
|
|
|
; AVX-NEXT: xorl %eax, %eax
|
2017-02-12 03:27:15 +08:00
|
|
|
; AVX-NEXT: cmpq $65536, %rdi # imm = 0x10000
|
2017-05-30 02:27:00 +08:00
|
|
|
; AVX-NEXT: setl %al
|
[X86] Move promotion of vector and/or/xor from legalization to DAG combine
Summary:
I've noticed that the bitcasts we introduce for these make computeKnownBits and computeNumSignBits not work well in LegalizeVectorOps. LegalizeVectorOps legalizes bottom up while LegalizeDAG legalizes top down. The bottom up strategy for LegalizeVectorOps means operands are legalized before their uses. So we promote and/or/xor before we legalize the operands that use them making computeKnownBits/computeNumSignBits in places like LowerTruncate suboptimal. I looked at changing LegalizeVectorOps to be top down as well, but that was more disruptive and caused some regressions. I also looked at just moving promotion of binops to LegalizeDAG, but that had a few issues one around matching AND,ANDN,OR into VSELECT because I had to create ANDN as vXi64, but the other nodes hadn't legalized yet, I didn't look too hard at fixing that.
This patch seems to produce better results overall than my other attempts. We now form broadcasts of constants better in some cases. For at least some of them the AND was being introduced in LegalizeDAG, promoted to vXi64, and the BUILD_VECTOR was also legalized there. I think we got bad ordering of that. Now the promotion is out of the legalizer so we handle this better.
In the longer term I think we really should evaluate whether we should be doing this promotion at all. It's really there to reduce isel pattern count, but I'm wondering if we'd be better served just eating the pattern cost or doing C++ based isel for vector and/or/xor in X86ISelDAGToDAG. The masked and/or/xor will definitely be difficult in patterns if a bitcast gets between the vselect and the and/or/xor node. That becomes a lot of permutations to cover.
Reviewers: RKSimon, spatel
Reviewed By: RKSimon
Subscribers: llvm-commits
Differential Revision: https://reviews.llvm.org/D53107
llvm-svn: 344487
2018-10-15 09:51:58 +08:00
|
|
|
; AVX-NEXT: vmovd %eax, %xmm2
|
|
|
|
; AVX-NEXT: vpshufb %xmm1, %xmm2, %xmm2
|
|
|
|
; AVX-NEXT: vpand %xmm0, %xmm2, %xmm2
|
|
|
|
; AVX-NEXT: vpsllw $7, %xmm2, %xmm2
|
|
|
|
; AVX-NEXT: vpmovmskb %xmm2, %eax
|
2017-06-01 19:27:57 +08:00
|
|
|
; AVX-NEXT: testw %ax, %ax
|
2017-02-12 03:27:15 +08:00
|
|
|
; AVX-NEXT: jne .LBB1_1
|
2017-12-05 01:18:51 +08:00
|
|
|
; AVX-NEXT: # %bb.2: # %for_exit600
|
2017-02-12 03:27:15 +08:00
|
|
|
; AVX-NEXT: retq
|
|
|
|
;
|
2016-02-01 15:56:09 +08:00
|
|
|
; KNL-32-LABEL: pr26232:
|
Generalize MergeBlockIntoPredecessor. Replace uses of MergeBasicBlockIntoOnlyPred.
Summary:
Two utils methods have essentially the same functionality. This is an attempt to merge them into one.
1. lib/Transforms/Utils/Local.cpp : MergeBasicBlockIntoOnlyPred
2. lib/Transforms/Utils/BasicBlockUtils.cpp : MergeBlockIntoPredecessor
Prior to the patch:
1. MergeBasicBlockIntoOnlyPred
Updates either DomTree or DeferredDominance
Moves all instructions from Pred to BB, deletes Pred
Asserts BB has single predecessor
If address was taken, replace the block address with constant 1 (?)
2. MergeBlockIntoPredecessor
Updates DomTree, LoopInfo and MemoryDependenceResults
Moves all instruction from BB to Pred, deletes BB
Returns if doesn't have a single predecessor
Returns if BB's address was taken
After the patch:
Method 2. MergeBlockIntoPredecessor is attempting to become the new default:
Updates DomTree or DeferredDominance, and LoopInfo and MemoryDependenceResults
Moves all instruction from BB to Pred, deletes BB
Returns if doesn't have a single predecessor
Returns if BB's address was taken
Uses of MergeBasicBlockIntoOnlyPred that need to be replaced:
1. lib/Transforms/Scalar/LoopSimplifyCFG.cpp
Updated in this patch. No challenges.
2. lib/CodeGen/CodeGenPrepare.cpp
Updated in this patch.
i. eliminateFallThrough is straightforward, but I added using a temporary array to avoid the iterator invalidation.
ii. eliminateMostlyEmptyBlock(s) methods also now use a temporary array for blocks
Some interesting aspects:
- Since Pred is not deleted (BB is), the entry block does not need updating.
- The entry block was being updated with the deleted block in eliminateMostlyEmptyBlock. Added assert to make obvious that BB=SinglePred.
- isMergingEmptyBlockProfitable assumes BB is the one to be deleted.
- eliminateMostlyEmptyBlock(BB) does not delete BB on one path, it deletes its unique predecessor instead.
- adding some test owner as subscribers for the interesting tests modified:
test/CodeGen/X86/avx-cmp.ll
test/CodeGen/AMDGPU/nested-loop-conditions.ll
test/CodeGen/AMDGPU/si-annotate-cf.ll
test/CodeGen/X86/hoist-spill.ll
test/CodeGen/X86/2006-11-17-IllegalMove.ll
3. lib/Transforms/Scalar/JumpThreading.cpp
Not covered in this patch. It is the only use case using the DeferredDominance.
I would defer to Brian Rzycki to make this replacement.
Reviewers: chandlerc, spatel, davide, brzycki, bkramer, javed.absar
Subscribers: qcolombet, sanjoy, nemanjai, nhaehnle, jlebar, tpr, kbarton, RKSimon, wmi, arsenm, llvm-commits
Differential Revision: https://reviews.llvm.org/D48202
llvm-svn: 335183
2018-06-21 06:01:04 +08:00
|
|
|
; KNL-32: # %bb.0: # %allocas
|
2016-02-01 15:56:09 +08:00
|
|
|
; KNL-32-NEXT: pushl %esi
|
|
|
|
; KNL-32-NEXT: .cfi_def_cfa_offset 8
|
|
|
|
; KNL-32-NEXT: .cfi_offset %esi, -8
|
2018-11-10 03:05:51 +08:00
|
|
|
; KNL-32-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
|
2017-05-30 02:27:00 +08:00
|
|
|
; KNL-32-NEXT: vpslld $31, %zmm0, %zmm0
|
|
|
|
; KNL-32-NEXT: vptestmd %zmm0, %zmm0, %k0
|
2016-02-01 15:56:09 +08:00
|
|
|
; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
|
|
|
; KNL-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
2018-02-21 01:41:00 +08:00
|
|
|
; KNL-32-NEXT: movl $65535, %edx # imm = 0xFFFF
|
2016-02-01 15:56:09 +08:00
|
|
|
; KNL-32-NEXT: .p2align 4, 0x90
|
|
|
|
; KNL-32-NEXT: .LBB1_1: # %for_loop599
|
|
|
|
; KNL-32-NEXT: # =>This Inner Loop Header: Depth=1
|
|
|
|
; KNL-32-NEXT: cmpl $65536, %ecx # imm = 0x10000
|
|
|
|
; KNL-32-NEXT: movl %eax, %esi
|
|
|
|
; KNL-32-NEXT: sbbl $0, %esi
|
|
|
|
; KNL-32-NEXT: movl $0, %esi
|
2018-02-21 01:41:00 +08:00
|
|
|
; KNL-32-NEXT: cmovll %edx, %esi
|
2017-05-30 02:27:00 +08:00
|
|
|
; KNL-32-NEXT: kmovw %esi, %k1
|
|
|
|
; KNL-32-NEXT: kandw %k0, %k1, %k1
|
2018-02-08 15:54:16 +08:00
|
|
|
; KNL-32-NEXT: kortestw %k1, %k1
|
2016-02-01 15:56:09 +08:00
|
|
|
; KNL-32-NEXT: jne .LBB1_1
|
2017-12-05 01:18:51 +08:00
|
|
|
; KNL-32-NEXT: # %bb.2: # %for_exit600
|
2016-02-01 15:56:09 +08:00
|
|
|
; KNL-32-NEXT: popl %esi
|
2018-04-24 18:32:08 +08:00
|
|
|
; KNL-32-NEXT: .cfi_def_cfa_offset 4
|
2016-02-01 15:56:09 +08:00
|
|
|
; KNL-32-NEXT: retl
|
|
|
|
|
allocas:
|
|
|
|
|
br label %for_test11.preheader
|
|
|
|
|
|
|
|
|
for_test11.preheader: ; preds = %for_test11.preheader, %allocas
|
|
|
|
; The branch condition is undef. The check lines in this function only name
; the %allocas, %for_loop599 and %for_exit600 blocks.
br i1 undef, label %for_loop599, label %for_test11.preheader
|
|
|
|
|
|
|
|
|
for_loop599: ; preds = %for_loop599, %for_test11.preheader
|
|
|
|
; Signed compare of the scalar argument against 65536; both operands are
; loop-invariant.
%less_i_load605_ = icmp slt i64 %a, 65536
|
|
|
|
; Splat the i1 into all 16 lanes: insert it at index 0, then shuffle with an
; all-zero index mask.
%less_i_load605__broadcast_init = insertelement <16 x i1> undef, i1 %less_i_load605_, i32 0
|
|
|
|
%less_i_load605__broadcast = shufflevector <16 x i1> %less_i_load605__broadcast_init, <16 x i1> undef, <16 x i32> zeroinitializer
|
2017-05-30 02:27:00 +08:00
|
|
|
; Combine the splat with the incoming mask %b.
%"oldMask&test607" = and <16 x i1> %less_i_load605__broadcast, %b
|
2016-02-01 15:56:09 +08:00
|
|
|
; View the 16 mask bits as a single i16 so they can be tested with one scalar
; compare.
%intmask.i894 = bitcast <16 x i1> %"oldMask&test607" to i16
|
|
|
|
%res.i895 = icmp eq i16 %intmask.i894, 0
|
|
|
|
; Leave the loop only when no mask bit is set; otherwise branch back to
; for_loop599.
br i1 %res.i895, label %for_exit600, label %for_loop599
|
|
|
|
|
|
|
|
|
for_exit600: ; preds = %for_loop599
|
|
|
|
ret void
|
|
|
|
}
|