[x86] commute blendvb with constant condition op to allow load folding
This is a narrow fix for one of the problems mentioned in PR27780: https://bugs.llvm.org/show_bug.cgi?id=27780

I looked at more general solutions, but it's a mess. We canonicalize shuffle masks based on the number of elements accessed from each operand, and that's not optional. If you remove that, we'll crash because we fail to match isel patterns. So this patch waits until we're sure we have a blendvb with a constant condition and only then commutes based on the load potential. Other cases like blend-with-immediate are already handled elsewhere, so this is probably not a common problem anyway.

I didn't use "MayFoldLoad" because that checks for one-use, and in these cases we've screwed that up by creating a temporary PSHUFB using these operands that we're counting on to be killed later. Undoing that didn't look like a simple task because it's intertwined with determining whether we actually use both operands of the shuffle or not.

Differential Revision: https://reviews.llvm.org/D53737

llvm-svn: 345390
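To make the "commute based on the load potential" step concrete, here is a minimal standalone sketch in plain C++. The names and the Operand struct are hypothetical; it stands in for the ShuffleVectorSDNode::commuteMask plus std::swap(V1, V2) calls in the diff below, without any of the SelectionDAG machinery:

```cpp
#include <cstdio>
#include <utility>
#include <vector>

// Stand-in for an SDValue operand: all the heuristic reads is whether the
// value is a plain ("normal") load that x86 could fold into blendvb.
struct Operand {
  const char *Name;
  bool IsNormalLoad;
};

// Mirrors the shape of the new code: if only V2 is a plain load, swap the
// operands and remap the shuffle mask across the operand boundary so the
// load ends up where blendvb can fold it from. For an N-element two-input
// shuffle, indices [0, N) pick from V1 and [N, 2N) pick from V2.
static void commuteForLoadFolding(Operand &V1, Operand &V2,
                                  std::vector<int> &Mask) {
  if (V1.IsNormalLoad || !V2.IsNormalLoad)
    return;
  const int N = static_cast<int>(Mask.size());
  for (int &M : Mask)
    if (M >= 0)                    // leave undef (-1) lanes alone
      M = (M < N) ? M + N : M - N; // V1 lanes now come from V2 and vice versa
  std::swap(V1, V2);
}

int main() {
  Operand V1 = {"reg", false}, V2 = {"mem", true};
  std::vector<int> Mask = {0, 5, 2, 7}; // lanes 0,2 from V1; lanes 1,3 from V2
  commuteForLoadFolding(V1, V2, Mask);
  std::printf("V1=%s V2=%s mask={%d,%d,%d,%d}\n", V1.Name, V2.Name, Mask[0],
              Mask[1], Mask[2], Mask[3]); // V1=mem V2=reg mask={4,1,6,3}
  return 0;
}
```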
commit 6b40768f5a (parent 7575c6d01b)
@@ -10068,6 +10068,15 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     // type.
     MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
 
+    // x86 allows load folding with blendvb from the 2nd source operand. But
+    // we are still using LLVM select here (see comment below), so that's V1.
+    // If V2 can be load-folded and V1 cannot be load-folded, then commute to
+    // allow that load-folding possibility.
+    if (!ISD::isNormalLoad(V1.getNode()) && ISD::isNormalLoad(V2.getNode())) {
+      ShuffleVectorSDNode::commuteMask(Mask);
+      std::swap(V1, V2);
+    }
+
     // Compute the VSELECT mask. Note that VSELECT is really confusing in the
     // mix of LLVM's code generator and the x86 backend. We tell the code
     // generator that boolean values in the elements of an x86 vector register
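The visible effect in the test diffs below is that the blend-control constant is inverted (255 and 0 swap) because the operands exchanged roles, and the separate load of (%rdi) disappears into the blend. A minimal sketch of that inversion, illustrative only, using the first eight bytes of the SSE41 constant from the diff:

```cpp
#include <array>
#include <cstdio>

int main() {
  // First eight bytes of the SSE41 blend constant before the change.
  std::array<unsigned char, 8> C = {255, 255, 0, 255, 0, 0, 0, 255};
  for (auto &B : C)
    B = static_cast<unsigned char>(255 - B); // commuted blend picks the other source
  for (auto B : C)
    std::printf("%d,", static_cast<int>(B)); // prints 0,0,255,0,255,255,255,0,
  std::printf("\n");
  return 0;
}
```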
@@ -601,17 +601,15 @@ define <16 x i8> @load_fold_pblendvb(<16 x i8>* %px, <16 x i8> %y) {
 ; SSE41-LABEL: load_fold_pblendvb:
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    movdqa (%rdi), %xmm2
-; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
-; SSE41-NEXT:    movdqa %xmm2, %xmm0
+; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; SSE41-NEXT:    pblendvb %xmm0, (%rdi), %xmm1
+; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1OR2-LABEL: load_fold_pblendvb:
 ; AVX1OR2:       # %bb.0:
-; AVX1OR2-NEXT:    vmovdqa (%rdi), %xmm1
-; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; AVX1OR2-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1OR2-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX1OR2-NEXT:    vpblendvb %xmm1, (%rdi), %xmm0, %xmm0
 ; AVX1OR2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_fold_pblendvb:
@@ -1656,9 +1656,8 @@ define <32 x i8> @load_fold_pblendvb(<32 x i8>* %px, <32 x i8> %y) {
 ;
 ; AVX2-LABEL: load_fold_pblendvb:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa (%rdi), %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255,255,255,0,255,0,0,0,255]
-; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
+; AVX2-NEXT:    vpblendvb %ymm1, (%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_fold_pblendvb: