[X86][SSE] combineTargetShuffle - permilps(shufps(load(),x)) --> permilps(shufps(x,load()))

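Moves the lowerShuffleWithSHUFPS operand-commutation code (added in rG30fcd29fe479) into combineTargetShuffle, so that cases where only the LHS load can be folded are also caught during DAG combine.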
This commit is contained in:
Simon Pilgrim 2020-01-24 15:22:56 +00:00
parent a8c3608a27
commit 3fd5d1c6e7
2 changed files with 34 additions and 22 deletions

View File

@ -13316,12 +13316,10 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
/// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
ArrayRef<int> OriginalMask, SDValue V1,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
SmallVector<int, 4> Mask(OriginalMask.begin(), OriginalMask.end());
SmallVector<int, 4> NewMask = Mask;
SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 1) {
@ -13358,14 +13356,6 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
}
} else if (NumV2Elements == 2) {
// If we are likely to fold V1 but not V2, then commute the shuffle.
if (MayFoldLoad(V1) && !MayFoldLoad(V2)) {
ShuffleVectorSDNode::commuteMask(Mask);
NewMask = Mask;
std::swap(V1, V2);
std::swap(LowV, HighV);
}
if (Mask[0] < 4 && Mask[1] < 4) {
// Handle the easy case where we have V1 in the low lanes and V2 in the
// high lanes.
@ -34598,6 +34588,28 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
}
// Attempt to commute shufps LHS loads:
// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
// Only v4f32 nodes are considered, and N must be either a VPERMILPI or a SHUFP
// whose two operands are the same value (i.e. a unary shuffle of N0).
if (VT == MVT::v4f32 &&
(X86ISD::VPERMILPI == Opcode ||
(X86ISD::SHUFP == Opcode && N.getOperand(0) == N.getOperand(1)))) {
SDValue N0 = N.getOperand(0);
// The outer shuffle immediate is operand 1 for VPERMILPI, operand 2 for SHUFP.
unsigned Imm = N.getConstantOperandVal(X86ISD::VPERMILPI == Opcode ? 1 : 2);
// Only rewrite when the inner SHUFP has no users other than N.
if (N0.getOpcode() == X86ISD::SHUFP && N->isOnlyUserOf(N0.getNode())) {
SDValue N00 = N0.getOperand(0);
SDValue N01 = N0.getOperand(1);
// Commute only if the first operand may fold as a load and the second may
// not; one-use bitcasts are looked through on both operands.
if (MayFoldLoad(peekThroughOneUseBitcasts(N00)) &&
!MayFoldLoad(peekThroughOneUseBitcasts(N01))) {
unsigned Imm1 = N0.getConstantOperandVal(2);
// Swap the low and high nibbles of the inner immediate to go with the
// swapped operand order (N01, N00) below.
Imm1 = ((Imm1 & 0x0F) << 4) | ((Imm1 & 0xF0) >> 4);
SDValue NewN0 = DAG.getNode(X86ISD::SHUFP, DL, VT, N01, N00,
DAG.getTargetConstant(Imm1, DL, MVT::i8));
// Imm ^ 0xAA flips bit 1 of each 2-bit selector in the outer immediate,
// compensating for the commuted inner shuffle exchanging its low and high
// element pairs. The result is always emitted as SHUFP(NewN0, NewN0), even
// when N was a VPERMILPI.
return DAG.getNode(X86ISD::SHUFP, DL, VT, NewN0, NewN0,
DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
}
}
}
switch (Opcode) {
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);

View File

@ -9,22 +9,22 @@ define void @PR15298(<4 x float>* nocapture %source, <8 x float>* nocapture %des
; SSE-32: # %bb.0: # %L.entry
; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; SSE-32-NEXT: movaps 304(%ecx), %xmm0
; SSE-32-NEXT: xorps %xmm0, %xmm0
; SSE-32-NEXT: xorps %xmm1, %xmm1
; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
; SSE-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
; SSE-32-NEXT: movups %xmm1, 624(%eax)
; SSE-32-NEXT: movups %xmm0, 608(%eax)
; SSE-32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,0]
; SSE-32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSE-32-NEXT: movups %xmm0, 624(%eax)
; SSE-32-NEXT: movups %xmm1, 608(%eax)
; SSE-32-NEXT: retl
;
; SSE-64-LABEL: PR15298:
; SSE-64: # %bb.0: # %L.entry
; SSE-64-NEXT: movaps 304(%rdi), %xmm0
; SSE-64-NEXT: xorps %xmm0, %xmm0
; SSE-64-NEXT: xorps %xmm1, %xmm1
; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[0,1]
; SSE-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0,1,3]
; SSE-64-NEXT: movups %xmm1, 624(%rsi)
; SSE-64-NEXT: movups %xmm0, 608(%rsi)
; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],mem[0,0]
; SSE-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2,3,1]
; SSE-64-NEXT: movups %xmm0, 624(%rsi)
; SSE-64-NEXT: movups %xmm1, 608(%rsi)
; SSE-64-NEXT: retq
;
; AVX-32-LABEL: PR15298: