[X86] Add a DAG combine to combine INSERTPS and VBROADCAST of a scalar load. Remove corresponding isel patterns.

We had an isel pattern to perform this, but its better to
do it in DAG combine as a simplification. This also fixes the lack
of patterns for AVX512 targets.

llvm-svn: 370294
This commit is contained in:
Craig Topper 2019-08-29 05:48:48 +00:00
parent 1aadf6f39f
commit 1ec5c204b8
3 changed files with 53 additions and 59 deletions

View File

@ -33550,46 +33550,57 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
// Attempt to merge insertps Op0 with an inner target shuffle node.
SmallVector<int, 8> TargetMask0;
SmallVector<SDValue, 2> Ops0;
if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
return SDValue();
if (setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) {
bool Updated = false;
bool UseInput00 = false;
bool UseInput01 = false;
for (int i = 0; i != 4; ++i) {
int M = TargetMask0[i];
if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
// No change if element is already zero or the inserted element.
continue;
} else if (isUndefOrZero(M)) {
// If the target mask is undef/zero then we must zero the element.
InsertPSMask |= (1u << i);
Updated = true;
continue;
}
bool Updated = false;
bool UseInput00 = false;
bool UseInput01 = false;
for (int i = 0; i != 4; ++i) {
int M = TargetMask0[i];
if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
// No change if element is already zero or the inserted element.
continue;
} else if (isUndefOrZero(M)) {
// If the target mask is undef/zero then we must zero the element.
InsertPSMask |= (1u << i);
Updated = true;
continue;
// The input vector element must be inline.
if (M != i && M != (i + 4))
return SDValue();
// Determine which inputs of the target shuffle we're using.
UseInput00 |= (0 <= M && M < 4);
UseInput01 |= (4 <= M);
}
// The input vector element must be inline.
if (M != i && M != (i + 4))
return SDValue();
// If we're not using both inputs of the target shuffle then use the
// referenced input directly.
if (UseInput00 && !UseInput01) {
Updated = true;
Op0 = Ops0[0];
} else if (!UseInput00 && UseInput01) {
Updated = true;
Op0 = Ops0[1];
}
// Determine which inputs of the target shuffle we're using.
UseInput00 |= (0 <= M && M < 4);
UseInput01 |= (4 <= M);
if (Updated)
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
}
// If we're not using both inputs of the target shuffle then use the
// referenced input directly.
if (UseInput00 && !UseInput01) {
Updated = true;
Op0 = Ops0[0];
} else if (!UseInput00 && UseInput01) {
Updated = true;
Op0 = Ops0[1];
}
if (Updated)
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
DAG.getConstant(InsertPSMask, DL, MVT::i8));
// If we're inserting an element from a vbroadcast of a load, fold the
// load into the X86insertps instruction. We need to convert the scalar
// load to a vector and clear the source lane of the INSERTPS control.
if (Op1.getOpcode() == X86ISD::VBROADCAST && Op1.hasOneUse() &&
Op1.getOperand(0).hasOneUse() &&
!Op1.getOperand(0).getValueType().isVector() &&
ISD::isNormalLoad(Op1.getOperand(0).getNode()))
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0,
DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT,
Op1.getOperand(0)),
DAG.getConstant(InsertPSMask & 0x3f, DL, MVT::i8));
return SDValue();
}

View File

@ -5323,19 +5323,6 @@ let ExeDomain = SSEPackedSingle in {
defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1>;
}
let Predicates = [UseAVX] in {
// If we're inserting an element from a vbroadcast of a load, fold the
// load into the X86insertps instruction.
// FIXME: Why are these here? This looks like a demanded bits issue.
// FIXME: Missing AVX512 equivalents.
def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
(X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
(VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
(X86VBroadcast (v4f32 (nonvolatile_load addr:$src2))), imm:$src3)),
(VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
}
//===----------------------------------------------------------------------===//
// SSE4.1 - Round Instructions
//===----------------------------------------------------------------------===//

View File

@ -1559,9 +1559,8 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
; X86-AVX512-NEXT: vbroadcastss (%ecx,%eax,4), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0c,0x81]
; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX512-NEXT: vinsertps $48, (%ecx,%eax,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0x81,0x30]
; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_broadcast_loadf32:
@ -1578,9 +1577,8 @@ define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocap
;
; X64-AVX512-LABEL: insertps_from_broadcast_loadf32:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vbroadcastss (%rdi,%rsi,4), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0c,0xb7]
; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX512-NEXT: vinsertps $48, (%rdi,%rsi,4), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x04,0xb7,0x30]
; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%1 = getelementptr inbounds float, float* %fb, i64 %index
%2 = load float, float* %1, align 4
@ -1611,9 +1609,8 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
; X86-AVX512-LABEL: insertps_from_broadcast_loadv4f32:
; X86-AVX512: ## %bb.0:
; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
; X86-AVX512-NEXT: vbroadcastss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x08]
; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0]
; X86-AVX512-NEXT: vinsertps $48, (%eax), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x00,0x30]
; X86-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
; X86-AVX512-NEXT: retl ## encoding: [0xc3]
;
; X64-SSE-LABEL: insertps_from_broadcast_loadv4f32:
@ -1631,9 +1628,8 @@ define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float
;
; X64-AVX512-LABEL: insertps_from_broadcast_loadv4f32:
; X64-AVX512: ## %bb.0:
; X64-AVX512-NEXT: vbroadcastss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0x0f]
; X64-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[0]
; X64-AVX512-NEXT: vinsertps $48, (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0x07,0x30]
; X64-AVX512-NEXT: ## xmm0 = xmm0[0,1,2],mem[0]
; X64-AVX512-NEXT: retq ## encoding: [0xc3]
%1 = load <4 x float>, <4 x float>* %b, align 4
%2 = extractelement <4 x float> %1, i32 0