[X86] Prefer blendps over insertps codegen for one special case

With this patch, for this one exact case, we'll generate:

  blendps %xmm0, %xmm1, $1

instead of:

  insertps %xmm0, %xmm1, $0

If there's a memory operand available for load folding and we're
optimizing for size, we'll still generate the insertps.

The detailed performance data motivation for this may be found in D7866; 
in summary, blendps has 2-3x throughput vs. insertps on widely used chips.

Differential Revision: http://reviews.llvm.org/D8332

llvm-svn: 232850
This commit is contained in:
Sanjay Patel 2015-03-20 21:19:52 +00:00
parent 03ad616143
commit c88f724fed
2 changed files with 55 additions and 19 deletions

View File

@ -10550,16 +10550,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
}
if (EltVT == MVT::f32) {
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into
// these
// bits. For example (insert (extract, 3), 2) could be matched by
// putting
// the '3' into bits [7:6] of X86ISD::INSERTPS.
// Bits [5:4] of the constant are the destination select. This is the
// value of the incoming immediate.
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// Bits [7:6] of the constant are the source select. This will always be
// zero here. The DAG Combiner may combine an extract_elt index into
// these bits. For example (insert (extract, 3), 2) could be matched by
// putting the '3' into bits [7:6] of X86ISD::INSERTPS.
// Bits [5:4] of the constant are the destination select. This is the
// value of the incoming immediate.
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
const Function *F = DAG.getMachineFunction().getFunction();
bool MinSize = F->hasFnAttribute(Attribute::MinSize);
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
// than an insertps. Blends are simpler operations in hardware and so
// will always have equal or better performance than insertps.
// But if optimizing for size and there's a load folding opportunity,
// generate insertps because blendps does not have a 32-bit memory
// operand form.
N2 = DAG.getIntPtrConstant(1);
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
}
N2 = DAG.getIntPtrConstant(IdxVal << 4);
// Create this as a scalar to vector..
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);

View File

@ -199,28 +199,51 @@ define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
; X32-LABEL: insertps_2:
; When optimizing for speed, prefer blendps over insertps even if it means we have to
; generate a separate movss to load the scalar operand.
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
; X32-LABEL: blendps_not_insertps_1:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT: movss {{.*#+}} xmm1
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_2:
; X64-LABEL: blendps_not_insertps_1:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
%tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
ret <4 x float> %tmp1
}
define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: insertps_3:
; When optimizing for size, generate an insertps if there's a load fold opportunity.
; The difference between i386 and x86-64 ABIs for the float operand means we should
; generate an insertps for X32 but not for X64!
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
; X32-LABEL: insertps_or_blendps:
; X32: ## BB#0:
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: insertps_3:
; X64-LABEL: insertps_or_blendps:
; X64: ## BB#0:
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
%tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
ret <4 x float> %tmp1
}
; An insert into the low 32-bits of a vector from the low 32-bits of another vector
; is always just a blendps because blendps is never more expensive than insertps.
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
; X32-LABEL: blendps_not_insertps_2:
; X32: ## BB#0:
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X32-NEXT: retl
;
; X64-LABEL: blendps_not_insertps_2:
; X64: ## BB#0:
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
; X64-NEXT: retq
%tmp2 = extractelement <4 x float> %t2, i32 0
%tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0