[X86] Prefer blendps over insertps codegen for one special case
With this patch, for this one exact case, we'll generate:

  blendps %xmm0, %xmm1, $1

instead of:

  insertps %xmm0, %xmm1, $0

If there's a memory operand available for load folding and we're optimizing
for size, we'll still generate the insertps.

The detailed performance data motivation for this may be found in D7866; in
summary, blendps has 2-3x throughput vs. insertps on widely used chips.

Differential Revision: http://reviews.llvm.org/D8332

llvm-svn: 232850
parent 03ad616143
commit c88f724fed
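A minimal intrinsics-level illustration (not part of the commit) of why the two forms are
interchangeable for this case; compiled with SSE4.1 enabled, both return { b[0], a[1], a[2], a[3] }:

#include <smmintrin.h> // SSE4.1 intrinsics; build with -msse4.1

// blendps $1: replace lane 0 of 'a' with lane 0 of 'b', keep lanes 1-3 of 'a'.
__m128 via_blendps(__m128 a, __m128 b) {
  return _mm_blend_ps(a, b, 0x1);
}

// insertps $0: source select 0, destination select 0, zero mask 0,
// i.e. copy b[0] into a[0] and zero nothing -- the same result.
__m128 via_insertps(__m128 a, __m128 b) {
  return _mm_insert_ps(a, b, 0x0);
}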
lib/Target/X86/X86ISelLowering.cpp
@@ -10550,16 +10550,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   }
 
   if (EltVT == MVT::f32) {
-    // Bits [7:6] of the constant are the source select. This will always be
-    //  zero here. The DAG Combiner may combine an extract_elt index into
-    //  these
-    //  bits. For example (insert (extract, 3), 2) could be matched by
-    //  putting
-    //  the '3' into bits [7:6] of X86ISD::INSERTPS.
-    // Bits [5:4] of the constant are the destination select. This is the
-    //  value of the incoming immediate.
-    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
+    // Bits [7:6] of the constant are the source select. This will always be
+    //   zero here. The DAG Combiner may combine an extract_elt index into
+    //   these bits. For example (insert (extract, 3), 2) could be matched by
+    //   putting the '3' into bits [7:6] of X86ISD::INSERTPS.
+    // Bits [5:4] of the constant are the destination select. This is the
+    //   value of the incoming immediate.
+    // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
     //   combine either bitwise AND or insert of float 0.0 to set these bits.
+
+    const Function *F = DAG.getMachineFunction().getFunction();
+    bool MinSize = F->hasFnAttribute(Attribute::MinSize);
+    if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+      // If this is an insertion of 32-bits into the low 32-bits of
+      // a vector, we prefer to generate a blend with immediate rather
+      // than an insertps. Blends are simpler operations in hardware and so
+      // will always have equal or better performance than insertps.
+      // But if optimizing for size and there's a load folding opportunity,
+      // generate insertps because blendps does not have a 32-bit memory
+      // operand form.
+      N2 = DAG.getIntPtrConstant(1);
+      N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
+      return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
+    }
     N2 = DAG.getIntPtrConstant(IdxVal << 4);
     // Create this as a scalar to vector..
     N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
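Restated outside of SelectionDAG, the new choice reduces to this standalone sketch (my
paraphrase, not code from the patch; ScalarIsFoldableLoad stands in for MayFoldLoad(N1)):

enum class F32InsertLowering { BlendPS, InsertPS };

// For an f32 insert into lane IdxVal: blendps is never slower than insertps,
// so prefer it for lane 0. The one exception is minsize with a foldable load,
// because insertps can fold a 32-bit memory operand while blendps cannot,
// so the blend would cost an extra movss.
F32InsertLowering chooseF32InsertLowering(unsigned IdxVal, bool MinSize,
                                          bool ScalarIsFoldableLoad) {
  if (IdxVal == 0 && (!MinSize || !ScalarIsFoldableLoad))
    return F32InsertLowering::BlendPS;
  return F32InsertLowering::InsertPS;
}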
test/CodeGen/X86/sse41.ll
@@ -199,28 +199,51 @@ define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
 
 declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
 
-define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
-; X32-LABEL: insertps_2:
+; When optimizing for speed, prefer blendps over insertps even if it means we have to
+; generate a separate movss to load the scalar operand.
+define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
+; X32-LABEL: blendps_not_insertps_1:
 ; X32:       ## BB#0:
-; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; X32-NEXT:    movss {{.*#+}} xmm1
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: insertps_2:
+; X64-LABEL: blendps_not_insertps_1:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
   ret <4 x float> %tmp1
 }
-define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
-; X32-LABEL: insertps_3:
+
+; When optimizing for size, generate an insertps if there's a load fold opportunity.
+; The difference between i386 and x86-64 ABIs for the float operand means we should
+; generate an insertps for X32 but not for X64!
+define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
+; X32-LABEL: insertps_or_blendps:
 ; X32:       ## BB#0:
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
-; X64-LABEL: insertps_3:
+; X64-LABEL: insertps_or_blendps:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
+  %tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
+  ret <4 x float> %tmp1
+}
+
+; An insert into the low 32-bits of a vector from the low 32-bits of another vector
+; is always just a blendps because blendps is never more expensive than insertps.
+define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
+; X32-LABEL: blendps_not_insertps_2:
+; X32:       ## BB#0:
+; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X32-NEXT:    retl
+;
+; X64-LABEL: blendps_not_insertps_2:
+; X64:       ## BB#0:
+; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; X64-NEXT:    retq
   %tmp2 = extractelement <4 x float> %t2, i32 0
   %tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
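For reference, the INSERTPS immediate layout described in the lowering comments above,
written out as a small illustrative helper (hypothetical, not an LLVM API):

#include <cstdint>

// Bits [7:6]: source lane select, bits [5:4]: destination lane select,
// bits [3:0]: zero mask (per the comments in LowerINSERT_VECTOR_ELT).
constexpr uint8_t insertpsImm(unsigned SrcIdx, unsigned DstIdx, unsigned ZMask) {
  return static_cast<uint8_t>((SrcIdx << 6) | (DstIdx << 4) | (ZMask & 0xF));
}

static_assert(insertpsImm(0, 0, 0) == 0x00, "the insertps $0 from the commit message");
static_assert(insertpsImm(3, 2, 0) == 0xE0, "the (insert (extract, 3), 2) example");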