forked from OSchip/llvm-project
[X86] Prefer blendps over insertps codegen for one special case
With this patch, for this one exact case, we'll generate:

  blendps %xmm0, %xmm1, $1

instead of:

  insertps %xmm0, %xmm1, $0

If there's a memory operand available for load folding and we're optimizing for size, we'll still generate the insertps.

The detailed performance data motivation for this may be found in D7866; in summary, blendps has 2-3x throughput vs. insertps on widely used chips.

Differential Revision: http://reviews.llvm.org/D8332

llvm-svn: 232850
This commit is contained in:
parent
03ad616143
commit
c88f724fed
|
@ -10550,16 +10550,29 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
|
||||||
}
|
}
|
||||||
|
|
||||||
if (EltVT == MVT::f32) {
|
if (EltVT == MVT::f32) {
|
||||||
// Bits [7:6] of the constant are the source select. This will always be
|
// Bits [7:6] of the constant are the source select. This will always be
|
||||||
// zero here. The DAG Combiner may combine an extract_elt index into
|
// zero here. The DAG Combiner may combine an extract_elt index into
|
||||||
// these
|
// these bits. For example (insert (extract, 3), 2) could be matched by
|
||||||
// bits. For example (insert (extract, 3), 2) could be matched by
|
// putting the '3' into bits [7:6] of X86ISD::INSERTPS.
|
||||||
// putting
|
// Bits [5:4] of the constant are the destination select. This is the
|
||||||
// the '3' into bits [7:6] of X86ISD::INSERTPS.
|
// value of the incoming immediate.
|
||||||
// Bits [5:4] of the constant are the destination select. This is the
|
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
|
||||||
// value of the incoming immediate.
|
|
||||||
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
|
|
||||||
// combine either bitwise AND or insert of float 0.0 to set these bits.
|
// combine either bitwise AND or insert of float 0.0 to set these bits.
|
||||||
|
|
||||||
|
const Function *F = DAG.getMachineFunction().getFunction();
|
||||||
|
bool MinSize = F->hasFnAttribute(Attribute::MinSize);
|
||||||
|
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
|
||||||
|
// If this is an insertion of 32-bits into the low 32-bits of
|
||||||
|
// a vector, we prefer to generate a blend with immediate rather
|
||||||
|
// than an insertps. Blends are simpler operations in hardware and so
|
||||||
|
// will always have equal or better performance than insertps.
|
||||||
|
// But if optimizing for size and there's a load folding opportunity,
|
||||||
|
// generate insertps because blendps does not have a 32-bit memory
|
||||||
|
// operand form.
|
||||||
|
N2 = DAG.getIntPtrConstant(1);
|
||||||
|
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
|
||||||
|
return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
|
||||||
|
}
|
||||||
N2 = DAG.getIntPtrConstant(IdxVal << 4);
|
N2 = DAG.getIntPtrConstant(IdxVal << 4);
|
||||||
// Create this as a scalar to vector..
|
// Create this as a scalar to vector..
|
||||||
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
|
N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
|
||||||
|
|
|
@ -199,28 +199,51 @@ define <4 x float> @insertps_1(<4 x float> %t1, <4 x float> %t2) nounwind {
|
||||||
|
|
||||||
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
|
declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
|
||||||
|
|
||||||
define <4 x float> @insertps_2(<4 x float> %t1, float %t2) nounwind {
|
; When optimizing for speed, prefer blendps over insertps even if it means we have to
|
||||||
; X32-LABEL: insertps_2:
|
; generate a separate movss to load the scalar operand.
|
||||||
|
define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
|
||||||
|
; X32-LABEL: blendps_not_insertps_1:
|
||||||
; X32: ## BB#0:
|
; X32: ## BB#0:
|
||||||
; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
|
; X32-NEXT: movss {{.*#+}} xmm1
|
||||||
|
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
|
||||||
; X32-NEXT: retl
|
; X32-NEXT: retl
|
||||||
;
|
;
|
||||||
; X64-LABEL: insertps_2:
|
; X64-LABEL: blendps_not_insertps_1:
|
||||||
; X64: ## BB#0:
|
; X64: ## BB#0:
|
||||||
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
|
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
|
||||||
; X64-NEXT: retq
|
; X64-NEXT: retq
|
||||||
%tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
|
%tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
|
||||||
ret <4 x float> %tmp1
|
ret <4 x float> %tmp1
|
||||||
}
|
}
|
||||||
define <4 x float> @insertps_3(<4 x float> %t1, <4 x float> %t2) nounwind {
|
|
||||||
; X32-LABEL: insertps_3:
|
; When optimizing for size, generate an insertps if there's a load fold opportunity.
|
||||||
|
; The difference between i386 and x86-64 ABIs for the float operand means we should
|
||||||
|
; generate an insertps for X32 but not for X64!
|
||||||
|
define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
|
||||||
|
; X32-LABEL: insertps_or_blendps:
|
||||||
; X32: ## BB#0:
|
; X32: ## BB#0:
|
||||||
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
|
; X32-NEXT: insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
|
||||||
; X32-NEXT: retl
|
; X32-NEXT: retl
|
||||||
;
|
;
|
||||||
; X64-LABEL: insertps_3:
|
; X64-LABEL: insertps_or_blendps:
|
||||||
; X64: ## BB#0:
|
; X64: ## BB#0:
|
||||||
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
|
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
|
||||||
|
; X64-NEXT: retq
|
||||||
|
%tmp1 = insertelement <4 x float> %t1, float %t2, i32 0
|
||||||
|
ret <4 x float> %tmp1
|
||||||
|
}
|
||||||
|
|
||||||
|
; An insert into the low 32-bits of a vector from the low 32-bits of another vector
|
||||||
|
; is always just a blendps because blendps is never more expensive than insertps.
|
||||||
|
define <4 x float> @blendps_not_insertps_2(<4 x float> %t1, <4 x float> %t2) nounwind {
|
||||||
|
; X32-LABEL: blendps_not_insertps_2:
|
||||||
|
; X32: ## BB#0:
|
||||||
|
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
|
||||||
|
; X32-NEXT: retl
|
||||||
|
;
|
||||||
|
; X64-LABEL: blendps_not_insertps_2:
|
||||||
|
; X64: ## BB#0:
|
||||||
|
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
|
||||||
; X64-NEXT: retq
|
; X64-NEXT: retq
|
||||||
%tmp2 = extractelement <4 x float> %t2, i32 0
|
%tmp2 = extractelement <4 x float> %t2, i32 0
|
||||||
%tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
|
%tmp1 = insertelement <4 x float> %t1, float %tmp2, i32 0
|
||||||
|
|
Loading…
Reference in New Issue