forked from OSchip/llvm-project
[X86] Improved lowering of v4x32 build_vector dag nodes.
This patch improves the lowering of v4f32 and v4i32 build_vector dag nodes that are known to have at least two non-zero elements. With this patch, a build_vector that performs a blend with zero is converted into a shuffle. This is done to let the shuffle legalizer expand the dag node in an optimal way. For example, if we know that a build_vector performs a blend with zero, we can try to lower it as a movq/blend instead of always selecting an insertps. This patch also improves the logic that lowers a build_vector into an insertps with zero masking. See for example the extra test cases added to the test file sse41.ll. Differential Revision: http://reviews.llvm.org/D6311 llvm-svn: 222375
This commit is contained in:
parent
56c0eb2d90
commit
1b657bfcc8
|
@ -5740,76 +5740,109 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
|
|||
}
|
||||
|
||||
/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
|
||||
static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
|
||||
unsigned NonZeros, unsigned NumNonZero,
|
||||
unsigned NumZero, SelectionDAG &DAG,
|
||||
static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
|
||||
const X86Subtarget *Subtarget,
|
||||
const TargetLowering &TLI) {
|
||||
// We know there's at least one non-zero element
|
||||
unsigned FirstNonZeroIdx = 0;
|
||||
SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
|
||||
while (FirstNonZero.getOpcode() == ISD::UNDEF ||
|
||||
X86::isZeroNode(FirstNonZero)) {
|
||||
++FirstNonZeroIdx;
|
||||
FirstNonZero = Op->getOperand(FirstNonZeroIdx);
|
||||
// Find all zeroable elements.
|
||||
bool Zeroable[4];
|
||||
for (int i=0; i < 4; ++i) {
|
||||
SDValue Elt = Op->getOperand(i);
|
||||
Zeroable[i] = (Elt.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elt));
|
||||
}
|
||||
assert(std::count_if(&Zeroable[0], &Zeroable[4],
|
||||
[](bool M) { return !M; }) > 1 &&
|
||||
"We expect at least two non-zero elements!");
|
||||
|
||||
if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
|
||||
!isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
|
||||
return SDValue();
|
||||
|
||||
SDValue V = FirstNonZero.getOperand(0);
|
||||
MVT VVT = V.getSimpleValueType();
|
||||
if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
|
||||
return SDValue();
|
||||
|
||||
unsigned FirstNonZeroDst =
|
||||
cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
|
||||
unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
|
||||
unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
|
||||
unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
|
||||
|
||||
for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
|
||||
SDValue Elem = Op.getOperand(Idx);
|
||||
if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
|
||||
// We only know how to deal with build_vector nodes where elements are either
|
||||
// zeroable or extract_vector_elt with constant index.
|
||||
SDValue FirstNonZero;
|
||||
for (int i=0; i < 4; ++i) {
|
||||
if (Zeroable[i])
|
||||
continue;
|
||||
|
||||
// TODO: What else can be here? Deal with it.
|
||||
if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
|
||||
SDValue Elt = Op->getOperand(i);
|
||||
if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
|
||||
!isa<ConstantSDNode>(Elt.getOperand(1)))
|
||||
return SDValue();
|
||||
|
||||
// TODO: Some optimizations are still possible here
|
||||
// ex: Getting one element from a vector, and the rest from another.
|
||||
if (Elem.getOperand(0) != V)
|
||||
return SDValue();
|
||||
|
||||
unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
|
||||
if (Dst == Idx)
|
||||
++CorrectIdx;
|
||||
else if (IncorrectIdx == -1U) {
|
||||
IncorrectIdx = Idx;
|
||||
IncorrectDst = Dst;
|
||||
} else
|
||||
// There was already one element with an incorrect index.
|
||||
// We can't optimize this case to an insertps.
|
||||
// Make sure that this node is extracting from a 128-bit vector.
|
||||
MVT VT = Elt.getOperand(0).getSimpleValueType();
|
||||
if (!VT.is128BitVector())
|
||||
return SDValue();
|
||||
if (!FirstNonZero.getNode())
|
||||
FirstNonZero = Elt;
|
||||
}
|
||||
|
||||
if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
|
||||
SDLoc dl(Op);
|
||||
EVT VT = Op.getSimpleValueType();
|
||||
unsigned ElementMoveMask = 0;
|
||||
if (IncorrectIdx == -1U)
|
||||
ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
|
||||
else
|
||||
ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
|
||||
assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
|
||||
SDValue V1 = FirstNonZero.getOperand(0);
|
||||
MVT VT = V1.getSimpleValueType();
|
||||
|
||||
SDValue InsertpsMask =
|
||||
DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
|
||||
return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
|
||||
// See if this build_vector can be lowered as a blend with zero.
|
||||
SDValue Elt;
|
||||
unsigned EltMaskIdx, EltIdx;
|
||||
int Mask[4];
|
||||
for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
|
||||
if (Zeroable[EltIdx]) {
|
||||
// The zero vector will be on the right hand side.
|
||||
Mask[EltIdx] = EltIdx+4;
|
||||
continue;
|
||||
}
|
||||
|
||||
Elt = Op->getOperand(EltIdx);
|
||||
// By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
|
||||
EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
|
||||
if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
|
||||
break;
|
||||
Mask[EltIdx] = EltIdx;
|
||||
}
|
||||
|
||||
return SDValue();
|
||||
if (EltIdx == 4) {
|
||||
// Let the shuffle legalizer deal with blend operations.
|
||||
SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
|
||||
if (V1.getSimpleValueType() != VT)
|
||||
V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), VT, V1);
|
||||
return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, &Mask[0]);
|
||||
}
|
||||
|
||||
// See if we can lower this build_vector to a INSERTPS.
|
||||
if (!Subtarget->hasSSE41())
|
||||
return SDValue();
|
||||
|
||||
SDValue V2 = Elt.getOperand(0);
|
||||
if (Elt == FirstNonZero)
|
||||
V1 = SDValue();
|
||||
|
||||
bool CanFold = true;
|
||||
for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
|
||||
if (Zeroable[i])
|
||||
continue;
|
||||
|
||||
SDValue Current = Op->getOperand(i);
|
||||
SDValue SrcVector = Current->getOperand(0);
|
||||
if (!V1.getNode())
|
||||
V1 = SrcVector;
|
||||
CanFold = SrcVector == V1 &&
|
||||
cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
|
||||
}
|
||||
|
||||
if (!CanFold)
|
||||
return SDValue();
|
||||
|
||||
assert(V1.getNode() && "Expected at least two non-zero elements!");
|
||||
if (V1.getSimpleValueType() != MVT::v4f32)
|
||||
V1 = DAG.getNode(ISD::BITCAST, SDLoc(V1), MVT::v4f32, V1);
|
||||
if (V2.getSimpleValueType() != MVT::v4f32)
|
||||
V2 = DAG.getNode(ISD::BITCAST, SDLoc(V2), MVT::v4f32, V2);
|
||||
|
||||
// Ok, we can emit an INSERTPS instruction.
|
||||
unsigned ZMask = 0;
|
||||
for (int i = 0; i < 4; ++i)
|
||||
if (Zeroable[i])
|
||||
ZMask |= 1 << i;
|
||||
|
||||
unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
|
||||
assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
|
||||
SDValue Result = DAG.getNode(X86ISD::INSERTPS, SDLoc(Op), MVT::v4f32, V1, V2,
|
||||
DAG.getIntPtrConstant(InsertPSMask));
|
||||
return DAG.getNode(ISD::BITCAST, SDLoc(Op), VT, Result);
|
||||
}
|
||||
|
||||
/// getVShift - Return a vector logical shift node.
|
||||
|
@ -6997,8 +7030,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
|
|||
|
||||
// If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
|
||||
if (EVTBits == 32 && NumElems == 4) {
|
||||
SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
|
||||
NumZero, DAG, Subtarget, *this);
|
||||
SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this);
|
||||
if (V.getNode())
|
||||
return V;
|
||||
}
|
||||
|
|
|
@ -302,17 +302,8 @@ define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
|
|||
define <4 x i32> @PR19721(<4 x i32> %i) {
|
||||
; CHECK-LABEL: PR19721:
|
||||
; CHECK: ## BB#0:
|
||||
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
|
||||
; CHECK-NEXT: movd %xmm1, %eax
|
||||
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,1,2,3]
|
||||
; CHECK-NEXT: movd %xmm1, %ecx
|
||||
; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
||||
; CHECK-NEXT: pxor %xmm0, %xmm0
|
||||
; CHECK-NEXT: xorps %xmm1, %xmm1
|
||||
; CHECK-NEXT: movss %xmm1, %xmm0
|
||||
; CHECK-NEXT: movd %ecx, %xmm1
|
||||
; CHECK-NEXT: movd %eax, %xmm2
|
||||
; CHECK-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
|
||||
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,1]
|
||||
; CHECK-NEXT: retl
|
||||
%bc = bitcast <4 x i32> %i to i128
|
||||
%insert = and i128 %bc, -4294967296
|
||||
|
|
|
@ -423,16 +423,18 @@ define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
|
|||
ret <4 x i32> %result
|
||||
}
|
||||
|
||||
;;;;;; Shuffles optimizable with a single insertps instruction
|
||||
;;;;;; Shuffles optimizable with a single insertps or blend instruction
|
||||
define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
|
||||
; X32-LABEL: shuf_XYZ0:
|
||||
; X32: ## BB#0:
|
||||
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],zero
|
||||
; X32-NEXT: xorps %xmm1, %xmm1
|
||||
; X32-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: shuf_XYZ0:
|
||||
; X64: ## BB#0:
|
||||
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],zero
|
||||
; X64-NEXT: xorps %xmm1, %xmm1
|
||||
; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
|
||||
; X64-NEXT: retq
|
||||
%vecext = extractelement <4 x float> %x, i32 0
|
||||
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
|
||||
|
@ -447,12 +449,12 @@ define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
|
|||
define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
|
||||
; X32-LABEL: shuf_XY00:
|
||||
; X32: ## BB#0:
|
||||
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],zero,zero
|
||||
; X32-NEXT: movq %xmm0, %xmm0
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: shuf_XY00:
|
||||
; X64: ## BB#0:
|
||||
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],zero,zero
|
||||
; X64-NEXT: movq %xmm0, %xmm0
|
||||
; X64-NEXT: retq
|
||||
%vecext = extractelement <4 x float> %x, i32 0
|
||||
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
|
||||
|
@ -595,12 +597,14 @@ define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
|
|||
define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
|
||||
; X32-LABEL: i32_shuf_XYZ0:
|
||||
; X32: ## BB#0:
|
||||
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],zero
|
||||
; X32-NEXT: pxor %xmm1, %xmm1
|
||||
; X32-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: i32_shuf_XYZ0:
|
||||
; X64: ## BB#0:
|
||||
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],zero
|
||||
; X64-NEXT: pxor %xmm1, %xmm1
|
||||
; X64-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
|
||||
; X64-NEXT: retq
|
||||
%vecext = extractelement <4 x i32> %x, i32 0
|
||||
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
|
||||
|
@ -615,12 +619,12 @@ define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
|
|||
define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
|
||||
; X32-LABEL: i32_shuf_XY00:
|
||||
; X32: ## BB#0:
|
||||
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],zero,zero
|
||||
; X32-NEXT: movq %xmm0, %xmm0
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: i32_shuf_XY00:
|
||||
; X64: ## BB#0:
|
||||
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],zero,zero
|
||||
; X64-NEXT: movq %xmm0, %xmm0
|
||||
; X64-NEXT: retq
|
||||
%vecext = extractelement <4 x i32> %x, i32 0
|
||||
%vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
|
||||
|
@ -764,15 +768,15 @@ define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
|
|||
define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
|
||||
; X32-LABEL: test_insertps_no_undef:
|
||||
; X32: ## BB#0:
|
||||
; X32-NEXT: movaps %xmm0, %xmm1
|
||||
; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],zero
|
||||
; X32-NEXT: xorps %xmm1, %xmm1
|
||||
; X32-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
|
||||
; X32-NEXT: maxps %xmm1, %xmm0
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: test_insertps_no_undef:
|
||||
; X64: ## BB#0:
|
||||
; X64-NEXT: movaps %xmm0, %xmm1
|
||||
; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],zero
|
||||
; X64-NEXT: xorps %xmm1, %xmm1
|
||||
; X64-NEXT: blendps {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3]
|
||||
; X64-NEXT: maxps %xmm1, %xmm0
|
||||
; X64-NEXT: retq
|
||||
%vecext = extractelement <4 x float> %x, i32 0
|
||||
|
@ -1022,3 +1026,123 @@ define void @insertps_pr20411(i32* noalias nocapture %RET) #1 {
|
|||
store <4 x i32> %shuffle117, <4 x i32>* %ptrcast, align 4
|
||||
ret void
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
|
||||
; X32-LABEL: insertps_4:
|
||||
; X32: ## BB#0:
|
||||
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: insertps_4:
|
||||
; X64: ## BB#0:
|
||||
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%vecext = extractelement <4 x float> %A, i32 0
|
||||
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
|
||||
%vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
|
||||
%vecext2 = extractelement <4 x float> %B, i32 2
|
||||
%vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
|
||||
%vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
|
||||
ret <4 x float> %vecinit4
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
|
||||
; X32-LABEL: insertps_5:
|
||||
; X32: ## BB#0:
|
||||
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: insertps_5:
|
||||
; X64: ## BB#0:
|
||||
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%vecext = extractelement <4 x float> %A, i32 0
|
||||
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
|
||||
%vecext1 = extractelement <4 x float> %B, i32 1
|
||||
%vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
|
||||
%vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
|
||||
%vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
|
||||
ret <4 x float> %vecinit4
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
|
||||
; X32-LABEL: insertps_6:
|
||||
; X32: ## BB#0:
|
||||
; X32-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: insertps_6:
|
||||
; X64: ## BB#0:
|
||||
; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%vecext = extractelement <4 x float> %A, i32 1
|
||||
%vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
|
||||
%vecext1 = extractelement <4 x float> %B, i32 2
|
||||
%vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
|
||||
%vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
|
||||
ret <4 x float> %vecinit3
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
|
||||
; X32-LABEL: insertps_7:
|
||||
; X32: ## BB#0:
|
||||
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: insertps_7:
|
||||
; X64: ## BB#0:
|
||||
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%vecext = extractelement <4 x float> %A, i32 0
|
||||
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
|
||||
%vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
|
||||
%vecext2 = extractelement <4 x float> %B, i32 1
|
||||
%vecinit3 = insertelement <4 x float> %vecinit1, float %vecext2, i32 2
|
||||
%vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
|
||||
ret <4 x float> %vecinit4
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
|
||||
; X32-LABEL: insertps_8:
|
||||
; X32: ## BB#0:
|
||||
; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: insertps_8:
|
||||
; X64: ## BB#0:
|
||||
; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%vecext = extractelement <4 x float> %A, i32 0
|
||||
%vecinit = insertelement <4 x float> undef, float %vecext, i32 0
|
||||
%vecext1 = extractelement <4 x float> %B, i32 0
|
||||
%vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
|
||||
%vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 2
|
||||
%vecinit4 = insertelement <4 x float> %vecinit3, float 0.000000e+00, i32 3
|
||||
ret <4 x float> %vecinit4
|
||||
}
|
||||
|
||||
define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
|
||||
; X32-LABEL: insertps_9:
|
||||
; X32: ## BB#0:
|
||||
; X32-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
|
||||
; X32-NEXT: movaps %xmm1, %xmm0
|
||||
; X32-NEXT: retl
|
||||
;
|
||||
; X64-LABEL: insertps_9:
|
||||
; X64: ## BB#0:
|
||||
; X64-NEXT: insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
|
||||
; X64-NEXT: movaps %xmm1, %xmm0
|
||||
; X64-NEXT: retq
|
||||
entry:
|
||||
%vecext = extractelement <4 x float> %A, i32 0
|
||||
%vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
|
||||
%vecext1 = extractelement <4 x float> %B, i32 2
|
||||
%vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 2
|
||||
%vecinit3 = insertelement <4 x float> %vecinit2, float 0.000000e+00, i32 3
|
||||
ret <4 x float> %vecinit3
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue