forked from OSchip/llvm-project
[X86][SSE] Don't coalesce v4i32 extracts
We currently coalesce v4i32 extracts from all 4 elements to 2 v2i64 extracts + shifts/sign-extends. This seems to have been added back in the days when we tended to spill vectors and reload scalars, or ended up with repeated shuffles moving everything down to the 0th index. I don't think either of these is likely these days, as we have better EXTRACT_VECTOR_ELT and VECTOR_SHUFFLE handling, and the existing code tends to make it very difficult for various vector and load combines. Differential Revision: https://reviews.llvm.org/D42308 llvm-svn: 323541
This commit is contained in:
parent
d567c27c84
commit
76ede609f6
|
@ -31239,102 +31239,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
|
|||
if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
|
||||
return MinMax;
|
||||
|
||||
// Only operate on vectors of 4 elements, where the alternative shuffling
|
||||
// gets to be more expensive.
|
||||
if (SrcVT != MVT::v4i32)
|
||||
return SDValue();
|
||||
|
||||
// Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
|
||||
// single use which is a sign-extend or zero-extend, and all elements are
|
||||
// used.
|
||||
SmallVector<SDNode *, 4> Uses;
|
||||
unsigned ExtractedElements = 0;
|
||||
for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
|
||||
UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
|
||||
if (UI.getUse().getResNo() != InputVector.getResNo())
|
||||
return SDValue();
|
||||
|
||||
SDNode *Extract = *UI;
|
||||
if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
|
||||
return SDValue();
|
||||
|
||||
if (Extract->getValueType(0) != MVT::i32)
|
||||
return SDValue();
|
||||
if (!Extract->hasOneUse())
|
||||
return SDValue();
|
||||
if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
|
||||
Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
|
||||
return SDValue();
|
||||
if (!isa<ConstantSDNode>(Extract->getOperand(1)))
|
||||
return SDValue();
|
||||
|
||||
// Record which element was extracted.
|
||||
ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
|
||||
Uses.push_back(Extract);
|
||||
}
|
||||
|
||||
// If not all the elements were used, this may not be worthwhile.
|
||||
if (ExtractedElements != 15)
|
||||
return SDValue();
|
||||
|
||||
// Ok, we've now decided to do the transformation.
|
||||
// If 64-bit shifts are legal, use the extract-shift sequence,
|
||||
// otherwise bounce the vector off the cache.
|
||||
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
|
||||
SDValue Vals[4];
|
||||
|
||||
if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
|
||||
SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
|
||||
auto &DL = DAG.getDataLayout();
|
||||
EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
|
||||
SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
|
||||
DAG.getConstant(0, dl, VecIdxTy));
|
||||
SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
|
||||
DAG.getConstant(1, dl, VecIdxTy));
|
||||
|
||||
SDValue ShAmt = DAG.getConstant(
|
||||
32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
|
||||
Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
|
||||
Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
|
||||
DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
|
||||
Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
|
||||
Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
|
||||
DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
|
||||
} else {
|
||||
// Store the value to a temporary stack slot.
|
||||
SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
|
||||
SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
|
||||
MachinePointerInfo());
|
||||
|
||||
EVT ElementType = SrcVT.getVectorElementType();
|
||||
unsigned EltSize = ElementType.getSizeInBits() / 8;
|
||||
|
||||
// Replace each use (extract) with a load of the appropriate element.
|
||||
for (unsigned i = 0; i < 4; ++i) {
|
||||
uint64_t Offset = EltSize * i;
|
||||
auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
|
||||
SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
|
||||
|
||||
SDValue ScalarAddr =
|
||||
DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
|
||||
|
||||
// Load the scalar.
|
||||
Vals[i] =
|
||||
DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
|
||||
}
|
||||
}
|
||||
|
||||
// Replace the extracts
|
||||
for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
|
||||
UE = Uses.end(); UI != UE; ++UI) {
|
||||
SDNode *Extract = *UI;
|
||||
|
||||
uint64_t IdxVal = Extract->getConstantOperandVal(1);
|
||||
DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
|
||||
}
|
||||
|
||||
// The replacement was made in place; return N so it won't be revisited.
|
||||
return SDValue(N, 0);
|
||||
}
|
||||
|
||||
/// If a vector select has an operand that is -1 or 0, try to simplify the
|
||||
|
|
|
@ -7,21 +7,24 @@
|
|||
; rdar://7398554
|
||||
|
||||
; When doing vector gather-scatter index calculation with 32-bit indices,
|
||||
; use an efficient mov/shift sequence rather than shuffling each individual
|
||||
; element out of the index vector.
|
||||
; minimize shuffling of each individual element out of the index vector.
|
||||
|
||||
define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
|
||||
; LIN-SSE2-LABEL: foo:
|
||||
; LIN-SSE2: # %bb.0:
|
||||
; LIN-SSE2-NEXT: movdqa (%rsi), %xmm0
|
||||
; LIN-SSE2-NEXT: pand (%rdx), %xmm0
|
||||
; LIN-SSE2-NEXT: movd %xmm0, %eax
|
||||
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
||||
; LIN-SSE2-NEXT: movd %xmm1, %ecx
|
||||
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
|
||||
; LIN-SSE2-NEXT: movq %xmm1, %rax
|
||||
; LIN-SSE2-NEXT: movq %xmm0, %rcx
|
||||
; LIN-SSE2-NEXT: movslq %ecx, %rdx
|
||||
; LIN-SSE2-NEXT: sarq $32, %rcx
|
||||
; LIN-SSE2-NEXT: movslq %eax, %rsi
|
||||
; LIN-SSE2-NEXT: sarq $32, %rax
|
||||
; LIN-SSE2-NEXT: movd %xmm1, %edx
|
||||
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
||||
; LIN-SSE2-NEXT: movd %xmm0, %esi
|
||||
; LIN-SSE2-NEXT: cltq
|
||||
; LIN-SSE2-NEXT: movslq %ecx, %rcx
|
||||
; LIN-SSE2-NEXT: movslq %edx, %rdx
|
||||
; LIN-SSE2-NEXT: movslq %esi, %rsi
|
||||
; LIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
|
||||
; LIN-SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
|
||||
; LIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
|
||||
|
@ -32,14 +35,16 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
|
|||
; LIN-SSE4: # %bb.0:
|
||||
; LIN-SSE4-NEXT: movdqa (%rsi), %xmm0
|
||||
; LIN-SSE4-NEXT: pand (%rdx), %xmm0
|
||||
; LIN-SSE4-NEXT: pextrq $1, %xmm0, %rax
|
||||
; LIN-SSE4-NEXT: movq %xmm0, %rcx
|
||||
; LIN-SSE4-NEXT: movslq %ecx, %rdx
|
||||
; LIN-SSE4-NEXT: sarq $32, %rcx
|
||||
; LIN-SSE4-NEXT: movslq %eax, %rsi
|
||||
; LIN-SSE4-NEXT: movd %xmm0, %eax
|
||||
; LIN-SSE4-NEXT: pextrd $1, %xmm0, %ecx
|
||||
; LIN-SSE4-NEXT: pextrd $2, %xmm0, %edx
|
||||
; LIN-SSE4-NEXT: pextrd $3, %xmm0, %esi
|
||||
; LIN-SSE4-NEXT: cltq
|
||||
; LIN-SSE4-NEXT: movslq %ecx, %rcx
|
||||
; LIN-SSE4-NEXT: movslq %edx, %rdx
|
||||
; LIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
|
||||
; LIN-SSE4-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
|
||||
; LIN-SSE4-NEXT: sarq $32, %rax
|
||||
; LIN-SSE4-NEXT: movslq %esi, %rax
|
||||
; LIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
|
||||
; LIN-SSE4-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
|
||||
; LIN-SSE4-NEXT: retq
|
||||
|
@ -48,13 +53,17 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
|
|||
; WIN-SSE2: # %bb.0:
|
||||
; WIN-SSE2-NEXT: movdqa (%rdx), %xmm0
|
||||
; WIN-SSE2-NEXT: pand (%r8), %xmm0
|
||||
; WIN-SSE2-NEXT: movd %xmm0, %r8d
|
||||
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
||||
; WIN-SSE2-NEXT: movd %xmm1, %r9d
|
||||
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
|
||||
; WIN-SSE2-NEXT: movq %xmm1, %rax
|
||||
; WIN-SSE2-NEXT: movq %xmm0, %rdx
|
||||
; WIN-SSE2-NEXT: movslq %edx, %r8
|
||||
; WIN-SSE2-NEXT: sarq $32, %rdx
|
||||
; WIN-SSE2-NEXT: movslq %eax, %r9
|
||||
; WIN-SSE2-NEXT: sarq $32, %rax
|
||||
; WIN-SSE2-NEXT: movd %xmm1, %r10d
|
||||
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
||||
; WIN-SSE2-NEXT: movd %xmm0, %edx
|
||||
; WIN-SSE2-NEXT: movslq %r8d, %rax
|
||||
; WIN-SSE2-NEXT: movslq %r9d, %r8
|
||||
; WIN-SSE2-NEXT: movslq %r10d, %r9
|
||||
; WIN-SSE2-NEXT: movslq %edx, %rdx
|
||||
; WIN-SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
|
||||
; WIN-SSE2-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
|
||||
; WIN-SSE2-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
|
||||
|
@ -65,14 +74,16 @@ define <4 x double> @foo(double* %p, <4 x i32>* %i, <4 x i32>* %h) nounwind {
|
|||
; WIN-SSE4: # %bb.0:
|
||||
; WIN-SSE4-NEXT: movdqa (%rdx), %xmm0
|
||||
; WIN-SSE4-NEXT: pand (%r8), %xmm0
|
||||
; WIN-SSE4-NEXT: pextrq $1, %xmm0, %rax
|
||||
; WIN-SSE4-NEXT: movq %xmm0, %rdx
|
||||
; WIN-SSE4-NEXT: movslq %edx, %r8
|
||||
; WIN-SSE4-NEXT: sarq $32, %rdx
|
||||
; WIN-SSE4-NEXT: movslq %eax, %r9
|
||||
; WIN-SSE4-NEXT: movd %xmm0, %eax
|
||||
; WIN-SSE4-NEXT: pextrd $1, %xmm0, %edx
|
||||
; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d
|
||||
; WIN-SSE4-NEXT: pextrd $3, %xmm0, %r9d
|
||||
; WIN-SSE4-NEXT: cltq
|
||||
; WIN-SSE4-NEXT: movslq %edx, %rdx
|
||||
; WIN-SSE4-NEXT: movslq %r8d, %r8
|
||||
; WIN-SSE4-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
|
||||
; WIN-SSE4-NEXT: movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
|
||||
; WIN-SSE4-NEXT: sarq $32, %rax
|
||||
; WIN-SSE4-NEXT: movslq %r9d, %rax
|
||||
; WIN-SSE4-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
|
||||
; WIN-SSE4-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
|
||||
; WIN-SSE4-NEXT: retq
|
||||
|
@ -127,22 +138,22 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
|
|||
; LIN-SSE2: # %bb.0:
|
||||
; LIN-SSE2-NEXT: movdqa (%rsi), %xmm0
|
||||
; LIN-SSE2-NEXT: pand (%rdx), %xmm0
|
||||
; LIN-SSE2-NEXT: movd %xmm0, %eax
|
||||
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
||||
; LIN-SSE2-NEXT: movd %xmm1, %edx
|
||||
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
|
||||
; LIN-SSE2-NEXT: movq %xmm1, %rax
|
||||
; LIN-SSE2-NEXT: movq %rax, %rdx
|
||||
; LIN-SSE2-NEXT: shrq $32, %rdx
|
||||
; LIN-SSE2-NEXT: movq %xmm0, %rsi
|
||||
; LIN-SSE2-NEXT: movq %rsi, %rdi
|
||||
; LIN-SSE2-NEXT: shrq $32, %rdi
|
||||
; LIN-SSE2-NEXT: andl %ecx, %esi
|
||||
; LIN-SSE2-NEXT: andl %ecx, %eax
|
||||
; LIN-SSE2-NEXT: andq %rcx, %rdi
|
||||
; LIN-SSE2-NEXT: movd %xmm1, %esi
|
||||
; LIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
||||
; LIN-SSE2-NEXT: movd %xmm0, %edi
|
||||
; LIN-SSE2-NEXT: andq %rcx, %rax
|
||||
; LIN-SSE2-NEXT: andq %rcx, %rdx
|
||||
; LIN-SSE2-NEXT: movq %rdi, %xmm1
|
||||
; LIN-SSE2-NEXT: movq %rsi, %xmm0
|
||||
; LIN-SSE2-NEXT: andq %rcx, %rsi
|
||||
; LIN-SSE2-NEXT: andq %rcx, %rdi
|
||||
; LIN-SSE2-NEXT: movq %rax, %xmm0
|
||||
; LIN-SSE2-NEXT: movq %rdx, %xmm1
|
||||
; LIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; LIN-SSE2-NEXT: movq %rdx, %xmm2
|
||||
; LIN-SSE2-NEXT: movq %rax, %xmm1
|
||||
; LIN-SSE2-NEXT: movq %rdi, %xmm2
|
||||
; LIN-SSE2-NEXT: movq %rsi, %xmm1
|
||||
; LIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
|
||||
; LIN-SSE2-NEXT: retq
|
||||
;
|
||||
|
@ -150,21 +161,19 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
|
|||
; LIN-SSE4: # %bb.0:
|
||||
; LIN-SSE4-NEXT: movdqa (%rsi), %xmm0
|
||||
; LIN-SSE4-NEXT: pand (%rdx), %xmm0
|
||||
; LIN-SSE4-NEXT: pextrq $1, %xmm0, %rax
|
||||
; LIN-SSE4-NEXT: movq %rax, %rdx
|
||||
; LIN-SSE4-NEXT: shrq $32, %rdx
|
||||
; LIN-SSE4-NEXT: movq %xmm0, %rsi
|
||||
; LIN-SSE4-NEXT: movq %rsi, %rdi
|
||||
; LIN-SSE4-NEXT: shrq $32, %rdi
|
||||
; LIN-SSE4-NEXT: andl %ecx, %esi
|
||||
; LIN-SSE4-NEXT: andl %ecx, %eax
|
||||
; LIN-SSE4-NEXT: andq %rcx, %rdi
|
||||
; LIN-SSE4-NEXT: movd %xmm0, %eax
|
||||
; LIN-SSE4-NEXT: pextrd $1, %xmm0, %edx
|
||||
; LIN-SSE4-NEXT: pextrd $2, %xmm0, %esi
|
||||
; LIN-SSE4-NEXT: pextrd $3, %xmm0, %edi
|
||||
; LIN-SSE4-NEXT: andq %rcx, %rax
|
||||
; LIN-SSE4-NEXT: andq %rcx, %rdx
|
||||
; LIN-SSE4-NEXT: movq %rdi, %xmm1
|
||||
; LIN-SSE4-NEXT: movq %rsi, %xmm0
|
||||
; LIN-SSE4-NEXT: andq %rcx, %rsi
|
||||
; LIN-SSE4-NEXT: andq %rcx, %rdi
|
||||
; LIN-SSE4-NEXT: movq %rdx, %xmm1
|
||||
; LIN-SSE4-NEXT: movq %rax, %xmm0
|
||||
; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; LIN-SSE4-NEXT: movq %rdx, %xmm2
|
||||
; LIN-SSE4-NEXT: movq %rax, %xmm1
|
||||
; LIN-SSE4-NEXT: movq %rdi, %xmm2
|
||||
; LIN-SSE4-NEXT: movq %rsi, %xmm1
|
||||
; LIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
|
||||
; LIN-SSE4-NEXT: retq
|
||||
;
|
||||
|
@ -172,21 +181,21 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
|
|||
; WIN-SSE2: # %bb.0:
|
||||
; WIN-SSE2-NEXT: movdqa (%rdx), %xmm0
|
||||
; WIN-SSE2-NEXT: pand (%r8), %xmm0
|
||||
; WIN-SSE2-NEXT: movd %xmm0, %eax
|
||||
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
|
||||
; WIN-SSE2-NEXT: movd %xmm1, %ecx
|
||||
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
|
||||
; WIN-SSE2-NEXT: movq %xmm1, %r8
|
||||
; WIN-SSE2-NEXT: movq %r8, %rcx
|
||||
; WIN-SSE2-NEXT: shrq $32, %rcx
|
||||
; WIN-SSE2-NEXT: movq %xmm0, %rax
|
||||
; WIN-SSE2-NEXT: movq %rax, %rdx
|
||||
; WIN-SSE2-NEXT: shrq $32, %rdx
|
||||
; WIN-SSE2-NEXT: andl %r9d, %eax
|
||||
; WIN-SSE2-NEXT: andl %r9d, %r8d
|
||||
; WIN-SSE2-NEXT: andq %r9, %rdx
|
||||
; WIN-SSE2-NEXT: movd %xmm1, %r8d
|
||||
; WIN-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
|
||||
; WIN-SSE2-NEXT: movd %xmm0, %edx
|
||||
; WIN-SSE2-NEXT: andq %r9, %rax
|
||||
; WIN-SSE2-NEXT: andq %r9, %rcx
|
||||
; WIN-SSE2-NEXT: movq %rdx, %xmm1
|
||||
; WIN-SSE2-NEXT: andq %r9, %r8
|
||||
; WIN-SSE2-NEXT: andq %r9, %rdx
|
||||
; WIN-SSE2-NEXT: movq %rax, %xmm0
|
||||
; WIN-SSE2-NEXT: movq %rcx, %xmm1
|
||||
; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; WIN-SSE2-NEXT: movq %rcx, %xmm2
|
||||
; WIN-SSE2-NEXT: movq %rdx, %xmm2
|
||||
; WIN-SSE2-NEXT: movq %r8, %xmm1
|
||||
; WIN-SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
|
||||
; WIN-SSE2-NEXT: retq
|
||||
|
@ -195,53 +204,47 @@ define <4 x i64> @old(double* %p, <4 x i32>* %i, <4 x i32>* %h, i64 %f) nounwind
|
|||
; WIN-SSE4: # %bb.0:
|
||||
; WIN-SSE4-NEXT: movdqa (%rdx), %xmm0
|
||||
; WIN-SSE4-NEXT: pand (%r8), %xmm0
|
||||
; WIN-SSE4-NEXT: pextrq $1, %xmm0, %r8
|
||||
; WIN-SSE4-NEXT: movq %r8, %rcx
|
||||
; WIN-SSE4-NEXT: shrq $32, %rcx
|
||||
; WIN-SSE4-NEXT: movq %xmm0, %rax
|
||||
; WIN-SSE4-NEXT: movq %rax, %rdx
|
||||
; WIN-SSE4-NEXT: shrq $32, %rdx
|
||||
; WIN-SSE4-NEXT: andl %r9d, %eax
|
||||
; WIN-SSE4-NEXT: andl %r9d, %r8d
|
||||
; WIN-SSE4-NEXT: andq %r9, %rdx
|
||||
; WIN-SSE4-NEXT: movd %xmm0, %eax
|
||||
; WIN-SSE4-NEXT: pextrd $1, %xmm0, %ecx
|
||||
; WIN-SSE4-NEXT: pextrd $2, %xmm0, %r8d
|
||||
; WIN-SSE4-NEXT: pextrd $3, %xmm0, %edx
|
||||
; WIN-SSE4-NEXT: andq %r9, %rax
|
||||
; WIN-SSE4-NEXT: andq %r9, %rcx
|
||||
; WIN-SSE4-NEXT: movq %rdx, %xmm1
|
||||
; WIN-SSE4-NEXT: andq %r9, %r8
|
||||
; WIN-SSE4-NEXT: andq %r9, %rdx
|
||||
; WIN-SSE4-NEXT: movq %rcx, %xmm1
|
||||
; WIN-SSE4-NEXT: movq %rax, %xmm0
|
||||
; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; WIN-SSE4-NEXT: movq %rcx, %xmm2
|
||||
; WIN-SSE4-NEXT: movq %rdx, %xmm2
|
||||
; WIN-SSE4-NEXT: movq %r8, %xmm1
|
||||
; WIN-SSE4-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
|
||||
; WIN-SSE4-NEXT: retq
|
||||
;
|
||||
; LIN32-LABEL: old:
|
||||
; LIN32: # %bb.0:
|
||||
; LIN32-NEXT: pushl %ebp
|
||||
; LIN32-NEXT: movl %esp, %ebp
|
||||
; LIN32-NEXT: pushl %edi
|
||||
; LIN32-NEXT: pushl %esi
|
||||
; LIN32-NEXT: andl $-16, %esp
|
||||
; LIN32-NEXT: subl $32, %esp
|
||||
; LIN32-NEXT: movl 20(%ebp), %eax
|
||||
; LIN32-NEXT: movl 16(%ebp), %ecx
|
||||
; LIN32-NEXT: movl 12(%ebp), %edx
|
||||
; LIN32-NEXT: movaps (%edx), %xmm0
|
||||
; LIN32-NEXT: andps (%ecx), %xmm0
|
||||
; LIN32-NEXT: movaps %xmm0, (%esp)
|
||||
; LIN32-NEXT: movl (%esp), %ecx
|
||||
; LIN32-NEXT: andl %eax, %ecx
|
||||
; LIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
|
||||
; LIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
|
||||
; LIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
|
||||
; LIN32-NEXT: movdqa (%edx), %xmm0
|
||||
; LIN32-NEXT: pand (%ecx), %xmm0
|
||||
; LIN32-NEXT: movd %xmm0, %ecx
|
||||
; LIN32-NEXT: pextrd $1, %xmm0, %edx
|
||||
; LIN32-NEXT: pextrd $2, %xmm0, %esi
|
||||
; LIN32-NEXT: pextrd $3, %xmm0, %edi
|
||||
; LIN32-NEXT: andl %eax, %ecx
|
||||
; LIN32-NEXT: andl %eax, %edx
|
||||
; LIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
|
||||
; LIN32-NEXT: andl %eax, %esi
|
||||
; LIN32-NEXT: andl {{[0-9]+}}(%esp), %eax
|
||||
; LIN32-NEXT: andl %eax, %edi
|
||||
; LIN32-NEXT: movd %edx, %xmm1
|
||||
; LIN32-NEXT: movd %ecx, %xmm0
|
||||
; LIN32-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; LIN32-NEXT: movd %eax, %xmm2
|
||||
; LIN32-NEXT: movd %edi, %xmm2
|
||||
; LIN32-NEXT: movd %esi, %xmm1
|
||||
; LIN32-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
|
||||
; LIN32-NEXT: leal -4(%ebp), %esp
|
||||
; LIN32-NEXT: popl %esi
|
||||
; LIN32-NEXT: popl %ebp
|
||||
; LIN32-NEXT: popl %edi
|
||||
; LIN32-NEXT: retl
|
||||
%a = load <4 x i32>, <4 x i32>* %i
|
||||
%b = load <4 x i32>, <4 x i32>* %h
|
||||
|
|
|
@ -153,109 +153,51 @@ define <4 x i32> @_mul4xi32b(<4 x i32>, <4 x i32>) {
|
|||
define <4 x i64> @_mul4xi32toi64a(<4 x i32>, <4 x i32>) {
|
||||
; SSE2-LABEL: _mul4xi32toi64a:
|
||||
; SSE2: # %bb.0:
|
||||
; SSE2-NEXT: movq %xmm1, %rax
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,0,1]
|
||||
; SSE2-NEXT: movq %xmm1, %rcx
|
||||
; SSE2-NEXT: movd %ecx, %xmm1
|
||||
; SSE2-NEXT: shrq $32, %rcx
|
||||
; SSE2-NEXT: movq %xmm0, %rdx
|
||||
; SSE2-NEXT: movd %edx, %xmm2
|
||||
; SSE2-NEXT: shrq $32, %rdx
|
||||
; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
|
||||
; SSE2-NEXT: movq %xmm0, %rsi
|
||||
; SSE2-NEXT: movd %esi, %xmm3
|
||||
; SSE2-NEXT: shrq $32, %rsi
|
||||
; SSE2-NEXT: movd %esi, %xmm0
|
||||
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
|
||||
; SSE2-NEXT: movd %edx, %xmm0
|
||||
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
|
||||
; SSE2-NEXT: movd %ecx, %xmm0
|
||||
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
|
||||
; SSE2-NEXT: movd %eax, %xmm0
|
||||
; SSE2-NEXT: shrq $32, %rax
|
||||
; SSE2-NEXT: pmuludq %xmm3, %xmm1
|
||||
; SSE2-NEXT: movd %eax, %xmm3
|
||||
; SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
|
||||
; SSE2-NEXT: pmuludq %xmm2, %xmm0
|
||||
; SSE2-NEXT: pxor %xmm3, %xmm3
|
||||
; SSE2-NEXT: movdqa %xmm0, %xmm4
|
||||
; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3]
|
||||
; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
|
||||
; SSE2-NEXT: movdqa %xmm1, %xmm2
|
||||
; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3]
|
||||
; SSE2-NEXT: pmuludq %xmm4, %xmm2
|
||||
; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
|
||||
; SSE2-NEXT: pmuludq %xmm1, %xmm0
|
||||
; SSE2-NEXT: movdqa %xmm2, %xmm1
|
||||
; SSE2-NEXT: retq
|
||||
;
|
||||
; SSE42-LABEL: _mul4xi32toi64a:
|
||||
; SSE42: # %bb.0:
|
||||
; SSE42-NEXT: movq %xmm1, %rax
|
||||
; SSE42-NEXT: pextrq $1, %xmm1, %rcx
|
||||
; SSE42-NEXT: movd %ecx, %xmm1
|
||||
; SSE42-NEXT: shrq $32, %rcx
|
||||
; SSE42-NEXT: movq %xmm0, %rdx
|
||||
; SSE42-NEXT: movd %edx, %xmm2
|
||||
; SSE42-NEXT: shrq $32, %rdx
|
||||
; SSE42-NEXT: pextrq $1, %xmm0, %rsi
|
||||
; SSE42-NEXT: movd %esi, %xmm3
|
||||
; SSE42-NEXT: shrq $32, %rsi
|
||||
; SSE42-NEXT: movd %esi, %xmm0
|
||||
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
|
||||
; SSE42-NEXT: movd %edx, %xmm0
|
||||
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
|
||||
; SSE42-NEXT: movd %ecx, %xmm0
|
||||
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0]
|
||||
; SSE42-NEXT: movd %eax, %xmm0
|
||||
; SSE42-NEXT: shrq $32, %rax
|
||||
; SSE42-NEXT: pmuludq %xmm3, %xmm1
|
||||
; SSE42-NEXT: movd %eax, %xmm3
|
||||
; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
|
||||
; SSE42-NEXT: pmuludq %xmm2, %xmm0
|
||||
; SSE42-NEXT: pxor %xmm3, %xmm3
|
||||
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
|
||||
; SSE42-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
|
||||
; SSE42-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
|
||||
; SSE42-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
|
||||
; SSE42-NEXT: pmuludq %xmm0, %xmm1
|
||||
; SSE42-NEXT: pmuludq %xmm4, %xmm2
|
||||
; SSE42-NEXT: movdqa %xmm2, %xmm0
|
||||
; SSE42-NEXT: retq
|
||||
;
|
||||
; AVX1-LABEL: _mul4xi32toi64a:
|
||||
; AVX1: # %bb.0:
|
||||
; AVX1-NEXT: vmovq %xmm0, %rax
|
||||
; AVX1-NEXT: vmovd %eax, %xmm2
|
||||
; AVX1-NEXT: shrq $32, %rax
|
||||
; AVX1-NEXT: vmovq %xmm1, %rcx
|
||||
; AVX1-NEXT: vpextrq $1, %xmm0, %rdx
|
||||
; AVX1-NEXT: vmovd %edx, %xmm0
|
||||
; AVX1-NEXT: shrq $32, %rdx
|
||||
; AVX1-NEXT: vpextrq $1, %xmm1, %rsi
|
||||
; AVX1-NEXT: vmovd %esi, %xmm1
|
||||
; AVX1-NEXT: shrq $32, %rsi
|
||||
; AVX1-NEXT: vmovd %esi, %xmm3
|
||||
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
|
||||
; AVX1-NEXT: vmovd %edx, %xmm3
|
||||
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
|
||||
; AVX1-NEXT: vmovd %ecx, %xmm3
|
||||
; AVX1-NEXT: shrq $32, %rcx
|
||||
; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
|
||||
; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
||||
; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
|
||||
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
|
||||
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
|
||||
; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
|
||||
; AVX1-NEXT: vmovd %ecx, %xmm1
|
||||
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
|
||||
; AVX1-NEXT: vmovd %eax, %xmm3
|
||||
; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
|
||||
; AVX1-NEXT: vpmuludq %xmm1, %xmm2, %xmm1
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
; AVX2-LABEL: _mul4xi32toi64a:
|
||||
; AVX2: # %bb.0:
|
||||
; AVX2-NEXT: vmovq %xmm1, %rax
|
||||
; AVX2-NEXT: vmovd %eax, %xmm2
|
||||
; AVX2-NEXT: shrq $32, %rax
|
||||
; AVX2-NEXT: vpextrq $1, %xmm1, %rcx
|
||||
; AVX2-NEXT: vmovq %xmm0, %rdx
|
||||
; AVX2-NEXT: vmovd %edx, %xmm1
|
||||
; AVX2-NEXT: shrq $32, %rdx
|
||||
; AVX2-NEXT: vpextrq $1, %xmm0, %rsi
|
||||
; AVX2-NEXT: vmovd %esi, %xmm0
|
||||
; AVX2-NEXT: shrq $32, %rsi
|
||||
; AVX2-NEXT: vmovd %esi, %xmm3
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm3[0]
|
||||
; AVX2-NEXT: vmovd %edx, %xmm3
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
|
||||
; AVX2-NEXT: vmovd %ecx, %xmm3
|
||||
; AVX2-NEXT: shrq $32, %rcx
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX2-NEXT: vmovd %ecx, %xmm1
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
|
||||
; AVX2-NEXT: vmovd %eax, %xmm3
|
||||
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
|
||||
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
|
||||
; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm3 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
|
||||
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm0
|
||||
; AVX2-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
|
||||
; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
|
||||
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
|
||||
; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0
|
||||
; AVX2-NEXT: retq
|
||||
%f00 = extractelement <4 x i32> %0, i32 0
|
||||
|
|
|
@ -36,12 +36,14 @@ define void @FFT(%v4_varying_complex* noalias nocapture %destination, float* noa
|
|||
; X64: # %bb.0: # %begin
|
||||
; X64-NEXT: movdqu (%rdx), %xmm0
|
||||
; X64-NEXT: pslld $4, %xmm0
|
||||
; X64-NEXT: movq %xmm0, %rax
|
||||
; X64-NEXT: movd %xmm0, %eax
|
||||
; X64-NEXT: movslq %eax, %r8
|
||||
; X64-NEXT: sarq $32, %rax
|
||||
; X64-NEXT: pextrq $1, %xmm0, %rdx
|
||||
; X64-NEXT: movslq %edx, %rcx
|
||||
; X64-NEXT: sarq $32, %rdx
|
||||
; X64-NEXT: pextrd $1, %xmm0, %ecx
|
||||
; X64-NEXT: movslq %ecx, %rcx
|
||||
; X64-NEXT: pextrd $2, %xmm0, %edx
|
||||
; X64-NEXT: movslq %edx, %rdx
|
||||
; X64-NEXT: pextrd $3, %xmm0, %eax
|
||||
; X64-NEXT: cltq
|
||||
; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||
; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
||||
; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
|
||||
|
|
|
@ -12,19 +12,16 @@ define void @func(<4 x float> %vx) {
|
|||
; CHECK-NEXT: pushq %rax
|
||||
; CHECK-NEXT: .cfi_def_cfa_offset 16
|
||||
; CHECK-NEXT: pand {{.*}}(%rip), %xmm0
|
||||
; CHECK-NEXT: pextrq $1, %xmm0, %rax
|
||||
; CHECK-NEXT: movzwl %ax, %ecx
|
||||
; CHECK-NEXT: shrq $32, %rax
|
||||
; CHECK-NEXT: movq %xmm0, %rdx
|
||||
; CHECK-NEXT: movzwl %dx, %r8d
|
||||
; CHECK-NEXT: movq %rdx, %r9
|
||||
; CHECK-NEXT: shrq $32, %r9
|
||||
; CHECK-NEXT: movd %xmm0, %r8d
|
||||
; CHECK-NEXT: leaq stuff(%r8), %rdi
|
||||
; CHECK-NEXT: leaq stuff(%r9), %rsi
|
||||
; CHECK-NEXT: leaq stuff(%rcx), %rdx
|
||||
; CHECK-NEXT: leaq stuff(%rax), %rcx
|
||||
; CHECK-NEXT: pextrd $1, %xmm0, %eax
|
||||
; CHECK-NEXT: leaq stuff(%rax), %rsi
|
||||
; CHECK-NEXT: pextrd $2, %xmm0, %edx
|
||||
; CHECK-NEXT: pextrd $3, %xmm0, %ecx
|
||||
; CHECK-NEXT: leaq stuff(%rdx), %rdx
|
||||
; CHECK-NEXT: leaq stuff(%rcx), %rcx
|
||||
; CHECK-NEXT: leaq stuff+8(%r8), %r8
|
||||
; CHECK-NEXT: leaq stuff+8(%r9), %r9
|
||||
; CHECK-NEXT: leaq stuff+8(%rax), %r9
|
||||
; CHECK-NEXT: callq toto
|
||||
; CHECK-NEXT: popq %rax
|
||||
; CHECK-NEXT: retq
|
||||
|
|
|
@ -37,44 +37,42 @@ define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
|
|||
define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
|
||||
; SSSE3-LABEL: var_shuffle_v4i32:
|
||||
; SSSE3: # %bb.0:
|
||||
; SSSE3-NEXT: movd %xmm1, %eax
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
|
||||
; SSSE3-NEXT: movd %xmm2, %ecx
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
|
||||
; SSSE3-NEXT: movq %xmm2, %rax
|
||||
; SSSE3-NEXT: movq %rax, %rcx
|
||||
; SSSE3-NEXT: sarq $32, %rcx
|
||||
; SSSE3-NEXT: movq %xmm1, %rdx
|
||||
; SSSE3-NEXT: movq %rdx, %rsi
|
||||
; SSSE3-NEXT: sarq $32, %rsi
|
||||
; SSSE3-NEXT: andl $3, %edx
|
||||
; SSSE3-NEXT: movd %xmm2, %edx
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
|
||||
; SSSE3-NEXT: movd %xmm1, %esi
|
||||
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; SSSE3-NEXT: andl $3, %esi
|
||||
; SSSE3-NEXT: andl $3, %eax
|
||||
; SSSE3-NEXT: andl $3, %ecx
|
||||
; SSSE3-NEXT: andl $3, %edx
|
||||
; SSSE3-NEXT: andl $3, %esi
|
||||
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
||||
; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
||||
; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
||||
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||
; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
|
||||
; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
|
||||
; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
||||
; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: var_shuffle_v4i32:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpextrq $1, %xmm1, %rax
|
||||
; AVX-NEXT: movq %rax, %rcx
|
||||
; AVX-NEXT: sarq $32, %rcx
|
||||
; AVX-NEXT: vmovq %xmm1, %rdx
|
||||
; AVX-NEXT: movq %rdx, %rsi
|
||||
; AVX-NEXT: sarq $32, %rsi
|
||||
; AVX-NEXT: andl $3, %edx
|
||||
; AVX-NEXT: vmovd %xmm1, %eax
|
||||
; AVX-NEXT: vpextrd $1, %xmm1, %ecx
|
||||
; AVX-NEXT: vpextrd $2, %xmm1, %edx
|
||||
; AVX-NEXT: vpextrd $3, %xmm1, %esi
|
||||
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; AVX-NEXT: andl $3, %esi
|
||||
; AVX-NEXT: andl $3, %eax
|
||||
; AVX-NEXT: andl $3, %ecx
|
||||
; AVX-NEXT: andl $3, %edx
|
||||
; AVX-NEXT: andl $3, %esi
|
||||
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||
; AVX-NEXT: vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0
|
||||
; AVX-NEXT: retq
|
||||
%index0 = extractelement <4 x i32> %indices, i32 0
|
||||
%index1 = extractelement <4 x i32> %indices, i32 1
|
||||
|
@ -287,40 +285,38 @@ define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) noun
|
|||
define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
|
||||
; SSSE3-LABEL: var_shuffle_v4f32:
|
||||
; SSSE3: # %bb.0:
|
||||
; SSSE3-NEXT: movd %xmm1, %eax
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,2,3]
|
||||
; SSSE3-NEXT: movd %xmm2, %ecx
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
|
||||
; SSSE3-NEXT: movq %xmm2, %rax
|
||||
; SSSE3-NEXT: movq %rax, %rcx
|
||||
; SSSE3-NEXT: sarq $32, %rcx
|
||||
; SSSE3-NEXT: movq %xmm1, %rdx
|
||||
; SSSE3-NEXT: movq %rdx, %rsi
|
||||
; SSSE3-NEXT: sarq $32, %rsi
|
||||
; SSSE3-NEXT: andl $3, %edx
|
||||
; SSSE3-NEXT: movd %xmm2, %edx
|
||||
; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
|
||||
; SSSE3-NEXT: movd %xmm1, %esi
|
||||
; SSSE3-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; SSSE3-NEXT: andl $3, %esi
|
||||
; SSSE3-NEXT: andl $3, %eax
|
||||
; SSSE3-NEXT: andl $3, %ecx
|
||||
; SSSE3-NEXT: andl $3, %edx
|
||||
; SSSE3-NEXT: andl $3, %esi
|
||||
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
||||
; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
|
||||
; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
||||
; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
|
||||
; SSSE3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||
; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
|
||||
; SSSE3-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
|
||||
; SSSE3-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
|
||||
; SSSE3-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
|
||||
; SSSE3-NEXT: retq
|
||||
;
|
||||
; AVX-LABEL: var_shuffle_v4f32:
|
||||
; AVX: # %bb.0:
|
||||
; AVX-NEXT: vpextrq $1, %xmm1, %rax
|
||||
; AVX-NEXT: movq %rax, %rcx
|
||||
; AVX-NEXT: sarq $32, %rcx
|
||||
; AVX-NEXT: vmovq %xmm1, %rdx
|
||||
; AVX-NEXT: movq %rdx, %rsi
|
||||
; AVX-NEXT: sarq $32, %rsi
|
||||
; AVX-NEXT: andl $3, %edx
|
||||
; AVX-NEXT: vmovd %xmm1, %eax
|
||||
; AVX-NEXT: vpextrd $1, %xmm1, %ecx
|
||||
; AVX-NEXT: vpextrd $2, %xmm1, %edx
|
||||
; AVX-NEXT: vpextrd $3, %xmm1, %esi
|
||||
; AVX-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; AVX-NEXT: andl $3, %esi
|
||||
; AVX-NEXT: andl $3, %eax
|
||||
; AVX-NEXT: andl $3, %ecx
|
||||
; AVX-NEXT: andl $3, %edx
|
||||
; AVX-NEXT: andl $3, %esi
|
||||
; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
|
||||
; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
|
||||
|
|
|
@ -119,36 +119,32 @@ define <8 x i32> @var_shuffle_v8i32(<8 x i32> %v, <8 x i32> %indices) nounwind {
|
|||
; AVX1-NEXT: movq %rsp, %rbp
|
||||
; AVX1-NEXT: andq $-32, %rsp
|
||||
; AVX1-NEXT: subq $64, %rsp
|
||||
; AVX1-NEXT: vpextrq $1, %xmm1, %r8
|
||||
; AVX1-NEXT: movq %r8, %rcx
|
||||
; AVX1-NEXT: shrq $30, %rcx
|
||||
; AVX1-NEXT: vmovq %xmm1, %r9
|
||||
; AVX1-NEXT: movq %r9, %rsi
|
||||
; AVX1-NEXT: shrq $30, %rsi
|
||||
; AVX1-NEXT: vmovd %xmm1, %r8d
|
||||
; AVX1-NEXT: vpextrd $1, %xmm1, %r9d
|
||||
; AVX1-NEXT: vpextrd $2, %xmm1, %r10d
|
||||
; AVX1-NEXT: vpextrd $3, %xmm1, %esi
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpextrq $1, %xmm1, %r10
|
||||
; AVX1-NEXT: movq %r10, %rdi
|
||||
; AVX1-NEXT: shrq $30, %rdi
|
||||
; AVX1-NEXT: vmovq %xmm1, %rax
|
||||
; AVX1-NEXT: movq %rax, %rdx
|
||||
; AVX1-NEXT: shrq $30, %rdx
|
||||
; AVX1-NEXT: vmovd %xmm1, %edi
|
||||
; AVX1-NEXT: vpextrd $1, %xmm1, %eax
|
||||
; AVX1-NEXT: vpextrd $2, %xmm1, %ecx
|
||||
; AVX1-NEXT: vpextrd $3, %xmm1, %edx
|
||||
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
|
||||
; AVX1-NEXT: andl $7, %r9d
|
||||
; AVX1-NEXT: andl $28, %esi
|
||||
; AVX1-NEXT: andl $7, %r8d
|
||||
; AVX1-NEXT: andl $28, %ecx
|
||||
; AVX1-NEXT: andl $7, %eax
|
||||
; AVX1-NEXT: andl $28, %edx
|
||||
; AVX1-NEXT: andl $7, %r9d
|
||||
; AVX1-NEXT: andl $7, %r10d
|
||||
; AVX1-NEXT: andl $28, %edi
|
||||
; AVX1-NEXT: andl $7, %esi
|
||||
; AVX1-NEXT: andl $7, %edi
|
||||
; AVX1-NEXT: andl $7, %eax
|
||||
; AVX1-NEXT: andl $7, %ecx
|
||||
; AVX1-NEXT: andl $7, %edx
|
||||
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||
; AVX1-NEXT: vpinsrd $1, (%rsp,%rdx), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpinsrd $3, (%rsp,%rdi), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpinsrd $1, (%rsp,%rax,4), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpinsrd $2, (%rsp,%rcx,4), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpinsrd $3, (%rsp,%rdx,4), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
||||
; AVX1-NEXT: vpinsrd $1, (%rsp,%rsi), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpinsrd $2, (%rsp,%r8,4), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpinsrd $3, (%rsp,%rcx), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpinsrd $1, (%rsp,%r9,4), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpinsrd $2, (%rsp,%r10,4), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: movq %rbp, %rsp
|
||||
; AVX1-NEXT: popq %rbp
|
||||
|
@ -1212,28 +1208,24 @@ define <8 x float> @var_shuffle_v8f32(<8 x float> %v, <8 x i32> %indices) nounwi
|
|||
; AVX1-NEXT: movq %rsp, %rbp
|
||||
; AVX1-NEXT: andq $-32, %rsp
|
||||
; AVX1-NEXT: subq $64, %rsp
|
||||
; AVX1-NEXT: vpextrq $1, %xmm1, %r8
|
||||
; AVX1-NEXT: movq %r8, %rcx
|
||||
; AVX1-NEXT: shrq $30, %rcx
|
||||
; AVX1-NEXT: vmovq %xmm1, %r9
|
||||
; AVX1-NEXT: movq %r9, %rdx
|
||||
; AVX1-NEXT: shrq $30, %rdx
|
||||
; AVX1-NEXT: vmovd %xmm1, %esi
|
||||
; AVX1-NEXT: vpextrd $1, %xmm1, %r8d
|
||||
; AVX1-NEXT: vpextrd $2, %xmm1, %r9d
|
||||
; AVX1-NEXT: vpextrd $3, %xmm1, %r10d
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
|
||||
; AVX1-NEXT: vpextrq $1, %xmm1, %r10
|
||||
; AVX1-NEXT: movq %r10, %rdi
|
||||
; AVX1-NEXT: shrq $30, %rdi
|
||||
; AVX1-NEXT: vmovq %xmm1, %rax
|
||||
; AVX1-NEXT: movq %rax, %rsi
|
||||
; AVX1-NEXT: shrq $30, %rsi
|
||||
; AVX1-NEXT: vmovd %xmm1, %edx
|
||||
; AVX1-NEXT: vpextrd $1, %xmm1, %edi
|
||||
; AVX1-NEXT: vpextrd $2, %xmm1, %eax
|
||||
; AVX1-NEXT: vpextrd $3, %xmm1, %ecx
|
||||
; AVX1-NEXT: vmovaps %ymm0, (%rsp)
|
||||
; AVX1-NEXT: andl $7, %r9d
|
||||
; AVX1-NEXT: andl $28, %edx
|
||||
; AVX1-NEXT: andl $7, %esi
|
||||
; AVX1-NEXT: andl $7, %r8d
|
||||
; AVX1-NEXT: andl $28, %ecx
|
||||
; AVX1-NEXT: andl $7, %eax
|
||||
; AVX1-NEXT: andl $28, %esi
|
||||
; AVX1-NEXT: andl $7, %r9d
|
||||
; AVX1-NEXT: andl $7, %r10d
|
||||
; AVX1-NEXT: andl $28, %edi
|
||||
; AVX1-NEXT: andl $7, %edx
|
||||
; AVX1-NEXT: andl $7, %edi
|
||||
; AVX1-NEXT: andl $7, %eax
|
||||
; AVX1-NEXT: andl $7, %ecx
|
||||
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
|
||||
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
|
||||
|
@ -1375,36 +1367,32 @@ define <4 x i64> @var_shuffle_v4i64_from_v2i64(<2 x i64> %v, <4 x i64> %indices)
|
|||
define <8 x i32> @var_shuffle_v8i32_from_v4i32(<4 x i32> %v, <8 x i32> %indices) unnamed_addr nounwind {
|
||||
; AVX1-LABEL: var_shuffle_v8i32_from_v4i32:
|
||||
; AVX1: # %bb.0: # %entry
|
||||
; AVX1-NEXT: vpextrq $1, %xmm1, %r8
|
||||
; AVX1-NEXT: movq %r8, %r10
|
||||
; AVX1-NEXT: shrq $30, %r10
|
||||
; AVX1-NEXT: vmovq %xmm1, %r9
|
||||
; AVX1-NEXT: movq %r9, %rsi
|
||||
; AVX1-NEXT: shrq $30, %rsi
|
||||
; AVX1-NEXT: vmovd %xmm1, %r8d
|
||||
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; AVX1-NEXT: andl $3, %r9d
|
||||
; AVX1-NEXT: andl $12, %esi
|
||||
; AVX1-NEXT: andl $3, %r8d
|
||||
; AVX1-NEXT: andl $12, %r10d
|
||||
; AVX1-NEXT: vpextrd $1, %xmm1, %r9d
|
||||
; AVX1-NEXT: andl $3, %r9d
|
||||
; AVX1-NEXT: vpextrd $2, %xmm1, %r10d
|
||||
; AVX1-NEXT: andl $3, %r10d
|
||||
; AVX1-NEXT: vpextrd $3, %xmm1, %esi
|
||||
; AVX1-NEXT: andl $3, %esi
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
|
||||
; AVX1-NEXT: movq %rax, %rdi
|
||||
; AVX1-NEXT: shrq $30, %rdi
|
||||
; AVX1-NEXT: vmovq %xmm0, %rcx
|
||||
; AVX1-NEXT: movq %rcx, %rdx
|
||||
; AVX1-NEXT: shrq $30, %rdx
|
||||
; AVX1-NEXT: andl $3, %ecx
|
||||
; AVX1-NEXT: andl $12, %edx
|
||||
; AVX1-NEXT: vmovd %xmm0, %edi
|
||||
; AVX1-NEXT: andl $3, %edi
|
||||
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
|
||||
; AVX1-NEXT: andl $3, %eax
|
||||
; AVX1-NEXT: andl $12, %edi
|
||||
; AVX1-NEXT: vpextrd $2, %xmm0, %ecx
|
||||
; AVX1-NEXT: andl $3, %ecx
|
||||
; AVX1-NEXT: vpextrd $3, %xmm0, %edx
|
||||
; AVX1-NEXT: andl $3, %edx
|
||||
; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||
; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rdx), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rax,4), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rdi), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rax,4), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpinsrd $2, -24(%rsp,%rcx,4), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rdx,4), %xmm0, %xmm0
|
||||
; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
|
||||
; AVX1-NEXT: vpinsrd $1, -24(%rsp,%rsi), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpinsrd $2, -24(%rsp,%r8,4), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpinsrd $3, -24(%rsp,%r10), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpinsrd $1, -24(%rsp,%r9,4), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpinsrd $2, -24(%rsp,%r10,4), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vpinsrd $3, -24(%rsp,%rsi,4), %xmm1, %xmm1
|
||||
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
|
||||
; AVX1-NEXT: retq
|
||||
;
|
||||
|
@ -2402,28 +2390,24 @@ define <4 x double> @var_shuffle_v4f64_from_v2f64(<2 x double> %v, <4 x i64> %in
|
|||
define <8 x float> @var_shuffle_v8f32_from_v4f32(<4 x float> %v, <8 x i32> %indices) unnamed_addr nounwind {
|
||||
; AVX1-LABEL: var_shuffle_v8f32_from_v4f32:
|
||||
; AVX1: # %bb.0: # %entry
|
||||
; AVX1-NEXT: vpextrq $1, %xmm1, %r8
|
||||
; AVX1-NEXT: movq %r8, %r10
|
||||
; AVX1-NEXT: shrq $30, %r10
|
||||
; AVX1-NEXT: vmovq %xmm1, %r9
|
||||
; AVX1-NEXT: movq %r9, %rdx
|
||||
; AVX1-NEXT: shrq $30, %rdx
|
||||
; AVX1-NEXT: vmovd %xmm1, %r8d
|
||||
; AVX1-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp)
|
||||
; AVX1-NEXT: andl $3, %r9d
|
||||
; AVX1-NEXT: andl $12, %edx
|
||||
; AVX1-NEXT: andl $3, %r8d
|
||||
; AVX1-NEXT: andl $12, %r10d
|
||||
; AVX1-NEXT: vpextrd $1, %xmm1, %r9d
|
||||
; AVX1-NEXT: andl $3, %r9d
|
||||
; AVX1-NEXT: vpextrd $2, %xmm1, %r10d
|
||||
; AVX1-NEXT: andl $3, %r10d
|
||||
; AVX1-NEXT: vpextrd $3, %xmm1, %esi
|
||||
; AVX1-NEXT: andl $3, %esi
|
||||
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm0
|
||||
; AVX1-NEXT: vpextrq $1, %xmm0, %rax
|
||||
; AVX1-NEXT: movq %rax, %rdi
|
||||
; AVX1-NEXT: shrq $30, %rdi
|
||||
; AVX1-NEXT: vmovq %xmm0, %rcx
|
||||
; AVX1-NEXT: movq %rcx, %rsi
|
||||
; AVX1-NEXT: shrq $30, %rsi
|
||||
; AVX1-NEXT: andl $3, %ecx
|
||||
; AVX1-NEXT: andl $12, %esi
|
||||
; AVX1-NEXT: vmovd %xmm0, %edi
|
||||
; AVX1-NEXT: andl $3, %edi
|
||||
; AVX1-NEXT: vpextrd $1, %xmm0, %eax
|
||||
; AVX1-NEXT: andl $3, %eax
|
||||
; AVX1-NEXT: andl $12, %edi
|
||||
; AVX1-NEXT: vpextrd $2, %xmm0, %ecx
|
||||
; AVX1-NEXT: andl $3, %ecx
|
||||
; AVX1-NEXT: vpextrd $3, %xmm0, %edx
|
||||
; AVX1-NEXT: andl $3, %edx
|
||||
; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
|
||||
; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
|
||||
|
@ -2475,19 +2459,17 @@ define <4 x i32> @var_shuffle_v4i32_from_v8i32(<8 x i32> %v, <4 x i32> %indices)
|
|||
; AVX-NEXT: movq %rsp, %rbp
|
||||
; AVX-NEXT: andq $-32, %rsp
|
||||
; AVX-NEXT: subq $64, %rsp
|
||||
; AVX-NEXT: vmovq %xmm1, %rax
|
||||
; AVX-NEXT: movq %rax, %rcx
|
||||
; AVX-NEXT: shrq $30, %rcx
|
||||
; AVX-NEXT: andl $28, %ecx
|
||||
; AVX-NEXT: vpextrq $1, %xmm1, %rdx
|
||||
; AVX-NEXT: movq %rdx, %rsi
|
||||
; AVX-NEXT: sarq $32, %rsi
|
||||
; AVX-NEXT: andl $7, %eax
|
||||
; AVX-NEXT: andl $7, %edx
|
||||
; AVX-NEXT: vmovd %xmm1, %eax
|
||||
; AVX-NEXT: vmovaps %ymm0, (%rsp)
|
||||
; AVX-NEXT: andl $7, %eax
|
||||
; AVX-NEXT: vpextrd $1, %xmm1, %ecx
|
||||
; AVX-NEXT: andl $7, %ecx
|
||||
; AVX-NEXT: vpextrd $2, %xmm1, %edx
|
||||
; AVX-NEXT: andl $7, %edx
|
||||
; AVX-NEXT: vpextrd $3, %xmm1, %esi
|
||||
; AVX-NEXT: andl $7, %esi
|
||||
; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
|
||||
; AVX-NEXT: vpinsrd $1, (%rsp,%rcx), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpinsrd $1, (%rsp,%rcx,4), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpinsrd $2, (%rsp,%rdx,4), %xmm0, %xmm0
|
||||
; AVX-NEXT: vpinsrd $3, (%rsp,%rsi,4), %xmm0, %xmm0
|
||||
; AVX-NEXT: movq %rbp, %rsp
|
||||
|
|
Loading…
Reference in New Issue