[DAGCombine] visitEXTRACT_SUBVECTOR - 'little to big' extract_subvector(bitcast()) support
This moves the X86 specific transform from rL364407 into DAGCombiner to generically handle 'little to big' cases (for example: extract_subvector(v2i64 bitcast(v16i8))). This allows us to remove both the x86 implementation and the aarch64 bitcast(extract_subvector(bitcast())) combine.

Earlier patches that dealt with regressions initially exposed by this patch:
rG5e5e99c041e4
rG0b38af89e2c0

Patch by: @RKSimon (Simon Pilgrim)

Differential Revision: https://reviews.llvm.org/D63815
parent 0860db966a
commit 8cefc37be5
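The transform itself is simple index arithmetic: an extract of narrow elements taken out of a bitcast can instead extract the covering run of the original wide elements and bitcast the narrower result. Below is a minimal standalone C++ sketch of that arithmetic on assumed example types; ExtractOfBitcast and scaleExtract are hypothetical names used for illustration only, not part of the patch or of the LLVM API.

#include <cassert>

// Illustration of the rewrite on assumed types:
//   (v8i8 (extract_subvector (v16i8 (bitcast X:v2i64)), 8))
//     --> (v8i8 (bitcast (v1i64 (extract_subvector X, 1))))
struct ExtractOfBitcast {
  unsigned SrcNumElts;  // elements of X before the bitcast (2 for v2i64)
  unsigned DestNumElts; // elements after the bitcast (16 for v16i8)
  unsigned ExtNumElts;  // elements produced by the extract (8 for v8i8)
  unsigned Index;       // constant extraction index (8)
};

// Mirrors the guards in the DAGCombiner change below: the bitcast must
// multiply the element count by a whole ratio, and both the extracted
// length and the start index must fall on wide-element boundaries.
bool scaleExtract(const ExtractOfBitcast &E, unsigned &NewNumElts,
                  unsigned &NewIndex) {
  if (E.SrcNumElts == 0 || (E.DestNumElts % E.SrcNumElts) != 0)
    return false;
  unsigned Ratio = E.DestNumElts / E.SrcNumElts; // 16 / 2 = 8 i8s per i64
  if ((E.ExtNumElts % Ratio) != 0 || (E.Index % Ratio) != 0)
    return false;
  NewNumElts = E.ExtNumElts / Ratio; // the v8i8 extract becomes v1i64
  NewIndex = E.Index / Ratio;        // i8 index 8 becomes i64 index 1
  return true;
}

int main() {
  unsigned NumElts = 0, Idx = 0;
  assert(scaleExtract({2, 16, 8, 8}, NumElts, Idx) && NumElts == 1 && Idx == 1);
  // A misaligned index (v8i8 at index 4 would start mid-i64) is rejected;
  // the real code additionally requires the narrower extract to be legal.
  assert(!scaleExtract({2, 16, 8, 4}, NumElts, Idx));
  return 0;
}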
@@ -18515,7 +18515,23 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
         return DAG.getBitcast(NVT, NewExtract);
       }
     }
-    // TODO - handle (DestNumElts % SrcNumElts) == 0
+    if ((DestNumElts % SrcNumElts) == 0) {
+      unsigned DestSrcRatio = DestNumElts / SrcNumElts;
+      if ((NVT.getVectorNumElements() % DestSrcRatio) == 0) {
+        unsigned NewExtNumElts = NVT.getVectorNumElements() / DestSrcRatio;
+        EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
+                                        SrcVT.getScalarType(), NewExtNumElts);
+        if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
+            TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
+          unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
+          SDLoc DL(N);
+          SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
+          SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
+                                           V.getOperand(0), NewIndex);
+          return DAG.getBitcast(NVT, NewExtract);
+        }
+      }
+    }
   }
 
   // Combine:
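Traced with one assumed set of concrete types (X : v2i64 bitcast to v16i8, extracting v8i8 at constant index 8), the new block above computes the following. The snippet compiles as plain C++ and the names mirror the variables in the patch; the values are illustrative assumptions, not taken from a real compilation.

// Assumed example values, for illustration only.
constexpr unsigned SrcNumElts = 2;   // SrcVT = v2i64 (X before the bitcast)
constexpr unsigned DestNumElts = 16; // V = (v16i8 bitcast X)
constexpr unsigned NVTNumElts = 8;   // NVT = v8i8 (type produced by the extract)
constexpr unsigned Index = 8;        // N->getConstantOperandVal(1)
static_assert((DestNumElts % SrcNumElts) == 0, "bitcast splits wide elements evenly");
constexpr unsigned DestSrcRatio = DestNumElts / SrcNumElts;   // 8 i8s per i64
static_assert((NVTNumElts % DestSrcRatio) == 0, "extract covers whole i64s");
static_assert((Index % DestSrcRatio) == 0, "extract starts on an i64 boundary");
constexpr unsigned NewExtNumElts = NVTNumElts / DestSrcRatio; // 1 -> NewExtVT = v1i64
constexpr unsigned IndexValScaled = Index / DestSrcRatio;     // 1
// Result: bitcast<v8i8>(extract_subvector<v1i64>(X, 1)), provided the v1i64
// extract is legal or custom for the target (the TLI check above).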
@@ -618,7 +618,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::ZERO_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
-  setTargetDAGCombine(ISD::BITCAST);
   setTargetDAGCombine(ISD::CONCAT_VECTORS);
   setTargetDAGCombine(ISD::STORE);
   if (Subtarget->supportsAddressTopByteIgnored())
@@ -10185,74 +10184,6 @@ static SDValue performSRLCombine(SDNode *N,
   return SDValue();
 }
 
-static SDValue performBitcastCombine(SDNode *N,
-                                     TargetLowering::DAGCombinerInfo &DCI,
-                                     SelectionDAG &DAG) {
-  // Wait 'til after everything is legalized to try this. That way we have
-  // legal vector types and such.
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  // Remove extraneous bitcasts around an extract_subvector.
-  // For example,
-  //    (v4i16 (bitconvert
-  //             (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
-  //  becomes
-  //    (extract_subvector ((v8i16 ...), (i64 4)))
-
-  // Only interested in 64-bit vectors as the ultimate result.
-  EVT VT = N->getValueType(0);
-  if (!VT.isVector() || VT.isScalableVector())
-    return SDValue();
-  if (VT.getSimpleVT().getSizeInBits() != 64)
-    return SDValue();
-  // Is the operand an extract_subvector starting at the beginning or halfway
-  // point of the vector? A low half may also come through as an
-  // EXTRACT_SUBREG, so look for that, too.
-  SDValue Op0 = N->getOperand(0);
-  if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
-      !(Op0->isMachineOpcode() &&
-        Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
-    return SDValue();
-  uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
-  if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-    if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
-      return SDValue();
-  } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
-    if (idx != AArch64::dsub)
-      return SDValue();
-    // The dsub reference is equivalent to a lane zero subvector reference.
-    idx = 0;
-  }
-  // Look through the bitcast of the input to the extract.
-  if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
-    return SDValue();
-  SDValue Source = Op0->getOperand(0)->getOperand(0);
-  // If the source type has twice the number of elements as our destination
-  // type, we know this is an extract of the high or low half of the vector.
-  EVT SVT = Source->getValueType(0);
-  if (!SVT.isVector() ||
-      SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
-    return SDValue();
-
-  LLVM_DEBUG(
-      dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
-
-  // Create the simplified form to just extract the low or high half of the
-  // vector directly rather than bothering with the bitcasts.
-  SDLoc dl(N);
-  unsigned NumElements = VT.getVectorNumElements();
-  if (idx) {
-    SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
-  } else {
-    SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
-    return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
-                                      Source, SubReg),
-                   0);
-  }
-}
-
 static SDValue performConcatVectorsCombine(SDNode *N,
                                            TargetLowering::DAGCombinerInfo &DCI,
                                            SelectionDAG &DAG) {
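The example in the removed function's comment now falls out of the generic combines: the inner extract_subvector(bitcast) is rewritten by scaling in the opposite, already-supported direction, and the surrounding pair of bitconverts then cancels. A sketch of that reading follows (illustrative helper under assumed values; not LLVM API):

#include <cassert>

// (v4i16 (bitconvert (extract_subvector (v2i64 (bitconvert (v8i16 X))), (i64 1))))
//   inner rewrite: 8 i16 elements / 2 i64 elements = 4 i16s per i64, so the
//   v1i64 extract at index 1 becomes a v4i16 extract at index 4:
// --> (v4i16 (bitconvert (v1i64 (bitcast (extract_subvector X, (i64 4))))))
// --> (extract_subvector X, (i64 4))   ; bitconvert(bitcast(x)) folds away
unsigned scaleIndexWideToNarrow(unsigned SrcNumElts, unsigned DestNumElts,
                                unsigned Index) {
  assert(DestNumElts != 0 && (SrcNumElts % DestNumElts) == 0);
  return Index * (SrcNumElts / DestNumElts); // 1 * (8 / 2) = 4
}

int main() { return scaleIndexWideToNarrow(8, 2, 1) == 4 ? 0 : 1; }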
@@ -12453,8 +12384,6 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
     return performExtendCombine(N, DCI, DAG);
   case ISD::SIGN_EXTEND_INREG:
     return performSignExtendInRegCombine(N, DCI, DAG);
-  case ISD::BITCAST:
-    return performBitcastCombine(N, DCI, DAG);
   case ISD::CONCAT_VECTORS:
     return performConcatVectorsCombine(N, DCI, DAG);
   case ISD::SELECT:
@@ -45103,7 +45103,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
   SDValue InVec = N->getOperand(0);
   SDValue InVecBC = peekThroughBitcasts(InVec);
   EVT InVecVT = InVec.getValueType();
-  EVT InVecBCVT = InVecBC.getValueType();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
 
   if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
@@ -45147,31 +45146,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
         VT, SDLoc(N),
         InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
 
-  // Try to move vector bitcast after extract_subv by scaling extraction index:
-  //    extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
-  // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
-  if (InVec != InVecBC && InVecBCVT.isVector()) {
-    unsigned SrcNumElts = InVecBCVT.getVectorNumElements();
-    unsigned DestNumElts = InVecVT.getVectorNumElements();
-    if ((DestNumElts % SrcNumElts) == 0) {
-      unsigned DestSrcRatio = DestNumElts / SrcNumElts;
-      if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
-        unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
-        EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
-                                        InVecBCVT.getScalarType(), NewExtNumElts);
-        if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
-            TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
-          unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
-          SDLoc DL(N);
-          SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
-          SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
-                                           InVecBC, NewIndex);
-          return DAG.getBitcast(VT, NewExtract);
-        }
-      }
-    }
-  }
-
   // If we are extracting from an insert into a zero vector, replace with a
   // smaller insert into zero if we don't access less than the original
   // subvector. Don't do this for i1 vectors.
@@ -42,17 +42,10 @@ define void @blam() {
 ; the fastness of unaligned accesses was not specified correctly.
 
 define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) {
-; SPLITTING-LABEL: merge_vec_extract_stores:
-; SPLITTING:       // %bb.0:
-; SPLITTING-NEXT:    ext v1.16b, v0.16b, v0.16b, #8
-; SPLITTING-NEXT:    str d0, [x0, #24]
-; SPLITTING-NEXT:    str d1, [x0, #32]
-; SPLITTING-NEXT:    ret
-;
-; MISALIGNED-LABEL: merge_vec_extract_stores:
-; MISALIGNED:       // %bb.0:
-; MISALIGNED-NEXT:    stur q0, [x0, #24]
-; MISALIGNED-NEXT:    ret
+; CHECK-LABEL: merge_vec_extract_stores:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stur q0, [x0, #24]
+; CHECK-NEXT:    ret
   %idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
   %idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4
@@ -62,9 +55,4 @@ define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) {
   store <2 x float> %shuffle0, <2 x float>* %idx0, align 8
   store <2 x float> %shuffle1, <2 x float>* %idx1, align 8
   ret void
-
-
-; FIXME: Ideally we would like to use a generic target for this test, but this relies
-; on suppressing store pairs.
-
 }
@@ -9,8 +9,8 @@ declare <8 x i8> @llvm.arm.neon.vtbl2(<8 x i8> %shuffle.i.i307, <8 x i8> %shuffl
 ; they are defined on VPRs and used on VPRs.
 ;
 ; CHECK-LABEL: motivatingExample:
-; CHECK: vldr [[ARG2_VAL:d[0-9]+]], [r1]
-; CHECK-NEXT: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0]
+; CHECK: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0]
+; CHECK-NEXT: vldr [[ARG2_VAL:d[0-9]+]], [r1]
 ; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALlo]], [[ARG1_VALhi]]}, [[ARG2_VAL]]
 ; CHECK-NEXT: vstr [[RES]], [r1]
 ; CHECK-NEXT: bx lr
@@ -130,9 +130,9 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64
 ; AVX512F-NEXT:    shrq $32, %rdi
 ; AVX512F-NEXT:    shrq $48, %rax
 ; AVX512F-NEXT:    shrl $16, %ecx
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT:    vpavgb %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm4
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    kmovw %ecx, %k2
 ; AVX512F-NEXT:    kmovw %eax, %k3
@@ -142,14 +142,14 @@ define <64 x i8> @avg_v64i8_mask(<64 x i8> %a, <64 x i8> %b, <64 x i8> %src, i64
 ; AVX512F-NEXT:    vpternlogd $255, %zmm5, %zmm5, %zmm5 {%k3} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm5, %xmm5
 ; AVX512F-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm3, %ymm1
-; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT:    vpternlogd $255, %zmm4, %zmm4, %zmm4 {%k2} {z}
-; AVX512F-NEXT:    vpmovdb %zmm4, %xmm4
-; AVX512F-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
-; AVX512F-NEXT:    vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm2, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: avg_v64i8_mask:
@@ -178,9 +178,9 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin
 ; AVX512F-NEXT:    shrq $32, %rdi
 ; AVX512F-NEXT:    shrq $48, %rax
 ; AVX512F-NEXT:    shrl $16, %ecx
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT:    vpavgb %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpavgb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    kmovw %ecx, %k2
 ; AVX512F-NEXT:    kmovw %eax, %k3
@@ -190,14 +190,14 @@ define <64 x i8> @avg_v64i8_maskz(<64 x i8> %a, <64 x i8> %b, i64 %mask) nounwin
 ; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k3} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
 ; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512F-NEXT:    vpmovdb %zmm2, %xmm2
+; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k2} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm3, %xmm3
-; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX512F-NEXT:    vpand %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: avg_v64i8_maskz:
@@ -330,18 +330,18 @@ define <32 x i16> @avg_v32i16_mask(<32 x i16> %a, <32 x i16> %b, <32 x i16> %src
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm3
 ; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    shrl $16, %edi
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm4
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm5
-; AVX512F-NEXT:    vpavgw %ymm4, %ymm5, %ymm4
+; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm4
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    kmovw %edi, %k2
 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
 ; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
-; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm3, %ymm1
-; AVX512F-NEXT:    vpternlogd $255, %zmm3, %zmm3, %zmm3 {%k1} {z}
-; AVX512F-NEXT:    vpmovdw %zmm3, %ymm3
-; AVX512F-NEXT:    vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vpblendvb %ymm1, %ymm0, %ymm3, %ymm0
+; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm2, %ymm1
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: avg_v32i16_mask:
@@ -366,18 +366,18 @@ define <32 x i16> @avg_v32i16_maskz(<32 x i16> %a, <32 x i16> %b, i32 %mask) nou
 ; AVX512F:       # %bb.0:
 ; AVX512F-NEXT:    kmovw %edi, %k1
 ; AVX512F-NEXT:    shrl $16, %edi
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT:    vpavgw %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vpavgw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    kmovw %edi, %k2
 ; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
 ; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512F-NEXT:    vpand %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vpmovdw %zmm1, %ymm1
 ; AVX512F-NEXT:    vpand %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k1} {z}
-; AVX512F-NEXT:    vpmovdw %zmm2, %ymm2
-; AVX512F-NEXT:    vpand %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: avg_v32i16_maskz:
@@ -1975,9 +1975,9 @@ define <16 x i32> @pmaddwd_32(<32 x i16> %A, <32 x i16> %B) {
 ;
 ; AVX512F-LABEL: pmaddwd_32:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vpmaddwd %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -2188,9 +2188,9 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) {
 ;
 ; AVX512F-LABEL: jumbled_indices16:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm3
-; AVX512F-NEXT:    vpmaddwd %ymm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
+; AVX512F-NEXT:    vpmaddwd %ymm3, %ymm2, %ymm2
 ; AVX512F-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
 ; AVX512F-NEXT:    retq
@@ -6374,9 +6374,9 @@ define void @truncstore_v32i16_v32i8(<32 x i16> %x, <32 x i8>* %p, <32 x i8> %ma
 ;
 ; AVX512F-LABEL: truncstore_v32i16_v32i8:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vpcmpeqb %ymm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
+; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm1, %ymm1
 ; AVX512F-NEXT:    vpacksswb %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512F-NEXT:    vpmovmskb %ymm1, %eax
@@ -725,33 +725,33 @@ define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_
 define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
 ; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512F-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512F-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm2 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm2
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm3 = <1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u>
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm2, %xmm2
-; AVX512VL-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX512VL-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,5,9,14,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
-; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,1,5,9,13,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3]
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;