forked from OSchip/llvm-project
[ARM] Split large truncating MVE stores
MVE does not have a simple sign extend instruction that can move elements across lanes. We currently often end up moving each lane into and out of a GPR, in order to get elements into the correct places. When we have a store of a trunc (or a extend of a load), we can instead just split the store/load in two, using the narrowing/widening load/store instructions from each half of the vector. This does that for stores. It happens very early in a store combine, so as to easily detect the truncates. (It would be possible to do this later, but that would involve looking through a buildvector of extract elements. Not impossible but this way seemed simpler). By enabling store combines we also get a vmovdrr combine for free, helping some other tests. Differential Revision: https://reviews.llvm.org/D67828 llvm-svn: 372717
This commit is contained in:
parent
9379d19ff8
commit
49d851f403
|
@ -901,7 +901,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
|
|||
setTargetDAGCombine(ISD::SIGN_EXTEND);
|
||||
setTargetDAGCombine(ISD::ZERO_EXTEND);
|
||||
setTargetDAGCombine(ISD::ANY_EXTEND);
|
||||
setTargetDAGCombine(ISD::STORE);
|
||||
setTargetDAGCombine(ISD::FP_TO_SINT);
|
||||
setTargetDAGCombine(ISD::FP_TO_UINT);
|
||||
setTargetDAGCombine(ISD::FDIV);
|
||||
|
@ -922,6 +921,7 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
|
|||
setTargetDAGCombine(ISD::BUILD_VECTOR);
|
||||
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
|
||||
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
|
||||
setTargetDAGCombine(ISD::STORE);
|
||||
}
|
||||
|
||||
if (!Subtarget->hasFP64()) {
|
||||
|
@ -13120,95 +13120,161 @@ static SDValue PerformLOADCombine(SDNode *N,
|
|||
return SDValue();
|
||||
}
|
||||
|
||||
// Optimize trunc store (of multiple scalars) to shuffle and store. First,
|
||||
// pack all of the elements in one place. Next, store to memory in fewer
|
||||
// chunks.
|
||||
SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG) {
|
||||
SDValue StVal = St->getValue();
|
||||
EVT VT = StVal.getValueType();
|
||||
if (!St->isTruncatingStore() || !VT.isVector())
|
||||
return SDValue();
|
||||
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
|
||||
EVT StVT = St->getMemoryVT();
|
||||
unsigned NumElems = VT.getVectorNumElements();
|
||||
assert(StVT != VT && "Cannot truncate to the same type");
|
||||
unsigned FromEltSz = VT.getScalarSizeInBits();
|
||||
unsigned ToEltSz = StVT.getScalarSizeInBits();
|
||||
|
||||
// From, To sizes and ElemCount must be pow of two
|
||||
if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
|
||||
return SDValue();
|
||||
|
||||
// We are going to use the original vector elt for storing.
|
||||
// Accumulated smaller vector elements must be a multiple of the store size.
|
||||
if (0 != (NumElems * FromEltSz) % ToEltSz)
|
||||
return SDValue();
|
||||
|
||||
unsigned SizeRatio = FromEltSz / ToEltSz;
|
||||
assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
|
||||
|
||||
// Create a type on which we perform the shuffle.
|
||||
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
|
||||
NumElems * SizeRatio);
|
||||
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
|
||||
|
||||
SDLoc DL(St);
|
||||
SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
|
||||
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
|
||||
for (unsigned i = 0; i < NumElems; ++i)
|
||||
ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
|
||||
: i * SizeRatio;
|
||||
|
||||
// Can't shuffle using an illegal type.
|
||||
if (!TLI.isTypeLegal(WideVecVT))
|
||||
return SDValue();
|
||||
|
||||
SDValue Shuff = DAG.getVectorShuffle(
|
||||
WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
|
||||
// At this point all of the data is stored at the bottom of the
|
||||
// register. We now need to save it to mem.
|
||||
|
||||
// Find the largest store unit
|
||||
MVT StoreType = MVT::i8;
|
||||
for (MVT Tp : MVT::integer_valuetypes()) {
|
||||
if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
|
||||
StoreType = Tp;
|
||||
}
|
||||
// Didn't find a legal store type.
|
||||
if (!TLI.isTypeLegal(StoreType))
|
||||
return SDValue();
|
||||
|
||||
// Bitcast the original vector into a vector of store-size units
|
||||
EVT StoreVecVT =
|
||||
EVT::getVectorVT(*DAG.getContext(), StoreType,
|
||||
VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
|
||||
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
|
||||
SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
|
||||
SmallVector<SDValue, 8> Chains;
|
||||
SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
|
||||
TLI.getPointerTy(DAG.getDataLayout()));
|
||||
SDValue BasePtr = St->getBasePtr();
|
||||
|
||||
// Perform one or more big stores into memory.
|
||||
unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
|
||||
for (unsigned I = 0; I < E; I++) {
|
||||
SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
|
||||
ShuffWide, DAG.getIntPtrConstant(I, DL));
|
||||
SDValue Ch =
|
||||
DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
|
||||
St->getAlignment(), St->getMemOperand()->getFlags());
|
||||
BasePtr =
|
||||
DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
|
||||
Chains.push_back(Ch);
|
||||
}
|
||||
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
|
||||
}
|
||||
|
||||
// Try taking a single vector store from an truncate (which would otherwise turn
|
||||
// into an expensive buildvector) and splitting it into a series of narrowing
|
||||
// stores.
|
||||
SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG) {
|
||||
if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
|
||||
return SDValue();
|
||||
SDValue Trunc = St->getValue();
|
||||
if (Trunc->getOpcode() != ISD::TRUNCATE)
|
||||
return SDValue();
|
||||
EVT FromVT = Trunc->getOperand(0).getValueType();
|
||||
EVT ToVT = Trunc.getValueType();
|
||||
if (!ToVT.isVector())
|
||||
return SDValue();
|
||||
assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
|
||||
EVT ToEltVT = ToVT.getVectorElementType();
|
||||
EVT FromEltVT = FromVT.getVectorElementType();
|
||||
|
||||
unsigned NumElements = 0;
|
||||
if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
|
||||
NumElements = 4;
|
||||
if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
|
||||
NumElements = 8;
|
||||
if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
|
||||
FromVT.getVectorNumElements() % NumElements != 0)
|
||||
return SDValue();
|
||||
|
||||
SDLoc DL(St);
|
||||
// Details about the old store
|
||||
SDValue Ch = St->getChain();
|
||||
SDValue BasePtr = St->getBasePtr();
|
||||
unsigned Alignment = St->getOriginalAlignment();
|
||||
MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
|
||||
AAMDNodes AAInfo = St->getAAInfo();
|
||||
|
||||
EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
|
||||
EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);
|
||||
|
||||
SmallVector<SDValue, 4> Stores;
|
||||
for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
|
||||
unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
|
||||
SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
|
||||
|
||||
SDValue Extract =
|
||||
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
|
||||
DAG.getConstant(i * NumElements, DL, MVT::i32));
|
||||
SDValue Store = DAG.getTruncStore(
|
||||
Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
|
||||
NewToVT, Alignment, MMOFlags, AAInfo);
|
||||
Stores.push_back(Store);
|
||||
}
|
||||
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
|
||||
}
|
||||
|
||||
/// PerformSTORECombine - Target-specific dag combine xforms for
|
||||
/// ISD::STORE.
|
||||
static SDValue PerformSTORECombine(SDNode *N,
|
||||
TargetLowering::DAGCombinerInfo &DCI) {
|
||||
TargetLowering::DAGCombinerInfo &DCI,
|
||||
const ARMSubtarget *Subtarget) {
|
||||
StoreSDNode *St = cast<StoreSDNode>(N);
|
||||
if (St->isVolatile())
|
||||
return SDValue();
|
||||
|
||||
// Optimize trunc store (of multiple scalars) to shuffle and store. First,
|
||||
// pack all of the elements in one place. Next, store to memory in fewer
|
||||
// chunks.
|
||||
SDValue StVal = St->getValue();
|
||||
EVT VT = StVal.getValueType();
|
||||
if (St->isTruncatingStore() && VT.isVector()) {
|
||||
SelectionDAG &DAG = DCI.DAG;
|
||||
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
|
||||
EVT StVT = St->getMemoryVT();
|
||||
unsigned NumElems = VT.getVectorNumElements();
|
||||
assert(StVT != VT && "Cannot truncate to the same type");
|
||||
unsigned FromEltSz = VT.getScalarSizeInBits();
|
||||
unsigned ToEltSz = StVT.getScalarSizeInBits();
|
||||
|
||||
// From, To sizes and ElemCount must be pow of two
|
||||
if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
|
||||
if (Subtarget->hasNEON())
|
||||
if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
|
||||
return Store;
|
||||
|
||||
// We are going to use the original vector elt for storing.
|
||||
// Accumulated smaller vector elements must be a multiple of the store size.
|
||||
if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
|
||||
|
||||
unsigned SizeRatio = FromEltSz / ToEltSz;
|
||||
assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
|
||||
|
||||
// Create a type on which we perform the shuffle.
|
||||
EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
|
||||
NumElems*SizeRatio);
|
||||
assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
|
||||
|
||||
SDLoc DL(St);
|
||||
SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
|
||||
SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
|
||||
for (unsigned i = 0; i < NumElems; ++i)
|
||||
ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
|
||||
? (i + 1) * SizeRatio - 1
|
||||
: i * SizeRatio;
|
||||
|
||||
// Can't shuffle using an illegal type.
|
||||
if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
|
||||
|
||||
SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
|
||||
DAG.getUNDEF(WideVec.getValueType()),
|
||||
ShuffleVec);
|
||||
// At this point all of the data is stored at the bottom of the
|
||||
// register. We now need to save it to mem.
|
||||
|
||||
// Find the largest store unit
|
||||
MVT StoreType = MVT::i8;
|
||||
for (MVT Tp : MVT::integer_valuetypes()) {
|
||||
if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
|
||||
StoreType = Tp;
|
||||
}
|
||||
// Didn't find a legal store type.
|
||||
if (!TLI.isTypeLegal(StoreType))
|
||||
return SDValue();
|
||||
|
||||
// Bitcast the original vector into a vector of store-size units
|
||||
EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
|
||||
StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
|
||||
assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
|
||||
SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
|
||||
SmallVector<SDValue, 8> Chains;
|
||||
SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
|
||||
TLI.getPointerTy(DAG.getDataLayout()));
|
||||
SDValue BasePtr = St->getBasePtr();
|
||||
|
||||
// Perform one or more big stores into memory.
|
||||
unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
|
||||
for (unsigned I = 0; I < E; I++) {
|
||||
SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
|
||||
StoreType, ShuffWide,
|
||||
DAG.getIntPtrConstant(I, DL));
|
||||
SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
|
||||
St->getPointerInfo(), St->getAlignment(),
|
||||
St->getMemOperand()->getFlags());
|
||||
BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
|
||||
Increment);
|
||||
Chains.push_back(Ch);
|
||||
}
|
||||
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
|
||||
}
|
||||
if (Subtarget->hasMVEIntegerOps())
|
||||
if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
|
||||
return NewToken;
|
||||
|
||||
if (!ISD::isNormalStore(St))
|
||||
return SDValue();
|
||||
|
@ -13260,7 +13326,7 @@ static SDValue PerformSTORECombine(SDNode *N,
|
|||
}
|
||||
|
||||
// If this is a legal vector store, try to combine it into a VST1_UPD.
|
||||
if (ISD::isNormalStore(N) && VT.isVector() &&
|
||||
if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
|
||||
DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
|
||||
return CombineBaseUpdate(N, DCI);
|
||||
|
||||
|
@ -14170,7 +14236,7 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
|
|||
case ARMISD::BFI: return PerformBFICombine(N, DCI);
|
||||
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
|
||||
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
|
||||
case ISD::STORE: return PerformSTORECombine(N, DCI);
|
||||
case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
|
||||
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
|
||||
case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
|
||||
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
|
||||
|
|
|
@ -130,7 +130,7 @@ define void @store_d(double* %a, double %b) {
|
|||
entry:
|
||||
; CHECK-LABEL: store_d:
|
||||
; NOREGS: strd r2, r3, [r0]
|
||||
; ONLYREGS: vstr d0, [r0]
|
||||
; ONLYREGS: strd r2, r3, [r0]
|
||||
; HARD: vstr d0, [r0]
|
||||
store double %b, double* %a, align 8
|
||||
ret void
|
||||
|
|
|
@ -521,15 +521,11 @@ define arm_aapcs_vfpcc void @masked_v2i64(<2 x i64> *%dest, <2 x i64> %a) {
|
|||
; CHECK-LE-NEXT: bfi r3, r1, #0, #1
|
||||
; CHECK-LE-NEXT: and r1, r3, #3
|
||||
; CHECK-LE-NEXT: lsls r2, r3, #31
|
||||
; CHECK-LE-NEXT: ittt ne
|
||||
; CHECK-LE-NEXT: vmovne r2, s1
|
||||
; CHECK-LE-NEXT: vmovne r3, s0
|
||||
; CHECK-LE-NEXT: strdne r3, r2, [r0]
|
||||
; CHECK-LE-NEXT: it ne
|
||||
; CHECK-LE-NEXT: vstrne d0, [r0]
|
||||
; CHECK-LE-NEXT: lsls r1, r1, #30
|
||||
; CHECK-LE-NEXT: ittt mi
|
||||
; CHECK-LE-NEXT: vmovmi r1, s3
|
||||
; CHECK-LE-NEXT: vmovmi r2, s2
|
||||
; CHECK-LE-NEXT: strdmi r2, r1, [r0, #8]
|
||||
; CHECK-LE-NEXT: it mi
|
||||
; CHECK-LE-NEXT: vstrmi d1, [r0, #8]
|
||||
; CHECK-LE-NEXT: add sp, #4
|
||||
; CHECK-LE-NEXT: bx lr
|
||||
;
|
||||
|
@ -558,25 +554,11 @@ define arm_aapcs_vfpcc void @masked_v2i64(<2 x i64> *%dest, <2 x i64> %a) {
|
|||
; CHECK-BE-NEXT: bfi r3, r1, #0, #1
|
||||
; CHECK-BE-NEXT: and r1, r3, #3
|
||||
; CHECK-BE-NEXT: lsls r2, r3, #31
|
||||
; CHECK-BE-NEXT: bne .LBB19_3
|
||||
; CHECK-BE-NEXT: @ %bb.1: @ %else
|
||||
; CHECK-BE-NEXT: it ne
|
||||
; CHECK-BE-NEXT: vstrne d0, [r0]
|
||||
; CHECK-BE-NEXT: lsls r1, r1, #30
|
||||
; CHECK-BE-NEXT: bmi .LBB19_4
|
||||
; CHECK-BE-NEXT: .LBB19_2: @ %else2
|
||||
; CHECK-BE-NEXT: add sp, #4
|
||||
; CHECK-BE-NEXT: bx lr
|
||||
; CHECK-BE-NEXT: .LBB19_3: @ %cond.store
|
||||
; CHECK-BE-NEXT: vrev64.32 q1, q0
|
||||
; CHECK-BE-NEXT: vmov r2, s5
|
||||
; CHECK-BE-NEXT: vmov r3, s4
|
||||
; CHECK-BE-NEXT: strd r3, r2, [r0]
|
||||
; CHECK-BE-NEXT: lsls r1, r1, #30
|
||||
; CHECK-BE-NEXT: bpl .LBB19_2
|
||||
; CHECK-BE-NEXT: .LBB19_4: @ %cond.store1
|
||||
; CHECK-BE-NEXT: vrev64.32 q1, q0
|
||||
; CHECK-BE-NEXT: vmov r1, s7
|
||||
; CHECK-BE-NEXT: vmov r2, s6
|
||||
; CHECK-BE-NEXT: strd r2, r1, [r0, #8]
|
||||
; CHECK-BE-NEXT: it mi
|
||||
; CHECK-BE-NEXT: vstrmi d1, [r0, #8]
|
||||
; CHECK-BE-NEXT: add sp, #4
|
||||
; CHECK-BE-NEXT: bx lr
|
||||
entry:
|
||||
|
|
Loading…
Reference in New Issue