[SystemZ] Patchset for expanding memcpy/memset using at most two stores.

* Set MaxStoresPerMemcpy and MaxStoresPerMemset to 2 (see the example after this list).

* Optimize stores of replicated values in SystemZ::combineSTORE(). This
  handles the now-expanded memory operations as well as some other
  pre-existing cases.

* Reject big displacements in isLegalAddressingMode() for vector types, since
  vector load/store instructions only provide a 12-bit unsigned displacement.

* Return true from shouldConsiderGEPOffsetSplit().
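
As a concrete illustration of the new memcpy limit (hypothetical C code, not
part of the patch; the expected sequences follow from the fun32/fun33 tests
added below):

    #include <string.h>

    // On a vector-enabled SystemZ target, a 32-byte memcpy is now expanded
    // into two 16-byte vector load/store (VL/VST) pairs, while 33 bytes and
    // up still use a single MVC.
    void copy32(char *Dst, const char *Src) { memcpy(Dst, Src, 32); }
    void copy33(char *Dst, const char *Src) { memcpy(Dst, Src, 33); }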

Reviewed By: Ulrich Weigand

Differential Revision: https://reviews.llvm.org/D122105
Jonas Paulsson 2022-03-08 15:51:43 -05:00
parent afc21c7e79
commit eaa78035c6
8 changed files with 1215 additions and 14 deletions

llvm/include/llvm/CodeGen/TargetLowering.h

@@ -3486,7 +3486,8 @@ public:
/// Return true if the number of memory ops is below the threshold (Limit).
/// It returns the types of the sequence of memory ops to perform
/// memset / memcpy by reference.
-  bool findOptimalMemOpLowering(std::vector<EVT> &MemOps, unsigned Limit,
+  virtual bool
+  findOptimalMemOpLowering(std::vector<EVT> &MemOps, unsigned Limit,
                            const MemOp &Op, unsigned DstAS, unsigned SrcAS,
                            const AttributeList &FuncAttributes) const;

llvm/lib/Target/SystemZ/SystemZISelLowering.cpp

@@ -669,7 +669,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
// We want to use MVC in preference to even a single load/store pair.
-  MaxStoresPerMemcpy = 0;
+  MaxStoresPerMemcpy = Subtarget.hasVector() ? 2 : 0;
MaxStoresPerMemcpyOptSize = 0;
// The main memset sequence is a byte store followed by an MVC.
@@ -677,7 +677,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
// generated by target-independent code don't when the byte value is
// variable. E.g. "STC <reg>;MHI <reg>,257;STH <reg>" is not better
// than "STC;MVC". Handle the choice in target-specific code instead.
-  MaxStoresPerMemset = 0;
+  MaxStoresPerMemset = Subtarget.hasVector() ? 2 : 0;
MaxStoresPerMemsetOptSize = 0;
// Default to having -disable-strictnode-mutation on
@@ -793,14 +793,17 @@ bool SystemZVectorConstantInfo::isVectorConstantLegal(
return tryValue(SplatBitsZ | Middle);
}
-SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) {
-  IntBits = FPImm.bitcastToAPInt().zextOrSelf(128);
-  isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad());
-  SplatBits = FPImm.bitcastToAPInt();
-  unsigned Width = SplatBits.getBitWidth();
-  IntBits <<= (SystemZ::VectorBits - Width);
+SystemZVectorConstantInfo::SystemZVectorConstantInfo(APInt IntImm) {
+  if (IntImm.isSingleWord()) {
+    IntBits = APInt(128, IntImm.getZExtValue());
+    IntBits <<= (SystemZ::VectorBits - IntImm.getBitWidth());
+  } else
+    IntBits = IntImm;
+  assert(IntBits.getBitWidth() == 128 && "Unsupported APInt.");
+  // Find the smallest splat.
+  SplatBits = IntImm;
+  unsigned Width = SplatBits.getBitWidth();
while (Width > 8) {
unsigned HalfSize = Width / 2;
APInt HighValue = SplatBits.lshr(HalfSize).trunc(HalfSize);
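
For reference, a minimal standalone sketch of the halving loop above, using a
plain uint64_t instead of APInt (smallestSplatWidth is an illustrative name,
not LLVM API):

    #include <cstdint>
    #include <cstdio>

    // Find the smallest power-of-two width W such that the 64-bit value is
    // the same W-bit pattern repeated, mirroring the loop above.
    static unsigned smallestSplatWidth(uint64_t Val) {
      unsigned Width = 64;
      while (Width > 8) {
        unsigned Half = Width / 2;
        uint64_t Mask = (1ULL << Half) - 1;
        uint64_t Lo = Val & Mask;
        uint64_t Hi = (Val >> Half) & Mask;
        if (Lo != Hi)
          break;        // Halves differ: Width is the smallest splat unit.
        Val = Lo;       // Halves agree: keep halving.
        Width = Half;
      }
      return Width;
    }

    int main() {
      printf("%u\n", smallestSplatWidth(0x0101010101010101ULL)); // 8
      printf("%u\n", smallestSplatWidth(0x0001000100010001ULL)); // 16
      printf("%u\n", smallestSplatWidth(0x0102030405060708ULL)); // 64
    }
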
@@ -976,7 +979,8 @@ bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (!isInt<20>(AM.BaseOffs))
return false;
-  AddressingMode SupportedAM(true, true);
+  bool RequireD12 = Subtarget.hasVector() && Ty->isVectorTy();
+  AddressingMode SupportedAM(!RequireD12, true);
if (I != nullptr)
SupportedAM = supportedAddressingMode(I, Subtarget.hasVector());
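
The RequireD12 check exists because vector loads/stores (VL, VST) only encode
a 12-bit unsigned displacement, while scalar instructions like LG and STG have
a 20-bit signed displacement. A tiny illustrative sketch of the two ranges
(plain C++, not LLVM API):

    #include <cstdint>

    // 12-bit unsigned displacement (VL, VST, ...): 0 .. 4095.
    bool fitsD12(int64_t Disp) { return Disp >= 0 && Disp < (1 << 12); }
    // 20-bit signed displacement (LG, STG, ...): -524288 .. 524287.
    bool fitsD20(int64_t Disp) { return Disp >= -(1 << 19) && Disp < (1 << 19); }
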
@@ -991,6 +995,28 @@ bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL,
return AM.Scale == 0 || AM.Scale == 1;
}
bool SystemZTargetLowering::findOptimalMemOpLowering(
std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
unsigned SrcAS, const AttributeList &FuncAttributes) const {
const int MVCFastLen = 16;
// Don't expand Op into scalar loads/stores in these cases:
if (Op.isMemcpy() && Op.allowOverlap() && Op.size() <= MVCFastLen)
return false; // Small memcpy: Use MVC
if (Op.isMemset() && Op.size() - 1 <= MVCFastLen)
return false; // Small memset (first byte with STC/MVI): Use MVC
if (Op.isZeroMemset())
return false; // Memset zero: Use XC
return TargetLowering::findOptimalMemOpLowering(MemOps, Limit, Op, DstAS,
SrcAS, FuncAttributes);
}
EVT SystemZTargetLowering::getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const {
return Subtarget.hasVector() ? MVT::v2i64 : MVT::Other;
}
bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const {
if (!FromType->isIntegerTy() || !ToType->isIntegerTy())
return false;
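
Taken together with the limits of 2 set above, the resulting strategy can be
modeled roughly as follows (a simplified standalone sketch assuming a
vector-enabled target; alignment and volatility are ignored):

    #include <cstdio>

    const int MVCFastLen = 16;

    // Roughly what findOptimalMemOpLowering plus the MaxStoresPerMem* limits
    // amount to for constant-length operations.
    const char *memcpyStrategy(unsigned Size) {
      if (Size <= MVCFastLen) return "single MVC";
      if (Size <= 32)         return "two load/store pairs";
      return "single MVC";
    }
    const char *memsetStrategy(unsigned Size, bool IsZero) {
      if (IsZero)                 return "XC";
      if (Size - 1 <= MVCFastLen) return "STC/MVI of first byte + MVC";
      if (Size <= 32)             return "replicate value + two stores";
      return "STC/MVI of first byte + MVC";
    }

    int main() {
      for (unsigned S : {16u, 17u, 32u, 33u})
        printf("memcpy %2u: %s\n", S, memcpyStrategy(S));
      for (unsigned S : {17u, 18u, 32u, 33u})
        printf("memset %2u: %s\n", S, memsetStrategy(S, false));
    }
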
@@ -6329,6 +6355,23 @@ static bool isVectorElementSwap(ArrayRef<int> M, EVT VT) {
return true;
}
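// Return true if StoredVal is used only by stores (of a round scalar type of
// at most 16 bytes), possibly through splat BUILD_VECTORs that are themselves
// only stored; only then is rewriting it as a replicated vector worthwhile.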
static bool isOnlyUsedByStores(SDValue StoredVal, SelectionDAG &DAG) {
for (auto *U : StoredVal->uses()) {
if (StoreSDNode *ST = dyn_cast<StoreSDNode>(U)) {
EVT CurrMemVT = ST->getMemoryVT().getScalarType();
if (CurrMemVT.isRound() && CurrMemVT.getStoreSize() <= 16)
continue;
} else if (isa<BuildVectorSDNode>(U)) {
SDValue BuildVector = SDValue(U, 0);
if (DAG.isSplatValue(BuildVector, true/*AllowUndefs*/) &&
isOnlyUsedByStores(BuildVector, DAG))
continue;
}
return false;
}
return true;
}
SDValue SystemZTargetLowering::combineSTORE(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -6387,6 +6430,82 @@ SDValue SystemZTargetLowering::combineSTORE(
}
}
// Replicate a reg or immediate with VREP instead of a scalar multiply or
// immediate load. It seems best to do this during the first DAGCombine, as
// it is straightforward to handle the zero-extend node in the initial
// DAG, and also not to worry about keeping the new MemVT legal (e.g. when
// extracting an i16 element from a v16i8 vector).
if (Subtarget.hasVector() && DCI.Level == BeforeLegalizeTypes &&
isOnlyUsedByStores(Op1, DAG)) {
SDValue Word = SDValue();
EVT WordVT;
// Find a replicated immediate and return it if found in Word and its
// type in WordVT.
auto FindReplicatedImm = [&](ConstantSDNode *C, unsigned TotBytes) {
// Some constants are better handled with a scalar store.
if (C->getAPIntValue().getBitWidth() > 64 || C->isAllOnes() ||
isInt<16>(C->getSExtValue()) || MemVT.getStoreSize() <= 2)
return;
SystemZVectorConstantInfo VCI(APInt(TotBytes * 8, C->getZExtValue()));
if (VCI.isVectorConstantLegal(Subtarget) &&
VCI.Opcode == SystemZISD::REPLICATE) {
Word = DAG.getConstant(VCI.OpVals[0], SDLoc(SN), MVT::i32);
WordVT = VCI.VecVT.getScalarType();
}
};
// Find a replicated register and return it if found in Word and its type
// in WordVT.
auto FindReplicatedReg = [&](SDValue MulOp) {
EVT MulVT = MulOp.getValueType();
if (MulOp->getOpcode() == ISD::MUL &&
(MulVT == MVT::i16 || MulVT == MVT::i32 || MulVT == MVT::i64)) {
// Find a zero extended value and its type.
SDValue LHS = MulOp->getOperand(0);
if (LHS->getOpcode() == ISD::ZERO_EXTEND)
WordVT = LHS->getOperand(0).getValueType();
else if (LHS->getOpcode() == ISD::AssertZext)
WordVT = cast<VTSDNode>(LHS->getOperand(1))->getVT();
else
return;
// Find a replicating constant, e.g. 0x00010001.
if (auto *C = dyn_cast<ConstantSDNode>(MulOp->getOperand(1))) {
SystemZVectorConstantInfo VCI(
APInt(MulVT.getSizeInBits(), C->getZExtValue()));
if (VCI.isVectorConstantLegal(Subtarget) &&
VCI.Opcode == SystemZISD::REPLICATE && VCI.OpVals[0] == 1 &&
WordVT == VCI.VecVT.getScalarType())
Word = DAG.getZExtOrTrunc(LHS->getOperand(0), SDLoc(SN), WordVT);
}
}
};
if (isa<BuildVectorSDNode>(Op1) &&
DAG.isSplatValue(Op1, true/*AllowUndefs*/)) {
SDValue SplatVal = Op1->getOperand(0);
if (auto *C = dyn_cast<ConstantSDNode>(SplatVal))
FindReplicatedImm(C, SplatVal.getValueType().getStoreSize());
else
FindReplicatedReg(SplatVal);
} else {
if (auto *C = dyn_cast<ConstantSDNode>(Op1))
FindReplicatedImm(C, MemVT.getStoreSize());
else
FindReplicatedReg(Op1);
}
if (Word != SDValue()) {
assert(MemVT.getSizeInBits() % WordVT.getSizeInBits() == 0 &&
"Bad type handling");
unsigned NumElts = MemVT.getSizeInBits() / WordVT.getSizeInBits();
EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), WordVT, NumElts);
SDValue SplatVal = DAG.getSplatVector(SplatVT, SDLoc(SN), Word);
return DAG.getStore(SN->getChain(), SDLoc(SN), SplatVal,
SN->getBasePtr(), SN->getMemOperand());
}
}
return SDValue();
}
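
The multiply-based replication matched by FindReplicatedReg uses multipliers
of the form (2^N - 1) / (2^W - 1), which have a 1 bit at every W-th position.
A small standalone helper that reproduces the magic constants used in the
tests below (replMul is an illustrative name, not LLVM API):

    #include <cstdint>
    #include <cstdio>

    // Multiplier that replicates a W-bit value across N bits:
    // (zext x to iN) * replMul(W, N) == x repeated N/W times.
    uint64_t replMul(unsigned WordBits, unsigned TotBits) {
      uint64_t M = 0;
      for (unsigned Sh = 0; Sh < TotBits; Sh += WordBits)
        M |= 1ULL << Sh;
      return M;
    }

    int main() {
      printf("%llu\n", (unsigned long long)replMul(8, 16));  // 257
      printf("%llu\n", (unsigned long long)replMul(8, 32));  // 16843009
      printf("%llu\n", (unsigned long long)replMul(8, 64));  // 72340172838076673
      printf("%llu\n", (unsigned long long)replMul(16, 32)); // 65537
      printf("%llu\n", (unsigned long long)replMul(32, 64)); // 4294967297
    }
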

llvm/lib/Target/SystemZ/SystemZISelLowering.h

@@ -457,6 +457,12 @@ public:
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment,
MachineMemOperand::Flags Flags,
bool *Fast) const override;
bool
findOptimalMemOpLowering(std::vector<EVT> &MemOps, unsigned Limit,
const MemOp &Op, unsigned DstAS, unsigned SrcAS,
const AttributeList &FuncAttributes) const override;
EVT getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
bool isTruncateFree(Type *, Type *) const override;
bool isTruncateFree(EVT, EVT) const override;
@@ -467,6 +473,8 @@ public:
return VT == MVT::i32 || VT == MVT::i64;
}
bool shouldConsiderGEPOffsetSplit() const override { return true; }
const char *getTargetNodeName(unsigned Opcode) const override;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
@@ -767,12 +775,15 @@ private:
APInt SplatUndef; // Bits corresponding to undef operands of the BVN.
unsigned SplatBitSize = 0;
bool isFP128 = false;
public:
unsigned Opcode = 0;
SmallVector<unsigned, 2> OpVals;
MVT VecVT;
-  SystemZVectorConstantInfo(APFloat FPImm);
+  SystemZVectorConstantInfo(APInt IntImm);
+  SystemZVectorConstantInfo(APFloat FPImm)
+      : SystemZVectorConstantInfo(FPImm.bitcastToAPInt()) {
+    isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad());
+  }
SystemZVectorConstantInfo(BuildVectorSDNode *BVN);
bool isVectorConstantLegal(const SystemZSubtarget &Subtarget);
};


@@ -0,0 +1,24 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s
;
; Test that two big offsets from the same base are handled with a single AGFI:
; 131072 * 8 = 1048576 and 131073 * 8 = 1048576 + 8, so after one AGFI both
; accesses fit in small displacements (0 and 8).
define void @fun(i64* %Src, i64* %Dst) {
; CHECK-LABEL: fun:
; CHECK: # %bb.0:
; CHECK-NEXT: agfi %r2, 1048576
; CHECK-NEXT: lg %r0, 0(%r2)
; CHECK-NEXT: stg %r0, 0(%r3)
; CHECK-NEXT: lg %r0, 8(%r2)
; CHECK-NEXT: stg %r0, 0(%r3)
; CHECK-NEXT: br %r14
%S0 = getelementptr i64, i64* %Src, i64 131072
%V0 = load i64, i64* %S0
store volatile i64 %V0, i64* %Dst
%S1 = getelementptr i64, i64* %Src, i64 131073
%V1 = load i64, i64* %S1
store volatile i64 %V1, i64* %Dst
ret void
}


@@ -0,0 +1,29 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s
;
; Test that DAGCombiner does not fold the split GEP back into the addressing,
; as displacements of 4096 and 4112 are out of range for the 12-bit unsigned
; displacement of VL/VST. Only one addition (AGHI) is needed.
define void @fun(<2 x i64>* %Src, <2 x i64>* %Dst) {
; CHECK-LABEL: fun:
; CHECK: # %bb.0:
; CHECK-NEXT: aghi %r2, 4096
; CHECK-NEXT: vl %v0, 0(%r2), 3
; CHECK-NEXT: vst %v0, 0(%r3), 3
; CHECK-NEXT: vl %v0, 16(%r2), 3
; CHECK-NEXT: vst %v0, 0(%r3), 3
; CHECK-NEXT: br %r14
%1 = bitcast <2 x i64>* %Src to i8*
%splitgep = getelementptr i8, i8* %1, i64 4096
%2 = bitcast i8* %splitgep to <2 x i64>*
%V0 = load <2 x i64>, <2 x i64>* %2, align 8
store volatile <2 x i64> %V0, <2 x i64>* %Dst, align 8
%3 = getelementptr i8, i8* %splitgep, i64 16
%4 = bitcast i8* %3 to <2 x i64>*
%V1 = load <2 x i64>, <2 x i64>* %4, align 8
store volatile <2 x i64> %V1, <2 x i64>* %Dst, align 8
ret void
}


@@ -0,0 +1,217 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mcpu=z15 < %s -mtriple=s390x-linux-gnu | FileCheck %s
;
; Test memcpy with small constant lengths: lengths 17-32 are expanded into
; two load/store pairs, while 16 and 33 still use a single MVC.
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
define void @fun16(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun16:
; CHECK: # %bb.0:
; CHECK-NEXT: mvc 0(16,%r3), 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 16, i1 false)
ret void
}
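; For lengths 17-31 below, the expansion is one 16-byte VL/VST pair plus a
; second access that may overlap it; overlapping bytes are simply written
; twice with the same data, which is safe since memcpy operands do not alias.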
define void @fun17(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun17:
; CHECK: # %bb.0:
; CHECK-NEXT: lb %r0, 16(%r2)
; CHECK-NEXT: stc %r0, 16(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 17, i1 false)
ret void
}
define void @fun18(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun18:
; CHECK: # %bb.0:
; CHECK-NEXT: lh %r0, 16(%r2)
; CHECK-NEXT: sth %r0, 16(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 18, i1 false)
ret void
}
define void @fun19(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun19:
; CHECK: # %bb.0:
; CHECK-NEXT: l %r0, 15(%r2)
; CHECK-NEXT: st %r0, 15(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 19, i1 false)
ret void
}
define void @fun20(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun20:
; CHECK: # %bb.0:
; CHECK-NEXT: l %r0, 16(%r2)
; CHECK-NEXT: st %r0, 16(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 20, i1 false)
ret void
}
define void @fun21(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun21:
; CHECK: # %bb.0:
; CHECK-NEXT: lg %r0, 13(%r2)
; CHECK-NEXT: stg %r0, 13(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 21, i1 false)
ret void
}
define void @fun22(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun22:
; CHECK: # %bb.0:
; CHECK-NEXT: lg %r0, 14(%r2)
; CHECK-NEXT: stg %r0, 14(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 22, i1 false)
ret void
}
define void @fun23(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun23:
; CHECK: # %bb.0:
; CHECK-NEXT: lg %r0, 15(%r2)
; CHECK-NEXT: stg %r0, 15(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 23, i1 false)
ret void
}
define void @fun24(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun24:
; CHECK: # %bb.0:
; CHECK-NEXT: lg %r0, 16(%r2)
; CHECK-NEXT: stg %r0, 16(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 24, i1 false)
ret void
}
define void @fun25(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun25:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 9(%r2)
; CHECK-NEXT: vst %v0, 9(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 25, i1 false)
ret void
}
define void @fun26(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun26:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 10(%r2)
; CHECK-NEXT: vst %v0, 10(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 26, i1 false)
ret void
}
define void @fun27(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun27:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 11(%r2)
; CHECK-NEXT: vst %v0, 11(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 27, i1 false)
ret void
}
define void @fun28(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun28:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 12(%r2)
; CHECK-NEXT: vst %v0, 12(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 28, i1 false)
ret void
}
define void @fun29(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun29:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 13(%r2)
; CHECK-NEXT: vst %v0, 13(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 29, i1 false)
ret void
}
define void @fun30(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun30:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 14(%r2)
; CHECK-NEXT: vst %v0, 14(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 30, i1 false)
ret void
}
define void @fun31(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun31:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 15(%r2)
; CHECK-NEXT: vst %v0, 15(%r3)
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 31, i1 false)
ret void
}
define void @fun32(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun32:
; CHECK: # %bb.0:
; CHECK-NEXT: vl %v0, 16(%r2), 4
; CHECK-NEXT: vst %v0, 16(%r3), 4
; CHECK-NEXT: vl %v0, 0(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 32, i1 false)
ret void
}
define void @fun33(i8* %Src, i8* %Dst, i8 %val) {
; CHECK-LABEL: fun33:
; CHECK: # %bb.0:
; CHECK-NEXT: mvc 0(33,%r3), 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 33, i1 false)
ret void
}


@@ -0,0 +1,420 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mcpu=z15 %s -mtriple=s390x-linux-gnu -o - | FileCheck %s
;
; Test memset with small constant lengths: lengths 18-32 are expanded into a
; replicated value and at most two stores, while 17 and 33 still use STC/MVI
; followed by an overlapping MVC, and memset of zero uses XC.
declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)
define void @reg17(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg17:
; CHECK: # %bb.0:
; CHECK-NEXT: stc %r3, 0(%r2)
; CHECK-NEXT: mvc 1(16,%r2), 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 17, i1 false)
ret void
}
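; Length 17 keeps the MVC strategy: after the first byte is stored with STC,
; the remaining 16 bytes are one overlapping MVC (the Op.size() - 1 <=
; MVCFastLen case in findOptimalMemOpLowering).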
define void @reg18(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg18:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: vsteh %v0, 16(%r2), 0
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 18, i1 false)
ret void
}
define void @reg19(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg19:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vstef %v0, 15(%r2), 0
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 19, i1 false)
ret void
}
define void @reg20(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg20:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vstef %v0, 16(%r2), 0
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 20, i1 false)
ret void
}
define void @reg21(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg21:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vsteg %v0, 13(%r2), 0
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 21, i1 false)
ret void
}
define void @reg22(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg22:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vsteg %v0, 14(%r2), 0
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 22, i1 false)
ret void
}
define void @reg23(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg23:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vsteg %v0, 15(%r2), 0
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 23, i1 false)
ret void
}
define void @reg24(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg24:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vsteg %v0, 16(%r2), 0
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 24, i1 false)
ret void
}
define void @reg25(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg25:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vst %v0, 9(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 25, i1 false)
ret void
}
define void @reg26(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg26:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vst %v0, 10(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 26, i1 false)
ret void
}
define void @reg27(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg27:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vst %v0, 11(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 27, i1 false)
ret void
}
define void @reg28(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg28:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vst %v0, 12(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 28, i1 false)
ret void
}
define void @reg29(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg29:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vst %v0, 13(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 29, i1 false)
ret void
}
define void @reg30(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg30:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vst %v0, 14(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 30, i1 false)
ret void
}
define void @reg31(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg31:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vst %v0, 15(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 31, i1 false)
ret void
}
define void @reg32(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg32:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r3, %r3
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vst %v0, 16(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 32, i1 false)
ret void
}
define void @reg33(i8* %Dst, i8 %val) {
; CHECK-LABEL: reg33:
; CHECK: # %bb.0:
; CHECK-NEXT: stc %r3, 0(%r2)
; CHECK-NEXT: mvc 1(32,%r2), 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 33, i1 false)
ret void
}
;; Immediate value
define void @imm17(i8* %Dst) {
; CHECK-LABEL: imm17:
; CHECK: # %bb.0:
; CHECK-NEXT: mvi 0(%r2), 1
; CHECK-NEXT: mvc 1(16,%r2), 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 17, i1 false)
ret void
}
define void @imm18(i8* %Dst) {
; CHECK-LABEL: imm18:
; CHECK: # %bb.0:
; CHECK-NEXT: vgbm %v0, 65535
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: mvhhi 16(%r2), -1
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 18, i1 false)
ret void
}
define void @zero18(i8* %Dst) {
; CHECK-LABEL: zero18:
; CHECK: # %bb.0:
; CHECK-NEXT: xc 0(18,%r2), 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 0, i64 18, i1 false)
ret void
}
define void @imm19(i8* %Dst) {
; CHECK-LABEL: imm19:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepib %v0, 1
; CHECK-NEXT: vstef %v0, 15(%r2), 0
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 19, i1 false)
ret void
}
define void @imm20(i8* %Dst) {
; CHECK-LABEL: imm20:
; CHECK: # %bb.0:
; CHECK-NEXT: vgbm %v0, 65535
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: mvhi 16(%r2), -1
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 20, i1 false)
ret void
}
define void @imm21(i8* %Dst) {
; CHECK-LABEL: imm21:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepib %v0, 1
; CHECK-NEXT: vsteg %v0, 13(%r2), 0
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 21, i1 false)
ret void
}
define void @imm22(i8* %Dst) {
; CHECK-LABEL: imm22:
; CHECK: # %bb.0:
; CHECK-NEXT: vgbm %v0, 65535
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: mvghi 14(%r2), -1
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 22, i1 false)
ret void
}
define void @imm23(i8* %Dst) {
; CHECK-LABEL: imm23:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepib %v0, 1
; CHECK-NEXT: vsteg %v0, 15(%r2), 0
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 23, i1 false)
ret void
}
define void @imm24(i8* %Dst) {
; CHECK-LABEL: imm24:
; CHECK: # %bb.0:
; CHECK-NEXT: vgbm %v0, 65535
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: mvghi 16(%r2), -1
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 24, i1 false)
ret void
}
define void @imm25(i8* %Dst) {
; CHECK-LABEL: imm25:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepib %v0, 1
; CHECK-NEXT: vst %v0, 9(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 25, i1 false)
ret void
}
define void @imm26(i8* %Dst) {
; CHECK-LABEL: imm26:
; CHECK: # %bb.0:
; CHECK-NEXT: vgbm %v0, 65535
; CHECK-NEXT: vst %v0, 10(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 26, i1 false)
ret void
}
define void @zero26(i8* %Dst) {
; CHECK-LABEL: zero26:
; CHECK: # %bb.0:
; CHECK-NEXT: xc 0(26,%r2), 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 0, i64 26, i1 false)
ret void
}
define void @imm27(i8* %Dst) {
; CHECK-LABEL: imm27:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepib %v0, 1
; CHECK-NEXT: vst %v0, 11(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 27, i1 false)
ret void
}
define void @imm28(i8* %Dst) {
; CHECK-LABEL: imm28:
; CHECK: # %bb.0:
; CHECK-NEXT: vgbm %v0, 65535
; CHECK-NEXT: vst %v0, 12(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 28, i1 false)
ret void
}
define void @imm29(i8* %Dst) {
; CHECK-LABEL: imm29:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepib %v0, 1
; CHECK-NEXT: vst %v0, 13(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 29, i1 false)
ret void
}
define void @imm30(i8* %Dst) {
; CHECK-LABEL: imm30:
; CHECK: # %bb.0:
; CHECK-NEXT: vgbm %v0, 65535
; CHECK-NEXT: vst %v0, 14(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 30, i1 false)
ret void
}
define void @imm31(i8* %Dst) {
; CHECK-LABEL: imm31:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepib %v0, 1
; CHECK-NEXT: vst %v0, 15(%r2)
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 31, i1 false)
ret void
}
define void @imm32(i8* %Dst) {
; CHECK-LABEL: imm32:
; CHECK: # %bb.0:
; CHECK-NEXT: vgbm %v0, 65535
; CHECK-NEXT: vst %v0, 16(%r2), 4
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 32, i1 false)
ret void
}
define void @zero32(i8* %Dst) {
; CHECK-LABEL: zero32:
; CHECK: # %bb.0:
; CHECK-NEXT: xc 0(32,%r2), 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 0, i64 32, i1 false)
ret void
}
define void @imm33(i8* %Dst) {
; CHECK-LABEL: imm33:
; CHECK: # %bb.0:
; CHECK-NEXT: mvi 0(%r2), 1
; CHECK-NEXT: mvc 1(32,%r2), 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 33, i1 false)
ret void
}


@@ -0,0 +1,380 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s
;
; Test stores of replicated values using vector replicate-type instructions
; (VLREP, VREP, VREPI).
;; Replicated registers
define void @fun_2x1b(i8* %Src, i16* %Dst) {
; CHECK-LABEL: fun_2x1b:
; CHECK: # %bb.0:
; CHECK-NEXT: vlrepb %v0, 0(%r2)
; CHECK-NEXT: vsteh %v0, 0(%r3), 0
; CHECK-NEXT: br %r14
%i = load i8, i8* %Src
%ZE = zext i8 %i to i16
%Val = mul i16 %ZE, 257
store i16 %Val, i16* %Dst
ret void
}
; Test multiple stores of the same value.
define void @fun_4x1b(i8* %Src, i32* %Dst, i32* %Dst2) {
; CHECK-LABEL: fun_4x1b:
; CHECK: # %bb.0:
; CHECK-NEXT: vlrepb %v0, 0(%r2)
; CHECK-NEXT: vstef %v0, 0(%r3), 0
; CHECK-NEXT: vstef %v0, 0(%r4), 0
; CHECK-NEXT: br %r14
%i = load i8, i8* %Src
%ZE = zext i8 %i to i32
%Val = mul i32 %ZE, 16843009
store i32 %Val, i32* %Dst
store i32 %Val, i32* %Dst2
ret void
}
define void @fun_8x1b(i8* %Src, i64* %Dst) {
; CHECK-LABEL: fun_8x1b:
; CHECK: # %bb.0:
; CHECK-NEXT: vlrepb %v0, 0(%r2)
; CHECK-NEXT: vsteg %v0, 0(%r3), 0
; CHECK-NEXT: br %r14
%i = load i8, i8* %Src
%ZE = zext i8 %i to i64
%Val = mul i64 %ZE, 72340172838076673
store i64 %Val, i64* %Dst
ret void
}
; A second, truncated store of the same value.
define void @fun_8x1b_4x1b(i8* %Src, i64* %Dst, i32* %Dst2) {
; CHECK-LABEL: fun_8x1b_4x1b:
; CHECK: # %bb.0:
; CHECK-NEXT: vlrepb %v0, 0(%r2)
; CHECK-NEXT: vsteg %v0, 0(%r3), 0
; CHECK-NEXT: vstef %v0, 0(%r4), 0
; CHECK-NEXT: br %r14
%i = load i8, i8* %Src
%ZE = zext i8 %i to i64
%Val = mul i64 %ZE, 72340172838076673
store i64 %Val, i64* %Dst
%TrVal = trunc i64 %Val to i32
store i32 %TrVal, i32* %Dst2
ret void
}
define void @fun_2x2b(i16* %Src, i32* %Dst) {
; CHECK-LABEL: fun_2x2b:
; CHECK: # %bb.0:
; CHECK-NEXT: vlreph %v0, 0(%r2)
; CHECK-NEXT: vstef %v0, 0(%r3), 0
; CHECK-NEXT: br %r14
%i = load i16, i16* %Src
%ZE = zext i16 %i to i32
%Val = mul i32 %ZE, 65537
store i32 %Val, i32* %Dst
ret void
}
define void @fun_4x2b(i16* %Src, i64* %Dst) {
; CHECK-LABEL: fun_4x2b:
; CHECK: # %bb.0:
; CHECK-NEXT: vlreph %v0, 0(%r2)
; CHECK-NEXT: vsteg %v0, 0(%r3), 0
; CHECK-NEXT: br %r14
%i = load i16, i16* %Src
%ZE = zext i16 %i to i64
%Val = mul i64 %ZE, 281479271743489
store i64 %Val, i64* %Dst
ret void
}
define void @fun_2x4b(i32* %Src, i64* %Dst) {
; CHECK-LABEL: fun_2x4b:
; CHECK: # %bb.0:
; CHECK-NEXT: vlrepf %v0, 0(%r2)
; CHECK-NEXT: vsteg %v0, 0(%r3), 0
; CHECK-NEXT: br %r14
%i = load i32, i32* %Src
%ZE = zext i32 %i to i64
%Val = mul i64 %ZE, 4294967297
store i64 %Val, i64* %Dst
ret void
}
;; Replicated registers already in a vector.
; Test multiple stores of the same value.
define void @fun_2Eltsx8x1b(i8* %Src, <2 x i64>* %Dst, <2 x i64>* %Dst2) {
; CHECK-LABEL: fun_2Eltsx8x1b:
; CHECK: # %bb.0:
; CHECK-NEXT: vlrepb %v0, 0(%r2)
; CHECK-NEXT: vst %v0, 0(%r3), 3
; CHECK-NEXT: vst %v0, 0(%r4), 3
; CHECK-NEXT: br %r14
%i = load i8, i8* %Src
%ZE = zext i8 %i to i64
%Mul = mul i64 %ZE, 72340172838076673
%tmp = insertelement <2 x i64> undef, i64 %Mul, i32 0
%Val = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer
store <2 x i64> %Val, <2 x i64>* %Dst
store <2 x i64> %Val, <2 x i64>* %Dst2
ret void
}
define void @fun_4Eltsx2x2b(i16* %Src, <4 x i32>* %Dst) {
; CHECK-LABEL: fun_4Eltsx2x2b:
; CHECK: # %bb.0:
; CHECK-NEXT: vlreph %v0, 0(%r2)
; CHECK-NEXT: vst %v0, 0(%r3), 3
; CHECK-NEXT: br %r14
%i = load i16, i16* %Src
%ZE = zext i16 %i to i32
%Mul = mul i32 %ZE, 65537
%tmp = insertelement <4 x i32> undef, i32 %Mul, i32 0
%Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer
store <4 x i32> %Val, <4 x i32>* %Dst
ret void
}
define void @fun_6Eltsx2x2b(i16* %Src, <6 x i32>* %Dst) {
; CHECK-LABEL: fun_6Eltsx2x2b:
; CHECK: # %bb.0:
; CHECK-NEXT: vlreph %v0, 0(%r2)
; CHECK-NEXT: vsteg %v0, 16(%r3), 0
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
%i = load i16, i16* %Src
%ZE = zext i16 %i to i32
%Mul = mul i32 %ZE, 65537
%tmp = insertelement <6 x i32> undef, i32 %Mul, i32 0
%Val = shufflevector <6 x i32> %tmp, <6 x i32> undef, <6 x i32> zeroinitializer
store <6 x i32> %Val, <6 x i32>* %Dst
ret void
}
define void @fun_2Eltsx2x4b(i32* %Src, <2 x i64>* %Dst) {
; CHECK-LABEL: fun_2Eltsx2x4b:
; CHECK: # %bb.0:
; CHECK-NEXT: vlrepf %v0, 0(%r2)
; CHECK-NEXT: vst %v0, 0(%r3), 3
; CHECK-NEXT: br %r14
%i = load i32, i32* %Src
%ZE = zext i32 %i to i64
%Mul = mul i64 %ZE, 4294967297
%tmp = insertelement <2 x i64> undef, i64 %Mul, i32 0
%Val = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer
store <2 x i64> %Val, <2 x i64>* %Dst
ret void
}
define void @fun_5Eltsx2x4b(i32* %Src, <5 x i64>* %Dst) {
; CHECK-LABEL: fun_5Eltsx2x4b:
; CHECK: # %bb.0:
; CHECK-NEXT: vlrepf %v0, 0(%r2)
; CHECK-NEXT: vsteg %v0, 32(%r3), 0
; CHECK-NEXT: vst %v0, 16(%r3), 4
; CHECK-NEXT: vst %v0, 0(%r3), 4
; CHECK-NEXT: br %r14
%i = load i32, i32* %Src
%ZE = zext i32 %i to i64
%Mul = mul i64 %ZE, 4294967297
%tmp = insertelement <5 x i64> undef, i64 %Mul, i32 0
%Val = shufflevector <5 x i64> %tmp, <5 x i64> undef, <5 x i32> zeroinitializer
store <5 x i64> %Val, <5 x i64>* %Dst
ret void
}
; Test replicating an incoming argument.
define void @fun_8x1b_arg(i8 %Arg, i64* %Dst) {
; CHECK-LABEL: fun_8x1b_arg:
; CHECK: # %bb.0:
; CHECK-NEXT: vlvgp %v0, %r2, %r2
; CHECK-NEXT: vrepb %v0, %v0, 7
; CHECK-NEXT: vsteg %v0, 0(%r3), 0
; CHECK-NEXT: br %r14
%ZE = zext i8 %Arg to i64
%Val = mul i64 %ZE, 72340172838076673
store i64 %Val, i64* %Dst
ret void
}
; A replication of a non-local value (ISD::AssertZext case).
define void @fun_nonlocalval() {
; CHECK-LABEL: fun_nonlocalval:
; CHECK: # %bb.0:
; CHECK-NEXT: lhi %r0, 0
; CHECK-NEXT: ciblh %r0, 0, 0(%r14)
; CHECK-NEXT: .LBB13_1: # %bb2
; CHECK-NEXT: llgf %r0, 0(%r1)
; CHECK-NEXT: vlvgp %v0, %r0, %r0
; CHECK-NEXT: vrepf %v0, %v0, 1
; CHECK-NEXT: vst %v0, 0(%r1), 3
; CHECK-NEXT: br %r14
%i = load i32, i32* undef, align 4
br i1 undef, label %bb2, label %bb7
bb2: ; preds = %bb1
%i3 = zext i32 %i to i64
%i4 = mul nuw i64 %i3, 4294967297
%i5 = insertelement <2 x i64> poison, i64 %i4, i64 0
%i6 = shufflevector <2 x i64> %i5, <2 x i64> poison, <2 x i32> zeroinitializer
store <2 x i64> %i6, <2 x i64>* undef, align 8
ret void
bb7:
ret void
}
;; Replicated immediates
; Some cases where a scalar instruction is better.
define void @fun_8x1i_zero(i64* %Dst) {
; CHECK-LABEL: fun_8x1i_zero:
; CHECK: # %bb.0:
; CHECK-NEXT: mvghi 0(%r2), 0
; CHECK-NEXT: br %r14
store i64 0, i64* %Dst
ret void
}
define void @fun_4x1i_minus1(i32* %Dst) {
; CHECK-LABEL: fun_4x1i_minus1:
; CHECK: # %bb.0:
; CHECK-NEXT: mvhi 0(%r2), -1
; CHECK-NEXT: br %r14
store i32 -1, i32* %Dst
ret void
}
define void @fun_4x1i_allones(i32* %Dst) {
; CHECK-LABEL: fun_4x1i_allones:
; CHECK: # %bb.0:
; CHECK-NEXT: mvhi 0(%r2), -1
; CHECK-NEXT: br %r14
store i32 4294967295, i32* %Dst
ret void
}
define void @fun_2i(i16* %Dst) {
; CHECK-LABEL: fun_2i:
; CHECK: # %bb.0:
; CHECK-NEXT: mvhhi 0(%r2), 1
; CHECK-NEXT: br %r14
store i16 1, i16* %Dst
ret void
}
define void @fun_2x2i(i32* %Dst) {
; CHECK-LABEL: fun_2x2i:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepih %v0, 1
; CHECK-NEXT: vstef %v0, 0(%r2), 0
; CHECK-NEXT: br %r14
store i32 65537, i32* %Dst
ret void
}
define void @fun_4x2i(i64* %Dst) {
; CHECK-LABEL: fun_4x2i:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepih %v0, 1
; CHECK-NEXT: vsteg %v0, 0(%r2), 0
; CHECK-NEXT: br %r14
store i64 281479271743489, i64* %Dst
ret void
}
define void @fun_2x4i(i64* %Dst) {
; CHECK-LABEL: fun_2x4i:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepif %v0, 1
; CHECK-NEXT: vsteg %v0, 0(%r2), 0
; CHECK-NEXT: br %r14
store i64 4294967297, i64* %Dst
ret void
}
; Store a replicated immediate twice using the same vector.
define void @fun_4x1i(i32* %Dst, i32* %Dst2) {
; CHECK-LABEL: fun_4x1i:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepib %v0, 3
; CHECK-NEXT: vstef %v0, 0(%r2), 0
; CHECK-NEXT: vstef %v0, 0(%r3), 0
; CHECK-NEXT: br %r14
store i32 50529027, i32* %Dst
store i32 50529027, i32* %Dst2
ret void
}
define void @fun_8x1i(i64* %Dst, i64* %Dst2) {
; CHECK-LABEL: fun_8x1i:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepib %v0, 1
; CHECK-NEXT: vsteg %v0, 0(%r2), 0
; CHECK-NEXT: vsteg %v0, 0(%r3), 0
; CHECK-NEXT: br %r14
store i64 72340172838076673, i64* %Dst
store i64 72340172838076673, i64* %Dst2
ret void
}
; Similar, but with vectors.
define void @fun_4Eltsx4x1i_2Eltsx4x1i(<4 x i32>* %Dst, <2 x i32>* %Dst2) {
; CHECK-LABEL: fun_4Eltsx4x1i_2Eltsx4x1i:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepib %v0, 3
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: vsteg %v0, 0(%r3), 0
; CHECK-NEXT: br %r14
%tmp = insertelement <4 x i32> undef, i32 50529027, i32 0
%Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer
store <4 x i32> %Val, <4 x i32>* %Dst
%tmp2 = insertelement <2 x i32> undef, i32 50529027, i32 0
%Val2 = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer
store <2 x i32> %Val2, <2 x i32>* %Dst2
ret void
}
; Same, but the 64-bit store is scalar.
define void @fun_4Eltsx4x1i_8x1i(<4 x i32>* %Dst, i64* %Dst2) {
; CHECK-LABEL: fun_4Eltsx4x1i_8x1i:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepib %v0, 3
; CHECK-NEXT: vst %v0, 0(%r2), 3
; CHECK-NEXT: vsteg %v0, 0(%r3), 0
; CHECK-NEXT: br %r14
%tmp = insertelement <4 x i32> undef, i32 50529027, i32 0
%Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer
store <4 x i32> %Val, <4 x i32>* %Dst
store i64 217020518514230019, i64* %Dst2
ret void
}
define void @fun_3Eltsx2x4i(<3 x i64>* %Dst) {
; CHECK-LABEL: fun_3Eltsx2x4i:
; CHECK: # %bb.0:
; CHECK-NEXT: vrepif %v0, 1
; CHECK-NEXT: vsteg %v0, 16(%r2), 0
; CHECK-NEXT: vst %v0, 0(%r2), 4
; CHECK-NEXT: br %r14
%tmp = insertelement <3 x i64> undef, i64 4294967297, i32 0
%Val = shufflevector <3 x i64> %tmp, <3 x i64> undef, <3 x i32> zeroinitializer
store <3 x i64> %Val, <3 x i64>* %Dst
ret void
}
; i128 of replicated 0x01 bytes ((2^128 - 1) / 255): not using vrepib, but
; should compile.
define void @fun_16x1i(i128* %Dst) {
; CHECK-LABEL: fun_16x1i:
; CHECK: # %bb.0:
; CHECK-NEXT: llihf %r0, 16843009
; CHECK-NEXT: oilf %r0, 16843009
; CHECK-NEXT: stg %r0, 8(%r2)
; CHECK-NEXT: stg %r0, 0(%r2)
; CHECK-NEXT: br %r14
store i128 1334440654591915542993625911497130241, i128* %Dst
ret void
}