From eaa78035c6a59b0607878f463ad7b7b7444f2c60 Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Tue, 8 Mar 2022 15:51:43 -0500 Subject: [PATCH] [SystemZ] Patchset for expanding memcpy/memset using at most two stores. * Set MaxStoresPerMemcpy and MaxStoresPerMemset to 2. * Optimize stores of replicated values in SystemZ::combineSTORE(). This handles the now-expanded memory operations as well as some other pre-existing cases. * Reject a big displacement in isLegalAddressingMode() for a vector type. * Return true from shouldConsiderGEPOffsetSplit(). Reviewed By: Ulrich Weigand Differential Revision: https://reviews.llvm.org/D122105 --- llvm/include/llvm/CodeGen/TargetLowering.h | 7 +- .../Target/SystemZ/SystemZISelLowering.cpp | 137 +++++- llvm/lib/Target/SystemZ/SystemZISelLowering.h | 15 +- .../SystemZ/codegenprepare-gepoffs-split.ll | 24 + llvm/test/CodeGen/SystemZ/dag-combine-06.ll | 29 ++ llvm/test/CodeGen/SystemZ/memcpy-03.ll | 217 +++++++++ llvm/test/CodeGen/SystemZ/memset-08.ll | 420 ++++++++++++++++++ .../CodeGen/SystemZ/store-replicated-vals.ll | 380 ++++++++++++++++ 8 files changed, 1215 insertions(+), 14 deletions(-) create mode 100644 llvm/test/CodeGen/SystemZ/codegenprepare-gepoffs-split.ll create mode 100644 llvm/test/CodeGen/SystemZ/dag-combine-06.ll create mode 100644 llvm/test/CodeGen/SystemZ/memcpy-03.ll create mode 100644 llvm/test/CodeGen/SystemZ/memset-08.ll create mode 100644 llvm/test/CodeGen/SystemZ/store-replicated-vals.ll diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 06f3cee7a3fd..3c8dd39fd6b9 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3486,9 +3486,10 @@ public: /// Return true if the number of memory ops is below the threshold (Limit). /// It returns the types of the sequence of memory ops to perform /// memset / memcpy by reference. - bool findOptimalMemOpLowering(std::vector<EVT> &MemOps, unsigned Limit, - const MemOp &Op, unsigned DstAS, unsigned SrcAS, - const AttributeList &FuncAttributes) const; + virtual bool + findOptimalMemOpLowering(std::vector<EVT> &MemOps, unsigned Limit, + const MemOp &Op, unsigned DstAS, unsigned SrcAS, + const AttributeList &FuncAttributes) const; /// Check to see if the specified operand of the specified instruction is a /// constant integer. If so, check to see if there are any bits set in the diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 364b634db5ca..8f21ef916d3f 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -669,7 +669,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); // We want to use MVC in preference to even a single load/store pair. - MaxStoresPerMemcpy = 0; + MaxStoresPerMemcpy = Subtarget.hasVector() ? 2 : 0; MaxStoresPerMemcpyOptSize = 0; // The main memset sequence is a byte store followed by an MVC. @@ -677,7 +677,7 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, // generated by target-independent code don't when the byte value is // variable. E.g. "STC <reg>;MHI <reg>,257;STH <reg>" is not better // than "STC;MVC". Handle the choice in target-specific code instead. - MaxStoresPerMemset = 0; + MaxStoresPerMemset = Subtarget.hasVector() ?
2 : 0; MaxStoresPerMemsetOptSize = 0; // Default to having -disable-strictnode-mutation on @@ -793,14 +793,17 @@ bool SystemZVectorConstantInfo::isVectorConstantLegal( return tryValue(SplatBitsZ | Middle); } -SystemZVectorConstantInfo::SystemZVectorConstantInfo(APFloat FPImm) { - IntBits = FPImm.bitcastToAPInt().zextOrSelf(128); - isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad()); - SplatBits = FPImm.bitcastToAPInt(); - unsigned Width = SplatBits.getBitWidth(); - IntBits <<= (SystemZ::VectorBits - Width); +SystemZVectorConstantInfo::SystemZVectorConstantInfo(APInt IntImm) { + if (IntImm.isSingleWord()) { + IntBits = APInt(128, IntImm.getZExtValue()); + IntBits <<= (SystemZ::VectorBits - IntImm.getBitWidth()); + } else + IntBits = IntImm; + assert(IntBits.getBitWidth() == 128 && "Unsupported APInt."); // Find the smallest splat. + SplatBits = IntImm; + unsigned Width = SplatBits.getBitWidth(); while (Width > 8) { unsigned HalfSize = Width / 2; APInt HighValue = SplatBits.lshr(HalfSize).trunc(HalfSize); @@ -976,7 +979,8 @@ bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL, if (!isInt<20>(AM.BaseOffs)) return false; - AddressingMode SupportedAM(true, true); + bool RequireD12 = Subtarget.hasVector() && Ty->isVectorTy(); + AddressingMode SupportedAM(!RequireD12, true); if (I != nullptr) SupportedAM = supportedAddressingMode(I, Subtarget.hasVector()); @@ -991,6 +995,28 @@ bool SystemZTargetLowering::isLegalAddressingMode(const DataLayout &DL, return AM.Scale == 0 || AM.Scale == 1; } +bool SystemZTargetLowering::findOptimalMemOpLowering( + std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS, + unsigned SrcAS, const AttributeList &FuncAttributes) const { + const int MVCFastLen = 16; + + // Don't expand Op into scalar loads/stores in these cases: + if (Op.isMemcpy() && Op.allowOverlap() && Op.size() <= MVCFastLen) + return false; // Small memcpy: Use MVC + if (Op.isMemset() && Op.size() - 1 <= MVCFastLen) + return false; // Small memset (first byte with STC/MVI): Use MVC + if (Op.isZeroMemset()) + return false; // Memset zero: Use XC + + return TargetLowering::findOptimalMemOpLowering(MemOps, Limit, Op, DstAS, + SrcAS, FuncAttributes); } + +EVT SystemZTargetLowering::getOptimalMemOpType(const MemOp &Op, + const AttributeList &FuncAttributes) const { + return Subtarget.hasVector() ? MVT::v2i64 : MVT::Other; +} + bool SystemZTargetLowering::isTruncateFree(Type *FromType, Type *ToType) const { if (!FromType->isIntegerTy() || !ToType->isIntegerTy()) return false; @@ -6329,6 +6355,23 @@ static bool isVectorElementSwap(ArrayRef<int> M, EVT VT) { return true; } +static bool isOnlyUsedByStores(SDValue StoredVal, SelectionDAG &DAG) { + for (auto *U : StoredVal->uses()) { + if (StoreSDNode *ST = dyn_cast<StoreSDNode>(U)) { + EVT CurrMemVT = ST->getMemoryVT().getScalarType(); + if (CurrMemVT.isRound() && CurrMemVT.getStoreSize() <= 16) + continue; + } else if (isa<BuildVectorSDNode>(U)) { + SDValue BuildVector = SDValue(U, 0); + if (DAG.isSplatValue(BuildVector, true/*AllowUndefs*/) && + isOnlyUsedByStores(BuildVector, DAG)) + continue; + } + return false; + } + return true; +} + SDValue SystemZTargetLowering::combineSTORE( SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -6387,6 +6430,82 @@ SDValue SystemZTargetLowering::combineSTORE( } } + // Replicate a reg or immediate with VREP instead of scalar multiply or + // immediate load.
It seems best to do this during the first DAGCombine as + // it is straightforward to handle the zero-extend node in the initial + // DAG, and also not have to worry about keeping the new MemVT legal (e.g. when + // extracting an i16 element from a v16i8 vector). + if (Subtarget.hasVector() && DCI.Level == BeforeLegalizeTypes && + isOnlyUsedByStores(Op1, DAG)) { + SDValue Word = SDValue(); + EVT WordVT; + + // Find a replicated immediate and return it if found in Word and its + // type in WordVT. + auto FindReplicatedImm = [&](ConstantSDNode *C, unsigned TotBytes) { + // Some constants are better handled with a scalar store. + if (C->getAPIntValue().getBitWidth() > 64 || C->isAllOnes() || + isInt<16>(C->getSExtValue()) || MemVT.getStoreSize() <= 2) + return; + SystemZVectorConstantInfo VCI(APInt(TotBytes * 8, C->getZExtValue())); + if (VCI.isVectorConstantLegal(Subtarget) && + VCI.Opcode == SystemZISD::REPLICATE) { + Word = DAG.getConstant(VCI.OpVals[0], SDLoc(SN), MVT::i32); + WordVT = VCI.VecVT.getScalarType(); + } + }; + + // Find a replicated register and return it if found in Word and its type + // in WordVT. + auto FindReplicatedReg = [&](SDValue MulOp) { + EVT MulVT = MulOp.getValueType(); + if (MulOp->getOpcode() == ISD::MUL && + (MulVT == MVT::i16 || MulVT == MVT::i32 || MulVT == MVT::i64)) { + // Find a zero extended value and its type. + SDValue LHS = MulOp->getOperand(0); + if (LHS->getOpcode() == ISD::ZERO_EXTEND) + WordVT = LHS->getOperand(0).getValueType(); + else if (LHS->getOpcode() == ISD::AssertZext) + WordVT = cast<VTSDNode>(LHS->getOperand(1))->getVT(); + else + return; + // Find a replicating constant, e.g. 0x00010001. + if (auto *C = dyn_cast<ConstantSDNode>(MulOp->getOperand(1))) { + SystemZVectorConstantInfo VCI( + APInt(MulVT.getSizeInBits(), C->getZExtValue())); + if (VCI.isVectorConstantLegal(Subtarget) && + VCI.Opcode == SystemZISD::REPLICATE && VCI.OpVals[0] == 1 && + WordVT == VCI.VecVT.getScalarType()) + Word = DAG.getZExtOrTrunc(LHS->getOperand(0), SDLoc(SN), WordVT); + } + } + }; + + if (isa<BuildVectorSDNode>(Op1) && + DAG.isSplatValue(Op1, true/*AllowUndefs*/)) { + SDValue SplatVal = Op1->getOperand(0); + if (auto *C = dyn_cast<ConstantSDNode>(SplatVal)) + FindReplicatedImm(C, SplatVal.getValueType().getStoreSize()); + else + FindReplicatedReg(SplatVal); + } else { + if (auto *C = dyn_cast<ConstantSDNode>(Op1)) + FindReplicatedImm(C, MemVT.getStoreSize()); + else + FindReplicatedReg(Op1); + } + + if (Word != SDValue()) { + assert(MemVT.getSizeInBits() % WordVT.getSizeInBits() == 0 && + "Bad type handling"); + unsigned NumElts = MemVT.getSizeInBits() / WordVT.getSizeInBits(); + EVT SplatVT = EVT::getVectorVT(*DAG.getContext(), WordVT, NumElts); + SDValue SplatVal = DAG.getSplatVector(SplatVT, SDLoc(SN), Word); + return DAG.getStore(SN->getChain(), SDLoc(SN), SplatVal, + SN->getBasePtr(), SN->getMemOperand()); + } + } + return SDValue(); } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index aeb0419c8bb2..16a0a95ffd3f 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -457,6 +457,12 @@ public: bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, Align Alignment, MachineMemOperand::Flags Flags, bool *Fast) const override; + bool + findOptimalMemOpLowering(std::vector<EVT> &MemOps, unsigned Limit, + const MemOp &Op, unsigned DstAS, unsigned SrcAS, + const AttributeList &FuncAttributes) const override; + EVT getOptimalMemOpType(const MemOp &Op, + const AttributeList &FuncAttributes) const override; bool
isTruncateFree(Type *, Type *) const override; bool isTruncateFree(EVT, EVT) const override; @@ -467,6 +473,8 @@ public: return VT == MVT::i32 || VT == MVT::i64; } + bool shouldConsiderGEPOffsetSplit() const override { return true; } + const char *getTargetNodeName(unsigned Opcode) const override; std::pair<unsigned, const TargetRegisterClass *> getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, @@ -767,12 +775,15 @@ private: APInt SplatUndef; // Bits corresponding to undef operands of the BVN. unsigned SplatBitSize = 0; bool isFP128 = false; - public: unsigned Opcode = 0; SmallVector<unsigned, 2> OpVals; MVT VecVT; - SystemZVectorConstantInfo(APFloat FPImm); + SystemZVectorConstantInfo(APInt IntImm); + SystemZVectorConstantInfo(APFloat FPImm) + : SystemZVectorConstantInfo(FPImm.bitcastToAPInt()) { + isFP128 = (&FPImm.getSemantics() == &APFloat::IEEEquad()); + } SystemZVectorConstantInfo(BuildVectorSDNode *BVN); bool isVectorConstantLegal(const SystemZSubtarget &Subtarget); }; diff --git a/llvm/test/CodeGen/SystemZ/codegenprepare-gepoffs-split.ll b/llvm/test/CodeGen/SystemZ/codegenprepare-gepoffs-split.ll new file mode 100644 index 000000000000..54108e06df4c --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/codegenprepare-gepoffs-split.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s +; +; Test that the big offsets are handled by only one AGFI. + +define void @fun(i64* %Src, i64* %Dst) { +; CHECK-LABEL: fun: +; CHECK: # %bb.0: +; CHECK-NEXT: agfi %r2, 1048576 +; CHECK-NEXT: lg %r0, 0(%r2) +; CHECK-NEXT: stg %r0, 0(%r3) +; CHECK-NEXT: lg %r0, 8(%r2) +; CHECK-NEXT: stg %r0, 0(%r3) +; CHECK-NEXT: br %r14 + %S0 = getelementptr i64, i64* %Src, i64 131072 + %V0 = load i64, i64* %S0 + store volatile i64 %V0, i64* %Dst + + %S1 = getelementptr i64, i64* %Src, i64 131073 + %V1 = load i64, i64* %S1 + store volatile i64 %V1, i64* %Dst + + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/dag-combine-06.ll b/llvm/test/CodeGen/SystemZ/dag-combine-06.ll new file mode 100644 index 000000000000..beecbde0a9ad --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/dag-combine-06.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s +; +; Test that DAGCombiner does not change the addressing as the displacements +; are known to be out of range. Only one addition is needed.
+ +define void @fun(<2 x i64>* %Src, <2 x i64>* %Dst) { +; CHECK-LABEL: fun: +; CHECK: # %bb.0: +; CHECK-NEXT: aghi %r2, 4096 +; CHECK-NEXT: vl %v0, 0(%r2), 3 +; CHECK-NEXT: vst %v0, 0(%r3), 3 +; CHECK-NEXT: vl %v0, 16(%r2), 3 +; CHECK-NEXT: vst %v0, 0(%r3), 3 +; CHECK-NEXT: br %r14 + %1 = bitcast <2 x i64>* %Src to i8* + + %splitgep = getelementptr i8, i8* %1, i64 4096 + %2 = bitcast i8* %splitgep to <2 x i64>* + %V0 = load <2 x i64>, <2 x i64>* %2, align 8 + store volatile <2 x i64> %V0, <2 x i64>* %Dst, align 8 + + %3 = getelementptr i8, i8* %splitgep, i64 16 + %4 = bitcast i8* %3 to <2 x i64>* + %V1 = load <2 x i64>, <2 x i64>* %4, align 8 + store volatile <2 x i64> %V1, <2 x i64>* %Dst, align 8 + + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/memcpy-03.ll b/llvm/test/CodeGen/SystemZ/memcpy-03.ll new file mode 100644 index 000000000000..b478d14ed0eb --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/memcpy-03.ll @@ -0,0 +1,217 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mcpu=z15 < %s -mtriple=s390x-linux-gnu | FileCheck %s +; +; Test memcpys of small constant lengths that should not be done with MVC. + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8 *nocapture, i8 *nocapture, i64, i1) nounwind + +define void @fun16(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun16: +; CHECK: # %bb.0: +; CHECK-NEXT: mvc 0(16,%r3), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 16, i1 false) + ret void +} + +define void @fun17(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun17: +; CHECK: # %bb.0: +; CHECK-NEXT: lb %r0, 16(%r2) +; CHECK-NEXT: stc %r0, 16(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 17, i1 false) + ret void +} + +define void @fun18(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun18: +; CHECK: # %bb.0: +; CHECK-NEXT: lh %r0, 16(%r2) +; CHECK-NEXT: sth %r0, 16(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 18, i1 false) + ret void +} + +define void @fun19(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun19: +; CHECK: # %bb.0: +; CHECK-NEXT: l %r0, 15(%r2) +; CHECK-NEXT: st %r0, 15(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 19, i1 false) + ret void +} + +define void @fun20(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun20: +; CHECK: # %bb.0: +; CHECK-NEXT: l %r0, 16(%r2) +; CHECK-NEXT: st %r0, 16(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 20, i1 false) + ret void +} + +define void @fun21(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun21: +; CHECK: # %bb.0: +; CHECK-NEXT: lg %r0, 13(%r2) +; CHECK-NEXT: stg %r0, 13(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 21, i1 false) + ret void +} + +define void @fun22(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun22: +; CHECK: # %bb.0: +; CHECK-NEXT: lg %r0, 14(%r2) +; CHECK-NEXT: stg %r0, 14(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br 
%r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 22, i1 false) + ret void +} + +define void @fun23(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun23: +; CHECK: # %bb.0: +; CHECK-NEXT: lg %r0, 15(%r2) +; CHECK-NEXT: stg %r0, 15(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 23, i1 false) + ret void +} + +define void @fun24(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun24: +; CHECK: # %bb.0: +; CHECK-NEXT: lg %r0, 16(%r2) +; CHECK-NEXT: stg %r0, 16(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 24, i1 false) + ret void +} + +define void @fun25(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun25: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 9(%r2) +; CHECK-NEXT: vst %v0, 9(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 25, i1 false) + ret void +} + +define void @fun26(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun26: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 10(%r2) +; CHECK-NEXT: vst %v0, 10(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 26, i1 false) + ret void +} + +define void @fun27(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun27: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 11(%r2) +; CHECK-NEXT: vst %v0, 11(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 27, i1 false) + ret void +} + +define void @fun28(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun28: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 12(%r2) +; CHECK-NEXT: vst %v0, 12(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 28, i1 false) + ret void +} + +define void @fun29(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun29: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 13(%r2) +; CHECK-NEXT: vst %v0, 13(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 29, i1 false) + ret void +} + +define void @fun30(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun30: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 14(%r2) +; CHECK-NEXT: vst %v0, 14(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 30, i1 false) + ret void +} + +define void @fun31(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun31: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 15(%r2) +; CHECK-NEXT: vst %v0, 15(%r3) +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 31, i1 false) + ret void +} + +define void @fun32(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun32: +; CHECK: # %bb.0: +; CHECK-NEXT: vl %v0, 16(%r2), 4 +; CHECK-NEXT: vst %v0, 16(%r3), 4 +; CHECK-NEXT: vl %v0, 0(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 
4 +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 32, i1 false) + ret void +} + +define void @fun33(i8* %Src, i8* %Dst, i8 %val) { +; CHECK-LABEL: fun33: +; CHECK: # %bb.0: +; CHECK-NEXT: mvc 0(33,%r3), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 %Dst, i8* align 16 %Src, i64 33, i1 false) + ret void +} + diff --git a/llvm/test/CodeGen/SystemZ/memset-08.ll b/llvm/test/CodeGen/SystemZ/memset-08.ll new file mode 100644 index 000000000000..8e2c503a24c8 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/memset-08.ll @@ -0,0 +1,420 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mcpu=z15 %s -mtriple=s390x-linux-gnu -o - | FileCheck %s +; +; Test memsets of small constant lengths, that should not be done with MVC. + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) + +define void @reg17(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg17: +; CHECK: # %bb.0: +; CHECK-NEXT: stc %r3, 0(%r2) +; CHECK-NEXT: mvc 1(16,%r2), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 17, i1 false) + ret void +} + +define void @reg18(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg18: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: vsteh %v0, 16(%r2), 0 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 18, i1 false) + ret void +} + +define void @reg19(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg19: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vstef %v0, 15(%r2), 0 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 19, i1 false) + ret void +} + +define void @reg20(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg20: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vstef %v0, 16(%r2), 0 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 20, i1 false) + ret void +} + +define void @reg21(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg21: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vsteg %v0, 13(%r2), 0 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 21, i1 false) + ret void +} + +define void @reg22(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg22: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vsteg %v0, 14(%r2), 0 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 22, i1 false) + ret void +} + +define void @reg23(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg23: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vsteg %v0, 15(%r2), 0 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 23, i1 false) + ret void +} + +define void @reg24(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg24: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vsteg %v0, 16(%r2), 0 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 
%val, i64 24, i1 false) + ret void +} + +define void @reg25(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg25: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vst %v0, 9(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 25, i1 false) + ret void +} + +define void @reg26(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg26: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vst %v0, 10(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 26, i1 false) + ret void +} + +define void @reg27(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg27: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vst %v0, 11(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 27, i1 false) + ret void +} + +define void @reg28(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg28: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vst %v0, 12(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 28, i1 false) + ret void +} + +define void @reg29(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg29: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vst %v0, 13(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 29, i1 false) + ret void +} + +define void @reg30(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg30: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vst %v0, 14(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 30, i1 false) + ret void +} + +define void @reg31(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg31: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vst %v0, 15(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 31, i1 false) + ret void +} + +define void @reg32(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg32: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r3, %r3 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vst %v0, 16(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 32, i1 false) + ret void +} + +define void @reg33(i8* %Dst, i8 %val) { +; CHECK-LABEL: reg33: +; CHECK: # %bb.0: +; CHECK-NEXT: stc %r3, 0(%r2) +; CHECK-NEXT: mvc 1(32,%r2), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 %val, i64 33, i1 false) + ret void +} + +;; Immediate value + +define void @imm17(i8* %Dst) { +; CHECK-LABEL: imm17: +; CHECK: # %bb.0: +; CHECK-NEXT: mvi 0(%r2), 1 +; CHECK-NEXT: mvc 1(16,%r2), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 17, i1 false) + ret void +} + +define void @imm18(i8* %Dst) { +; CHECK-LABEL: imm18: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: mvhhi 16(%r2), -1 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 18, i1 false) + 
ret void +} + +define void @zero18(i8* %Dst) { +; CHECK-LABEL: zero18: +; CHECK: # %bb.0: +; CHECK-NEXT: xc 0(18,%r2), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 0, i64 18, i1 false) + ret void +} + +define void @imm19(i8* %Dst) { +; CHECK-LABEL: imm19: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vstef %v0, 15(%r2), 0 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 19, i1 false) + ret void +} + +define void @imm20(i8* %Dst) { +; CHECK-LABEL: imm20: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: mvhi 16(%r2), -1 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 20, i1 false) + ret void +} + +define void @imm21(i8* %Dst) { +; CHECK-LABEL: imm21: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vsteg %v0, 13(%r2), 0 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 21, i1 false) + ret void +} + +define void @imm22(i8* %Dst) { +; CHECK-LABEL: imm22: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: mvghi 14(%r2), -1 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 22, i1 false) + ret void +} + +define void @imm23(i8* %Dst) { +; CHECK-LABEL: imm23: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vsteg %v0, 15(%r2), 0 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 23, i1 false) + ret void +} + +define void @imm24(i8* %Dst) { +; CHECK-LABEL: imm24: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: mvghi 16(%r2), -1 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 24, i1 false) + ret void +} + +define void @imm25(i8* %Dst) { +; CHECK-LABEL: imm25: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vst %v0, 9(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 25, i1 false) + ret void +} + +define void @imm26(i8* %Dst) { +; CHECK-LABEL: imm26: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: vst %v0, 10(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 26, i1 false) + ret void +} + +define void @zero26(i8* %Dst) { +; CHECK-LABEL: zero26: +; CHECK: # %bb.0: +; CHECK-NEXT: xc 0(26,%r2), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 0, i64 26, i1 false) + ret void +} + +define void @imm27(i8* %Dst) { +; CHECK-LABEL: imm27: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vst %v0, 11(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 27, i1 false) + ret void +} + +define void @imm28(i8* %Dst) { +; CHECK-LABEL: imm28: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: vst %v0, 12(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 28, i1 false) + ret void +} + +define void @imm29(i8* %Dst) { +; CHECK-LABEL: imm29: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vst %v0, 13(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void 
@llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 29, i1 false) + ret void +} + +define void @imm30(i8* %Dst) { +; CHECK-LABEL: imm30: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: vst %v0, 14(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 30, i1 false) + ret void +} + +define void @imm31(i8* %Dst) { +; CHECK-LABEL: imm31: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vst %v0, 15(%r2) +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 31, i1 false) + ret void +} + +define void @imm32(i8* %Dst) { +; CHECK-LABEL: imm32: +; CHECK: # %bb.0: +; CHECK-NEXT: vgbm %v0, 65535 +; CHECK-NEXT: vst %v0, 16(%r2), 4 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 -1, i64 32, i1 false) + ret void +} + +define void @zero32(i8* %Dst) { +; CHECK-LABEL: zero32: +; CHECK: # %bb.0: +; CHECK-NEXT: xc 0(32,%r2), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 0, i64 32, i1 false) + ret void +} + +define void @imm33(i8* %Dst) { +; CHECK-LABEL: imm33: +; CHECK: # %bb.0: +; CHECK-NEXT: mvi 0(%r2), 1 +; CHECK-NEXT: mvc 1(32,%r2), 0(%r2) +; CHECK-NEXT: br %r14 + call void @llvm.memset.p0i8.i64(i8* align 16 %Dst, i8 1, i64 33, i1 false) + ret void +} diff --git a/llvm/test/CodeGen/SystemZ/store-replicated-vals.ll b/llvm/test/CodeGen/SystemZ/store-replicated-vals.ll new file mode 100644 index 000000000000..c18806a43338 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/store-replicated-vals.ll @@ -0,0 +1,380 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s +; +; Test storing of replicated values using vector replicate type instructions. + +;; Replicated registers + +define void @fun_2x1b(i8* %Src, i16* %Dst) { +; CHECK-LABEL: fun_2x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vsteh %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i16 + %Val = mul i16 %ZE, 257 + store i16 %Val, i16* %Dst + ret void +} + +; Test multiple stores of same value. +define void @fun_4x1b(i8* %Src, i32* %Dst, i32* %Dst2) { +; CHECK-LABEL: fun_4x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vstef %v0, 0(%r3), 0 +; CHECK-NEXT: vstef %v0, 0(%r4), 0 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i32 + %Val = mul i32 %ZE, 16843009 + store i32 %Val, i32* %Dst + store i32 %Val, i32* %Dst2 + ret void +} + +define void @fun_8x1b(i8* %Src, i64* %Dst) { +; CHECK-LABEL: fun_8x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i64 + %Val = mul i64 %ZE, 72340172838076673 + store i64 %Val, i64* %Dst + ret void +} + +; A second truncated store of same value. 
+define void @fun_8x1b_4x1b(i8* %Src, i64* %Dst, i32* %Dst2) { +; CHECK-LABEL: fun_8x1b_4x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: vstef %v0, 0(%r4), 0 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i64 + %Val = mul i64 %ZE, 72340172838076673 + store i64 %Val, i64* %Dst + %TrVal = trunc i64 %Val to i32 + store i32 %TrVal, i32* %Dst2 + ret void +} + +define void @fun_2x2b(i16* %Src, i32* %Dst) { +; CHECK-LABEL: fun_2x2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vstef %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i16, i16* %Src + %ZE = zext i16 %i to i32 + %Val = mul i32 %ZE, 65537 + store i32 %Val, i32* %Dst + ret void +} + +define void @fun_4x2b(i16* %Src, i64* %Dst) { +; CHECK-LABEL: fun_4x2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i16, i16* %Src + %ZE = zext i16 %i to i64 + %Val = mul i64 %ZE, 281479271743489 + store i64 %Val, i64* %Dst + ret void +} + +define void @fun_2x4b(i32* %Src, i64* %Dst) { +; CHECK-LABEL: fun_2x4b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepf %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %i = load i32, i32* %Src + %ZE = zext i32 %i to i64 + %Val = mul i64 %ZE, 4294967297 + store i64 %Val, i64* %Dst + ret void +} + +;; Replicated registers already in a vector. + +; Test multiple stores of same value. +define void @fun_2Eltsx8x1b(i8* %Src, <2 x i64>* %Dst, <2 x i64>* %Dst2) { +; CHECK-LABEL: fun_2Eltsx8x1b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepb %v0, 0(%r2) +; CHECK-NEXT: vst %v0, 0(%r3), 3 +; CHECK-NEXT: vst %v0, 0(%r4), 3 +; CHECK-NEXT: br %r14 + %i = load i8, i8* %Src + %ZE = zext i8 %i to i64 + %Mul = mul i64 %ZE, 72340172838076673 + %tmp = insertelement <2 x i64> undef, i64 %Mul, i32 0 + %Val = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer + store <2 x i64> %Val, <2 x i64>* %Dst + store <2 x i64> %Val, <2 x i64>* %Dst2 + ret void +} + +define void @fun_4Eltsx2x2b(i16* %Src, <4 x i32>* %Dst) { +; CHECK-LABEL: fun_4Eltsx2x2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vst %v0, 0(%r3), 3 +; CHECK-NEXT: br %r14 + %i = load i16, i16* %Src + %ZE = zext i16 %i to i32 + %Mul = mul i32 %ZE, 65537 + %tmp = insertelement <4 x i32> undef, i32 %Mul, i32 0 + %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer + store <4 x i32> %Val, <4 x i32>* %Dst + ret void +} + +define void @fun_6Eltsx2x2b(i16* %Src, <6 x i32>* %Dst) { +; CHECK-LABEL: fun_6Eltsx2x2b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlreph %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 16(%r3), 0 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + %i = load i16, i16* %Src + %ZE = zext i16 %i to i32 + %Mul = mul i32 %ZE, 65537 + %tmp = insertelement <6 x i32> undef, i32 %Mul, i32 0 + %Val = shufflevector <6 x i32> %tmp, <6 x i32> undef, <6 x i32> zeroinitializer + store <6 x i32> %Val, <6 x i32>* %Dst + ret void +} + +define void @fun_2Eltsx2x4b(i32* %Src, <2 x i64>* %Dst) { +; CHECK-LABEL: fun_2Eltsx2x4b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepf %v0, 0(%r2) +; CHECK-NEXT: vst %v0, 0(%r3), 3 +; CHECK-NEXT: br %r14 + %i = load i32, i32* %Src + %ZE = zext i32 %i to i64 + %Mul = mul i64 %ZE, 4294967297 + %tmp = insertelement <2 x i64> undef, i64 %Mul, i32 0 + %Val = shufflevector <2 x i64> %tmp, <2 x i64> undef, <2 x i32> zeroinitializer + store <2 x i64> %Val, <2 x i64>* %Dst + ret void +} + +define void 
@fun_5Eltsx2x4b(i32* %Src, <5 x i64>* %Dst) { +; CHECK-LABEL: fun_5Eltsx2x4b: +; CHECK: # %bb.0: +; CHECK-NEXT: vlrepf %v0, 0(%r2) +; CHECK-NEXT: vsteg %v0, 32(%r3), 0 +; CHECK-NEXT: vst %v0, 16(%r3), 4 +; CHECK-NEXT: vst %v0, 0(%r3), 4 +; CHECK-NEXT: br %r14 + %i = load i32, i32* %Src + %ZE = zext i32 %i to i64 + %Mul = mul i64 %ZE, 4294967297 + %tmp = insertelement <5 x i64> undef, i64 %Mul, i32 0 + %Val = shufflevector <5 x i64> %tmp, <5 x i64> undef, <5 x i32> zeroinitializer + store <5 x i64> %Val, <5 x i64>* %Dst + ret void +} + +; Test replicating an incoming argument. +define void @fun_8x1b_arg(i8 %Arg, i64* %Dst) { +; CHECK-LABEL: fun_8x1b_arg: +; CHECK: # %bb.0: +; CHECK-NEXT: vlvgp %v0, %r2, %r2 +; CHECK-NEXT: vrepb %v0, %v0, 7 +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %ZE = zext i8 %Arg to i64 + %Val = mul i64 %ZE, 72340172838076673 + store i64 %Val, i64* %Dst + ret void +} + +; A replication of a non-local value (ISD::AssertZext case). +define void @fun_nonlocalval() { +; CHECK-LABEL: fun_nonlocalval: +; CHECK: # %bb.0: +; CHECK-NEXT: lhi %r0, 0 +; CHECK-NEXT: ciblh %r0, 0, 0(%r14) +; CHECK-NEXT: .LBB13_1: # %bb2 +; CHECK-NEXT: llgf %r0, 0(%r1) +; CHECK-NEXT: vlvgp %v0, %r0, %r0 +; CHECK-NEXT: vrepf %v0, %v0, 1 +; CHECK-NEXT: vst %v0, 0(%r1), 3 +; CHECK-NEXT: br %r14 + %i = load i32, i32* undef, align 4 + br i1 undef, label %bb2, label %bb7 + +bb2: ; preds = %bb1 + %i3 = zext i32 %i to i64 + %i4 = mul nuw i64 %i3, 4294967297 + %i5 = insertelement <2 x i64> poison, i64 %i4, i64 0 + %i6 = shufflevector <2 x i64> %i5, <2 x i64> poison, <2 x i32> zeroinitializer + store <2 x i64> %i6, <2 x i64>* undef, align 8 + ret void + +bb7: + ret void +} + +;; Replicated immediates + +; Some cases where scalar instruction is better +define void @fun_8x1i_zero(i64* %Dst) { +; CHECK-LABEL: fun_8x1i_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: mvghi 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i64 0, i64* %Dst + ret void +} + +define void @fun_4x1i_minus1(i32* %Dst) { +; CHECK-LABEL: fun_4x1i_minus1: +; CHECK: # %bb.0: +; CHECK-NEXT: mvhi 0(%r2), -1 +; CHECK-NEXT: br %r14 + store i32 -1, i32* %Dst + ret void +} + +define void @fun_4x1i_allones(i32* %Dst) { +; CHECK-LABEL: fun_4x1i_allones: +; CHECK: # %bb.0: +; CHECK-NEXT: mvhi 0(%r2), -1 +; CHECK-NEXT: br %r14 + store i32 4294967295, i32* %Dst + ret void +} + +define void @fun_2i(i16* %Dst) { +; CHECK-LABEL: fun_2i: +; CHECK: # %bb.0: +; CHECK-NEXT: mvhhi 0(%r2), 1 +; CHECK-NEXT: br %r14 + store i16 1, i16* %Dst + ret void +} + +define void @fun_2x2i(i32* %Dst) { +; CHECK-LABEL: fun_2x2i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepih %v0, 1 +; CHECK-NEXT: vstef %v0, 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i32 65537, i32* %Dst + ret void +} + +define void @fun_4x2i(i64* %Dst) { +; CHECK-LABEL: fun_4x2i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepih %v0, 1 +; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i64 281479271743489, i64* %Dst + ret void +} + +define void @fun_2x4i(i64* %Dst) { +; CHECK-LABEL: fun_2x4i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepif %v0, 1 +; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: br %r14 + store i64 4294967297, i64* %Dst + ret void +} + +; Store replicated immediate twice using the same vector. 
+define void @fun_4x1i(i32* %Dst, i32* %Dst2) { +; CHECK-LABEL: fun_4x1i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 3 +; CHECK-NEXT: vstef %v0, 0(%r2), 0 +; CHECK-NEXT: vstef %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + store i32 50529027, i32* %Dst + store i32 50529027, i32* %Dst2 + ret void +} + +define void @fun_8x1i(i64* %Dst, i64* %Dst2) { +; CHECK-LABEL: fun_8x1i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 1 +; CHECK-NEXT: vsteg %v0, 0(%r2), 0 +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + store i64 72340172838076673, i64* %Dst + store i64 72340172838076673, i64* %Dst2 + ret void +} + +; Similar, but with vectors. +define void @fun_4Eltsx4x1i_2Eltsx4x1i(<4 x i32>* %Dst, <2 x i32>* %Dst2) { +; CHECK-LABEL: fun_4Eltsx4x1i_2Eltsx4x1i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %tmp = insertelement <4 x i32> undef, i32 50529027, i32 0 + %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer + store <4 x i32> %Val, <4 x i32>* %Dst + %tmp2 = insertelement <2 x i32> undef, i32 50529027, i32 0 + %Val2 = shufflevector <2 x i32> %tmp2, <2 x i32> undef, <2 x i32> zeroinitializer + store <2 x i32> %Val2, <2 x i32>* %Dst2 + ret void +} + +; Same, but 64-bit store is scalar. +define void @fun_4Eltsx4x1i_8x1i(<4 x i32>* %Dst, i64* %Dst2) { +; CHECK-LABEL: fun_4Eltsx4x1i_8x1i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepib %v0, 3 +; CHECK-NEXT: vst %v0, 0(%r2), 3 +; CHECK-NEXT: vsteg %v0, 0(%r3), 0 +; CHECK-NEXT: br %r14 + %tmp = insertelement <4 x i32> undef, i32 50529027, i32 0 + %Val = shufflevector <4 x i32> %tmp, <4 x i32> undef, <4 x i32> zeroinitializer + store <4 x i32> %Val, <4 x i32>* %Dst + store i64 217020518514230019, i64* %Dst2 + ret void +} + +define void @fun_3Eltsx2x4i(<3 x i64>* %Dst) { +; CHECK-LABEL: fun_3Eltsx2x4i: +; CHECK: # %bb.0: +; CHECK-NEXT: vrepif %v0, 1 +; CHECK-NEXT: vsteg %v0, 16(%r2), 0 +; CHECK-NEXT: vst %v0, 0(%r2), 4 +; CHECK-NEXT: br %r14 + %tmp = insertelement <3 x i64> undef, i64 4294967297, i32 0 + %Val = shufflevector <3 x i64> %tmp, <3 x i64> undef, <3 x i32> zeroinitializer + store <3 x i64> %Val, <3 x i64>* %Dst + ret void +} + +; i128 replicated '1': not using vrepib, but should compile. +define void @fun_16x1i(i128* %Dst) { +; CHECK-LABEL: fun_16x1i: +; CHECK: # %bb.0: +; CHECK-NEXT: llihf %r0, 16843009 +; CHECK-NEXT: oilf %r0, 16843009 +; CHECK-NEXT: stg %r0, 8(%r2) +; CHECK-NEXT: stg %r0, 0(%r2) +; CHECK-NEXT: br %r14 + store i128 1334440654591915542993625911497130241, i128* %Dst + ret void +}