forked from OSchip/llvm-project
[SystemZ] Improve codegen for memset.
Memset with a constant length was implemented with a single store followed by a series of MVCs. This patch changes this so that one store of the byte is emitted for each MVC, which avoids data dependencies between the MVCs. An MVI/STC + MVC(len-1) is done for each block. In addition, memset with a variable length is now also handled without a libcall. Since the byte is first stored and then MVC is used from that address, a length of two must now be subtracted instead of one for the loop and EXRL. This requires an extra check for the one-byte case, which is handled in a special block with just a single MVI/STC (like GCC). Review: Ulrich Weigand Differential Revision: https://reviews.llvm.org/D112004
This commit is contained in:
parent
327d966365
commit
cbf682cb1c
|
@ -5714,6 +5714,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
|
|||
OPCODE(OC);
|
||||
OPCODE(XC);
|
||||
OPCODE(CLC);
|
||||
OPCODE(MEMSET_MVC);
|
||||
OPCODE(STPCPY);
|
||||
OPCODE(STRCMP);
|
||||
OPCODE(SEARCH_STRING);
|
||||
|
@ -7860,8 +7861,10 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
|
|||
return MBB;
|
||||
}
|
||||
|
||||
MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
|
||||
MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
|
||||
MachineBasicBlock *
|
||||
SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI,
|
||||
MachineBasicBlock *MBB,
|
||||
unsigned Opcode, bool IsMemset) const {
|
||||
MachineFunction &MF = *MBB->getParent();
|
||||
const SystemZInstrInfo *TII =
|
||||
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
|
||||
|
@ -7870,18 +7873,64 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
|
|||
|
||||
MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
|
||||
uint64_t DestDisp = MI.getOperand(1).getImm();
|
||||
MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
|
||||
uint64_t SrcDisp = MI.getOperand(3).getImm();
|
||||
MachineOperand &LengthMO = MI.getOperand(4);
|
||||
MachineOperand SrcBase = MachineOperand::CreateReg(0U, false);
|
||||
uint64_t SrcDisp;
|
||||
|
||||
// Fold the displacement Disp if it is out of range.
|
||||
auto foldDisplIfNeeded = [&](MachineOperand &Base, uint64_t &Disp) -> void {
|
||||
if (!isUInt<12>(Disp)) {
|
||||
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
|
||||
unsigned Opcode = TII->getOpcodeForOffset(SystemZ::LA, Disp);
|
||||
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opcode), Reg)
|
||||
.add(Base).addImm(Disp).addReg(0);
|
||||
Base = MachineOperand::CreateReg(Reg, false);
|
||||
Disp = 0;
|
||||
}
|
||||
};
|
||||
|
||||
if (!IsMemset) {
|
||||
SrcBase = earlyUseOperand(MI.getOperand(2));
|
||||
SrcDisp = MI.getOperand(3).getImm();
|
||||
} else {
|
||||
SrcBase = DestBase;
|
||||
SrcDisp = DestDisp++;
|
||||
foldDisplIfNeeded(DestBase, DestDisp);
|
||||
}
|
||||
|
||||
MachineOperand &LengthMO = MI.getOperand(IsMemset ? 2 : 4);
|
||||
bool IsImmForm = LengthMO.isImm();
|
||||
bool IsRegForm = !IsImmForm;
|
||||
|
||||
// Build and insert one Opcode of Length, with special treatment for memset.
|
||||
auto insertMemMemOp = [&](MachineBasicBlock *InsMBB,
|
||||
MachineBasicBlock::iterator InsPos,
|
||||
MachineOperand DBase, uint64_t DDisp,
|
||||
MachineOperand SBase, uint64_t SDisp,
|
||||
unsigned Length) -> void {
|
||||
assert(Length > 0 && Length <= 256 && "Building memory op with bad length.");
|
||||
if (IsMemset) {
|
||||
MachineOperand ByteMO = earlyUseOperand(MI.getOperand(3));
|
||||
if (ByteMO.isImm())
|
||||
BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::MVI))
|
||||
.add(SBase).addImm(SDisp).add(ByteMO);
|
||||
else
|
||||
BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::STC))
|
||||
.add(ByteMO).add(SBase).addImm(SDisp).addReg(0);
|
||||
if (--Length == 0)
|
||||
return;
|
||||
}
|
||||
BuildMI(*MBB, InsPos, DL, TII->get(Opcode))
|
||||
.add(DBase).addImm(DDisp).addImm(Length)
|
||||
.add(SBase).addImm(SDisp)
|
||||
.setMemRefs(MI.memoperands());
|
||||
};
|
||||
|
||||
bool NeedsLoop = false;
|
||||
uint64_t ImmLength = 0;
|
||||
Register LenMinus1Reg = SystemZ::NoRegister;
|
||||
Register LenAdjReg = SystemZ::NoRegister;
|
||||
if (IsImmForm) {
|
||||
ImmLength = LengthMO.getImm();
|
||||
ImmLength++; // Add back the '1' subtracted originally.
|
||||
ImmLength += IsMemset ? 2 : 1; // Add back the subtracted adjustment.
|
||||
if (ImmLength == 0) {
|
||||
MI.eraseFromParent();
|
||||
return MBB;
|
||||
|
@ -7905,7 +7954,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
|
|||
NeedsLoop = true;
|
||||
} else {
|
||||
NeedsLoop = true;
|
||||
LenMinus1Reg = LengthMO.getReg();
|
||||
LenAdjReg = LengthMO.getReg();
|
||||
}
|
||||
|
||||
// When generating more than one CLC, all but the last will need to
|
||||
|
@ -7923,17 +7972,17 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
|
|||
ImmLength &= 255;
|
||||
} else {
|
||||
BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg)
|
||||
.addReg(LenMinus1Reg)
|
||||
.addReg(LenAdjReg)
|
||||
.addReg(0)
|
||||
.addImm(8);
|
||||
}
|
||||
|
||||
bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
|
||||
auto loadZeroAddress = [&]() -> MachineOperand {
|
||||
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
|
||||
BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0);
|
||||
return MachineOperand::CreateReg(Reg, false);
|
||||
};
|
||||
bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
|
||||
if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister)
|
||||
DestBase = loadZeroAddress();
|
||||
if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister)
|
||||
|
@ -7968,14 +8017,41 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
|
|||
DoneMBB = SystemZ::emitBlockAfter(NextMBB);
|
||||
|
||||
// MBB:
|
||||
// # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB.
|
||||
// # Jump to AllDoneMBB if LenAdjReg means 0, or fall thru to StartMBB.
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
|
||||
.addReg(LenMinus1Reg).addImm(-1);
|
||||
.addReg(LenAdjReg).addImm(IsMemset ? -2 : -1);
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
|
||||
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
|
||||
.addMBB(AllDoneMBB);
|
||||
MBB->addSuccessor(AllDoneMBB);
|
||||
MBB->addSuccessor(StartMBB);
|
||||
if (!IsMemset)
|
||||
MBB->addSuccessor(StartMBB);
|
||||
else {
|
||||
// MemsetOneCheckMBB:
|
||||
// # Jump to MemsetOneMBB for a memset of length 1, or
|
||||
// # fall thru to StartMBB.
|
||||
MachineBasicBlock *MemsetOneCheckMBB = SystemZ::emitBlockAfter(MBB);
|
||||
MachineBasicBlock *MemsetOneMBB = SystemZ::emitBlockAfter(&*MF.rbegin());
|
||||
MBB->addSuccessor(MemsetOneCheckMBB);
|
||||
MBB = MemsetOneCheckMBB;
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
|
||||
.addReg(LenAdjReg).addImm(-1);
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
|
||||
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
|
||||
.addMBB(MemsetOneMBB);
|
||||
MBB->addSuccessor(MemsetOneMBB, {10, 100});
|
||||
MBB->addSuccessor(StartMBB, {90, 100});
|
||||
|
||||
// MemsetOneMBB:
|
||||
// # Jump back to AllDoneMBB after a single MVI or STC.
|
||||
MBB = MemsetOneMBB;
|
||||
insertMemMemOp(MBB, MBB->end(),
|
||||
MachineOperand::CreateReg(StartDestReg, false), DestDisp,
|
||||
MachineOperand::CreateReg(StartSrcReg, false), SrcDisp,
|
||||
1);
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(AllDoneMBB);
|
||||
MBB->addSuccessor(AllDoneMBB);
|
||||
}
|
||||
|
||||
// StartMBB:
|
||||
// # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB.
|
||||
|
@ -8032,10 +8108,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
|
|||
if (Opcode == SystemZ::MVC)
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::PFD))
|
||||
.addImm(SystemZ::PFD_WRITE)
|
||||
.addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
|
||||
BuildMI(MBB, DL, TII->get(Opcode))
|
||||
.addReg(ThisDestReg).addImm(DestDisp).addImm(256)
|
||||
.addReg(ThisSrcReg).addImm(SrcDisp);
|
||||
.addReg(ThisDestReg).addImm(DestDisp - IsMemset + 768).addReg(0);
|
||||
insertMemMemOp(MBB, MBB->end(),
|
||||
MachineOperand::CreateReg(ThisDestReg, false), DestDisp,
|
||||
MachineOperand::CreateReg(ThisSrcReg, false), SrcDisp, 256);
|
||||
if (EndMBB) {
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
|
||||
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
|
||||
|
@ -8075,7 +8151,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
|
|||
// # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run.
|
||||
// # Use EXecute Relative Long for the remainder of the bytes. The target
|
||||
// instruction of the EXRL will have a length field of 1 since 0 is an
|
||||
// illegal value. The number of bytes processed becomes (%LenMinus1Reg &
|
||||
// illegal value. The number of bytes processed becomes (%LenAdjReg &
|
||||
// 0xff) + 1.
|
||||
// # Fall through to AllDoneMBB.
|
||||
Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
|
||||
|
@ -8088,10 +8164,14 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
|
|||
BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
|
||||
.addReg(StartSrcReg).addMBB(StartMBB)
|
||||
.addReg(NextSrcReg).addMBB(NextMBB);
|
||||
if (IsMemset)
|
||||
insertMemMemOp(MBB, MBB->end(),
|
||||
MachineOperand::CreateReg(RemDestReg, false), DestDisp,
|
||||
MachineOperand::CreateReg(RemSrcReg, false), SrcDisp, 1);
|
||||
MachineInstrBuilder EXRL_MIB =
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
|
||||
.addImm(Opcode)
|
||||
.addReg(LenMinus1Reg)
|
||||
.addReg(LenAdjReg)
|
||||
.addReg(RemDestReg).addImm(DestDisp)
|
||||
.addReg(RemSrcReg).addImm(SrcDisp);
|
||||
MBB->addSuccessor(AllDoneMBB);
|
||||
|
@ -8107,32 +8187,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
|
|||
while (ImmLength > 0) {
|
||||
uint64_t ThisLength = std::min(ImmLength, uint64_t(256));
|
||||
// The previous iteration might have created out-of-range displacements.
|
||||
// Apply them using LAY if so.
|
||||
if (!isUInt<12>(DestDisp)) {
|
||||
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
|
||||
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
|
||||
.add(DestBase)
|
||||
.addImm(DestDisp)
|
||||
.addReg(0);
|
||||
DestBase = MachineOperand::CreateReg(Reg, false);
|
||||
DestDisp = 0;
|
||||
}
|
||||
if (!isUInt<12>(SrcDisp)) {
|
||||
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
|
||||
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
|
||||
.add(SrcBase)
|
||||
.addImm(SrcDisp)
|
||||
.addReg(0);
|
||||
SrcBase = MachineOperand::CreateReg(Reg, false);
|
||||
SrcDisp = 0;
|
||||
}
|
||||
BuildMI(*MBB, MI, DL, TII->get(Opcode))
|
||||
.add(DestBase)
|
||||
.addImm(DestDisp)
|
||||
.addImm(ThisLength)
|
||||
.add(SrcBase)
|
||||
.addImm(SrcDisp)
|
||||
.setMemRefs(MI.memoperands());
|
||||
// Apply them using LA/LAY if so.
|
||||
foldDisplIfNeeded(DestBase, DestDisp);
|
||||
foldDisplIfNeeded(SrcBase, SrcDisp);
|
||||
insertMemMemOp(MBB, MI, DestBase, DestDisp, SrcBase, SrcDisp, ThisLength);
|
||||
DestDisp += ThisLength;
|
||||
SrcDisp += ThisLength;
|
||||
ImmLength -= ThisLength;
|
||||
|
@ -8630,6 +8688,11 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
|
|||
case SystemZ::CLCImm:
|
||||
case SystemZ::CLCReg:
|
||||
return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
|
||||
case SystemZ::MemsetImmImm:
|
||||
case SystemZ::MemsetImmReg:
|
||||
case SystemZ::MemsetRegImm:
|
||||
case SystemZ::MemsetRegReg:
|
||||
return emitMemMemWrapper(MI, MBB, SystemZ::MVC, true/*IsMemset*/);
|
||||
case SystemZ::CLSTLoop:
|
||||
return emitStringWrapper(MI, MBB, SystemZ::CLST);
|
||||
case SystemZ::MVSTLoop:
|
||||
|
|
|
@ -126,6 +126,9 @@ enum NodeType : unsigned {
|
|||
// as for MVC.
|
||||
CLC,
|
||||
|
||||
// Use MVC to set a block of memory after storing the first byte.
|
||||
MEMSET_MVC,
|
||||
|
||||
// Use an MVST-based sequence to implement stpcpy().
|
||||
STPCPY,
|
||||
|
||||
|
@ -709,7 +712,8 @@ private:
|
|||
MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI,
|
||||
MachineBasicBlock *BB) const;
|
||||
MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB,
|
||||
unsigned Opcode) const;
|
||||
unsigned Opcode,
|
||||
bool IsMemset = false) const;
|
||||
MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB,
|
||||
unsigned Opcode) const;
|
||||
MachineBasicBlock *emitTransactionBegin(MachineInstr &MI,
|
||||
|
|
|
@ -5256,6 +5256,16 @@ class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
|
|||
let Constraints = "$R1 = $R1src";
|
||||
}
|
||||
|
||||
class MemsetPseudo<DAGOperand lenop, DAGOperand byteop>
|
||||
: Pseudo<(outs), (ins bdaddr12only:$dest, lenop:$length, byteop:$B),
|
||||
[(z_memset_mvc bdaddr12only:$dest, lenop:$length, byteop:$B)]> {
|
||||
let Defs = [CC];
|
||||
let mayLoad = 1;
|
||||
let mayStore = 1;
|
||||
let usesCustomInserter = 1;
|
||||
let hasNoSchedulingInfo = 1;
|
||||
}
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Multiclasses that emit both real and pseudo instructions
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
|
|
@ -510,6 +510,12 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in {
|
|||
def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>;
|
||||
}
|
||||
|
||||
// Memset[Length][Byte] pseudos.
|
||||
def MemsetImmImm : MemsetPseudo<imm64, imm32zx8trunc>;
|
||||
def MemsetImmReg : MemsetPseudo<imm64, GR32>;
|
||||
def MemsetRegImm : MemsetPseudo<ADDR64, imm32zx8trunc>;
|
||||
def MemsetRegReg : MemsetPseudo<ADDR64, GR32>;
|
||||
|
||||
// Move right.
|
||||
let Predicates = [FeatureMiscellaneousExtensions3],
|
||||
mayLoad = 1, mayStore = 1, Uses = [R0L] in
|
||||
|
|
|
@ -102,6 +102,10 @@ def SDT_ZMemMemLengthCC : SDTypeProfile<1, 3,
|
|||
SDTCisPtrTy<1>,
|
||||
SDTCisPtrTy<2>,
|
||||
SDTCisVT<3, i64>]>;
|
||||
def SDT_ZMemsetMVC : SDTypeProfile<0, 3,
|
||||
[SDTCisPtrTy<0>,
|
||||
SDTCisVT<1, i64>,
|
||||
SDTCisVT<2, i32>]>;
|
||||
def SDT_ZString : SDTypeProfile<1, 3,
|
||||
[SDTCisPtrTy<0>,
|
||||
SDTCisPtrTy<1>,
|
||||
|
@ -413,6 +417,8 @@ def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
|
|||
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
|
||||
def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC,
|
||||
[SDNPHasChain, SDNPMayLoad]>;
|
||||
def z_memset_mvc : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC,
|
||||
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
|
||||
def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC,
|
||||
[SDNPHasChain, SDNPMayLoad]>;
|
||||
def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString,
|
||||
|
|
|
@ -17,29 +17,44 @@ using namespace llvm;
|
|||
|
||||
#define DEBUG_TYPE "systemz-selectiondag-info"
|
||||
|
||||
static SDVTList getMemMemVTs(unsigned Op, SelectionDAG &DAG) {
|
||||
return Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other)
|
||||
: DAG.getVTList(MVT::Other);
|
||||
static unsigned getMemMemLenAdj(unsigned Op) {
|
||||
return Op == SystemZISD::MEMSET_MVC ? 2 : 1;
|
||||
}
|
||||
|
||||
// Emit a mem-mem operation after subtracting one from size, which will be
|
||||
// added back during pseudo expansion. As the Reg case emitted here may be
|
||||
// converted by DAGCombiner into having an Imm length, they are both emitted
|
||||
// the same way.
|
||||
static SDValue createMemMemNode(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
|
||||
SDValue Chain, SDValue Dst, SDValue Src,
|
||||
SDValue LenAdj, SDValue Byte) {
|
||||
SDVTList VTs = Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other)
|
||||
: DAG.getVTList(MVT::Other);
|
||||
SmallVector<SDValue, 6> Ops;
|
||||
if (Op == SystemZISD::MEMSET_MVC)
|
||||
Ops = { Chain, Dst, LenAdj, Byte };
|
||||
else
|
||||
Ops = { Chain, Dst, Src, LenAdj };
|
||||
return DAG.getNode(Op, DL, VTs, Ops);
|
||||
}
|
||||
|
||||
// Emit a mem-mem operation after subtracting one (or two for memset) from
|
||||
// size, which will be added back during pseudo expansion. As the Reg case
|
||||
// emitted here may be converted by DAGCombiner into having an Imm length,
|
||||
// they are both emitted the same way.
|
||||
static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
|
||||
SDValue Chain, SDValue Dst, SDValue Src,
|
||||
uint64_t Size) {
|
||||
return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src,
|
||||
DAG.getConstant(Size - 1, DL, Src.getValueType()));
|
||||
uint64_t Size, SDValue Byte = SDValue()) {
|
||||
unsigned Adj = getMemMemLenAdj(Op);
|
||||
assert(Size >= Adj && "Adjusted length overflow.");
|
||||
SDValue LenAdj = DAG.getConstant(Size - Adj, DL, Dst.getValueType());
|
||||
return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte);
|
||||
}
|
||||
|
||||
static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
|
||||
SDValue Chain, SDValue Dst, SDValue Src,
|
||||
SDValue Size) {
|
||||
SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
|
||||
DAG.getZExtOrTrunc(Size, DL, MVT::i64),
|
||||
DAG.getConstant(-1, DL, MVT::i64));
|
||||
return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, LenMinus1);
|
||||
SDValue Size, SDValue Byte = SDValue()) {
|
||||
int64_t Adj = getMemMemLenAdj(Op);
|
||||
SDValue LenAdj = DAG.getNode(ISD::ADD, DL, MVT::i64,
|
||||
DAG.getZExtOrTrunc(Size, DL, MVT::i64),
|
||||
DAG.getConstant(0 - Adj, DL, MVT::i64));
|
||||
return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte);
|
||||
}
|
||||
|
||||
SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
|
||||
|
@ -127,13 +142,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
|
|||
if (CByte && CByte->getZExtValue() == 0)
|
||||
return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes);
|
||||
|
||||
// Copy the byte to the first location and then use MVC to copy
|
||||
// it to the rest.
|
||||
Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment);
|
||||
SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
|
||||
DAG.getConstant(1, DL, PtrVT));
|
||||
return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst,
|
||||
Bytes - 1);
|
||||
return emitMemMemImm(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(),
|
||||
Bytes, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32));
|
||||
}
|
||||
|
||||
// Variable length
|
||||
|
@ -141,7 +151,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
|
|||
// Handle the special case of a variable length memset of 0 with XC.
|
||||
return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size);
|
||||
|
||||
return SDValue();
|
||||
return emitMemMemReg(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(),
|
||||
Size, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32));
|
||||
}
|
||||
|
||||
// Convert the current CC value into an integer that is 0 if CC == 0,
|
||||
|
|
|
@ -87,7 +87,8 @@ define void @f8(i8* %dest, i8 %val) {
|
|||
define void @f9(i8* %dest, i8 %val) {
|
||||
; CHECK-LABEL: f9:
|
||||
; CHECK: stc %r3, 0(%r2)
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: stc %r3, 256(%r2)
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 257, i1 false)
|
||||
ret void
|
||||
|
@ -97,7 +98,8 @@ define void @f9(i8* %dest, i8 %val) {
|
|||
define void @f10(i8* %dest, i8 %val) {
|
||||
; CHECK-LABEL: f10:
|
||||
; CHECK: stc %r3, 0(%r2)
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: stc %r3, 256(%r2)
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 257, i1 false)
|
||||
ret void
|
||||
|
@ -107,7 +109,8 @@ define void @f10(i8* %dest, i8 %val) {
|
|||
define void @f11(i8* %dest, i8 %val) {
|
||||
; CHECK-LABEL: f11:
|
||||
; CHECK: stc %r3, 0(%r2)
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: stc %r3, 256(%r2)
|
||||
; CHECK: mvc 257(1,%r2), 256(%r2)
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 258, i1 false)
|
||||
|
@ -118,7 +121,8 @@ define void @f11(i8* %dest, i8 %val) {
|
|||
define void @f12(i8* %dest, i8 %val) {
|
||||
; CHECK-LABEL: f12:
|
||||
; CHECK: stc %r3, 0(%r2)
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: stc %r3, 256(%r2)
|
||||
; CHECK: mvc 257(1,%r2), 256(%r2)
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 258, i1 false)
|
||||
|
@ -129,30 +133,88 @@ define void @f12(i8* %dest, i8 %val) {
|
|||
define void @f13(i8* %dest, i8 %val) {
|
||||
; CHECK-LABEL: f13:
|
||||
; CHECK: stc %r3, 0(%r2)
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 257(256,%r2), 256(%r2)
|
||||
; CHECK: mvc 513(256,%r2), 512(%r2)
|
||||
; CHECK: mvc 769(256,%r2), 768(%r2)
|
||||
; CHECK: mvc 1025(256,%r2), 1024(%r2)
|
||||
; CHECK: mvc 1281(256,%r2), 1280(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: stc %r3, 256(%r2)
|
||||
; CHECK: mvc 257(255,%r2), 256(%r2)
|
||||
; CHECK: stc %r3, 512(%r2)
|
||||
; CHECK: mvc 513(255,%r2), 512(%r2)
|
||||
; CHECK: stc %r3, 768(%r2)
|
||||
; CHECK: mvc 769(255,%r2), 768(%r2)
|
||||
; CHECK: stc %r3, 1024(%r2)
|
||||
; CHECK: mvc 1025(255,%r2), 1024(%r2)
|
||||
; CHECK: stc %r3, 1280(%r2)
|
||||
; CHECK: mvc 1281(255,%r2), 1280(%r2)
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false)
|
||||
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1536, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test the next size up, which uses a loop. We leave the other corner
|
||||
; cases to memcpy-01.ll.
|
||||
; cases to memcpy-01.ll and memset-07.ll.
|
||||
define void @f14(i8* %dest, i8 %val) {
|
||||
; CHECK-LABEL: f14:
|
||||
; CHECK: stc %r3, 0(%r2)
|
||||
; CHECK: lghi [[COUNT:%r[0-5]]], 6
|
||||
; CHECK: [[LABEL:\.L[^:]*]]:
|
||||
; CHECK: pfd 2, 769(%r2)
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: pfd 2, 768(%r2)
|
||||
; CHECK: stc %r3, 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: la %r2, 256(%r2)
|
||||
; CHECK: brctg [[COUNT]], [[LABEL]]
|
||||
; CHECK: mvc 1(1,%r2), 0(%r2)
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false)
|
||||
; CHECK: stc %r3, 0(%r2)
|
||||
; CHECK-NEXT: br %r14
|
||||
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test (no) folding of displacement: Begins with max(uint12) - 1.
|
||||
define void @f15(i8* %dest, i8 %val) {
|
||||
; CHECK-LABEL: f15:
|
||||
; CHECK-NOT: la {{.*}}%r2
|
||||
%addr = getelementptr i8, i8* %dest, i64 4094
|
||||
call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test folding of displacement: Begins with max(uint12).
|
||||
define void @f16(i8* %dest, i8 %val) {
|
||||
; CHECK-LABEL: f16:
|
||||
; CHECK-DAG: lay %r1, 4096(%r2)
|
||||
; CHECK-DAG: stc %r3, 4095(%r2)
|
||||
%addr = getelementptr i8, i8* %dest, i64 4095
|
||||
call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test folding of displacement with LA: First two ops are in range.
|
||||
define void @f17(i8* %dest, i8 %val) {
|
||||
; CHECK-LABEL: f17:
|
||||
; CHECK: stc %r3, 3583(%r2)
|
||||
; CHECK-NEXT: mvc 3584(255,%r2), 3583(%r2)
|
||||
; CHECK-NEXT: stc %r3, 3839(%r2)
|
||||
; CHECK-NEXT: mvc 3840(255,%r2), 3839(%r2)
|
||||
; CHECK-NEXT: lay %r1, 4096(%r2)
|
||||
; CHECK-NEXT: stc %r3, 4095(%r2)
|
||||
; CHECK-NEXT: mvc 0(1,%r1), 4095(%r2)
|
||||
; CHECK-NEXT: br %r14
|
||||
%addr = getelementptr i8, i8* %dest, i64 3583
|
||||
call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test folding of displacement with LAY: First two ops are in range.
|
||||
define void @f18(i8* %dest, i8 %val) {
|
||||
; CHECK-LABEL: f18:
|
||||
; CHECK: stc %r3, 3584(%r2)
|
||||
; CHECK-NEXT: mvc 3585(255,%r2), 3584(%r2)
|
||||
; CHECK-NEXT: stc %r3, 3840(%r2)
|
||||
; CHECK-NEXT: mvc 3841(255,%r2), 3840(%r2)
|
||||
; CHECK-NEXT: lay %r1, 4097(%r2)
|
||||
; CHECK-NEXT: lay %r2, 4096(%r2)
|
||||
; CHECK-NEXT: stc %r3, 0(%r2)
|
||||
; CHECK-NEXT: mvc 0(1,%r1), 0(%r2)
|
||||
; CHECK-NEXT: br %r14
|
||||
%addr = getelementptr i8, i8* %dest, i64 3584
|
||||
call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
|
@ -123,7 +123,8 @@ define void @f12(i8* %dest) {
|
|||
define void @f13(i8* %dest) {
|
||||
; CHECK-LABEL: f13:
|
||||
; CHECK: mvi 0(%r2), 128
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: mvi 256(%r2), 128
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 257, i1 false)
|
||||
ret void
|
||||
|
@ -133,7 +134,8 @@ define void @f13(i8* %dest) {
|
|||
define void @f14(i8* %dest) {
|
||||
; CHECK-LABEL: f14:
|
||||
; CHECK: mvi 0(%r2), 128
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: mvi 256(%r2), 128
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 257, i1 false)
|
||||
ret void
|
||||
|
@ -143,7 +145,8 @@ define void @f14(i8* %dest) {
|
|||
define void @f15(i8* %dest) {
|
||||
; CHECK-LABEL: f15:
|
||||
; CHECK: mvi 0(%r2), 128
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: mvi 256(%r2), 128
|
||||
; CHECK: mvc 257(1,%r2), 256(%r2)
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 258, i1 false)
|
||||
|
@ -154,7 +157,8 @@ define void @f15(i8* %dest) {
|
|||
define void @f16(i8* %dest) {
|
||||
; CHECK-LABEL: f16:
|
||||
; CHECK: mvi 0(%r2), 128
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: mvi 256(%r2), 128
|
||||
; CHECK: mvc 257(1,%r2), 256(%r2)
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 258, i1 false)
|
||||
|
|
|
@ -359,7 +359,8 @@ define void @f36(i8* %dest) {
|
|||
define void @f37(i8* %dest) {
|
||||
; CHECK-LABEL: f37:
|
||||
; CHECK: mvi 0(%r2), 255
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: mvi 256(%r2), 255
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 257, i1 false)
|
||||
ret void
|
||||
|
@ -369,7 +370,8 @@ define void @f37(i8* %dest) {
|
|||
define void @f38(i8* %dest) {
|
||||
; CHECK-LABEL: f38:
|
||||
; CHECK: mvi 0(%r2), 255
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: mvi 256(%r2), 255
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 257, i1 false)
|
||||
ret void
|
||||
|
@ -379,7 +381,8 @@ define void @f38(i8* %dest) {
|
|||
define void @f39(i8* %dest) {
|
||||
; CHECK-LABEL: f39:
|
||||
; CHECK: mvi 0(%r2), 255
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: mvi 256(%r2), 255
|
||||
; CHECK: mvc 257(1,%r2), 256(%r2)
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 258, i1 false)
|
||||
|
@ -390,7 +393,8 @@ define void @f39(i8* %dest) {
|
|||
define void @f40(i8* %dest) {
|
||||
; CHECK-LABEL: f40:
|
||||
; CHECK: mvi 0(%r2), 255
|
||||
; CHECK: mvc 1(256,%r2), 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: mvi 256(%r2), 255
|
||||
; CHECK: mvc 257(1,%r2), 256(%r2)
|
||||
; CHECK: br %r14
|
||||
call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 258, i1 false)
|
||||
|
|
|
@ -0,0 +1,100 @@
|
|||
; Test memset in cases where a loop is used.
|
||||
;
|
||||
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
|
||||
|
||||
declare void @llvm.memset.p0i8.i32(i8 *nocapture, i8, i32, i1) nounwind
|
||||
declare void @llvm.memset.p0i8.i64(i8 *nocapture, i8, i64, i1) nounwind
|
||||
|
||||
; Constant length: 6 iterations and 2 bytes remainder.
|
||||
define void @f1(i8* %dest, i8 %val) {
|
||||
; CHECK-LABEL: f1:
|
||||
; CHECK: lghi [[COUNT:%r[0-5]]], 6
|
||||
; CHECK: [[LABEL:\.L[^:]*]]:
|
||||
; CHECK: pfd 2, 768(%r2)
|
||||
; CHECK: stc %r3, 0(%r2)
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: la %r2, 256(%r2)
|
||||
; CHECK: brctg [[COUNT]], [[LABEL]]
|
||||
; CHECK: stc %r3, 0(%r2)
|
||||
; CHECK-NEXT: mvc 1(1,%r2), 0(%r2)
|
||||
; CHECK-NEXT: br %r14
|
||||
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Constant length: 6 iterations and 255 bytes remainder.
|
||||
define void @f2(i8* %dest) {
|
||||
; CHECK-LABEL: f2:
|
||||
; CHECK: lghi [[COUNT:%r[0-5]]], 6
|
||||
; CHECK: [[LABEL:\.L[^:]*]]:
|
||||
; CHECK: pfd 2, 768(%r2)
|
||||
; CHECK: mvi 0(%r2), 1
|
||||
; CHECK: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK: la %r2, 256(%r2)
|
||||
; CHECK: brctg [[COUNT]], [[LABEL]]
|
||||
; CHECK: mvi 0(%r2), 1
|
||||
; CHECK-NEXT: mvc 1(254,%r2), 0(%r2)
|
||||
; CHECK-NEXT: br %r14
|
||||
call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 1791, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Variable length, byte in register.
|
||||
define void @f3(i8* %dest, i8 %val, i64 %Len) {
|
||||
; CHECK-LABEL: f3:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: aghi %r4, -2
|
||||
; CHECK-NEXT: cgibe %r4, -2, 0(%r14)
|
||||
; CHECK-NEXT: .LBB2_1:
|
||||
; CHECK-NEXT: cgije %r4, -1, .LBB2_5
|
||||
; CHECK-NEXT:# %bb.2:
|
||||
; CHECK-NEXT: srlg %r0, %r4, 8
|
||||
; CHECK-NEXT: cgije %r0, 0, .LBB2_4
|
||||
; CHECK-NEXT:.LBB2_3: # =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: pfd 2, 768(%r2)
|
||||
; CHECK-NEXT: stc %r3, 0(%r2)
|
||||
; CHECK-NEXT: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK-NEXT: la %r2, 256(%r2)
|
||||
; CHECK-NEXT: brctg %r0, .LBB2_3
|
||||
; CHECK-NEXT:.LBB2_4:
|
||||
; CHECK-NEXT: stc %r3, 0(%r2)
|
||||
; CHECK-NEXT: exrl %r4, .Ltmp0
|
||||
; CHECK-NEXT: br %r14
|
||||
; CHECK-NEXT:.LBB2_5:
|
||||
; CHECK-NEXT: stc %r3, 0(%r2)
|
||||
; CHECK-NEXT: br %r14
|
||||
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 %Len, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; Variable length, immediate byte.
|
||||
define void @f4(i8* %dest, i32 %Len) {
|
||||
; CHECK-LABEL: f4:
|
||||
; CHECK: # %bb.0:
|
||||
; CHECK-NEXT: llgfr %r1, %r3
|
||||
; CHECK-NEXT: aghi %r1, -2
|
||||
; CHECK-NEXT: cgibe %r1, -2, 0(%r14)
|
||||
; CHECK-NEXT:.LBB3_1:
|
||||
; CHECK-NEXT: cgije %r1, -1, .LBB3_5
|
||||
; CHECK-NEXT:# %bb.2:
|
||||
; CHECK-NEXT: srlg %r0, %r1, 8
|
||||
; CHECK-NEXT: cgije %r0, 0, .LBB3_4
|
||||
; CHECK-NEXT:.LBB3_3: # =>This Inner Loop Header: Depth=1
|
||||
; CHECK-NEXT: pfd 2, 768(%r2)
|
||||
; CHECK-NEXT: mvi 0(%r2), 1
|
||||
; CHECK-NEXT: mvc 1(255,%r2), 0(%r2)
|
||||
; CHECK-NEXT: la %r2, 256(%r2)
|
||||
; CHECK-NEXT: brctg %r0, .LBB3_3
|
||||
; CHECK-NEXT:.LBB3_4:
|
||||
; CHECK-NEXT: mvi 0(%r2), 1
|
||||
; CHECK-NEXT: exrl %r1, .Ltmp0
|
||||
; CHECK-NEXT: br %r14
|
||||
; CHECK-NEXT:.LBB3_5:
|
||||
; CHECK-NEXT: mvi 0(%r2), 1
|
||||
; CHECK-NEXT: br %r14
|
||||
call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 %Len, i1 false)
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK: .Ltmp0:
|
||||
; CHECK-NEXT: mvc 1(1,%r2), 0(%r2)
|
|
@ -12,7 +12,7 @@ entry:
|
|||
; CHECK: jg memset
|
||||
define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
|
||||
entry:
|
||||
tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 false)
|
||||
tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 true)
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue