[SystemZ] Improve codegen for memset.

Memset with a constant length was previously implemented as a single store
followed by a series of MVCs. This patch changes that so that one store of the
byte is emitted for each MVC, which avoids data dependencies between the MVCs.
Each block of up to 256 bytes is now set with an MVI/STC followed by an
MVC(len-1).

In addition, memset with a variable length is now also handled without a
libcall. Since the byte is first stored and then MVC is used from that
address, a length of two must now be subtracted instead of one for the loop
and EXRL. This requires an extra check for the one-byte case, which is
handled in a special block with just a single MVI/STC (like GCC).

Review: Ulrich Weigand

Differential Revision: https://reviews.llvm.org/D112004
This commit is contained in:
Jonas Paulsson 2021-10-14 20:10:47 +02:00
parent 327d966365
commit cbf682cb1c
11 changed files with 366 additions and 96 deletions

View File

@ -5714,6 +5714,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(OC);
OPCODE(XC);
OPCODE(CLC);
OPCODE(MEMSET_MVC);
OPCODE(STPCPY);
OPCODE(STRCMP);
OPCODE(SEARCH_STRING);
@ -7860,8 +7861,10 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI,
return MBB;
}
MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const {
MachineBasicBlock *
SystemZTargetLowering::emitMemMemWrapper(MachineInstr &MI,
MachineBasicBlock *MBB,
unsigned Opcode, bool IsMemset) const {
MachineFunction &MF = *MBB->getParent();
const SystemZInstrInfo *TII =
static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
@ -7870,18 +7873,64 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
MachineOperand DestBase = earlyUseOperand(MI.getOperand(0));
uint64_t DestDisp = MI.getOperand(1).getImm();
MachineOperand SrcBase = earlyUseOperand(MI.getOperand(2));
uint64_t SrcDisp = MI.getOperand(3).getImm();
MachineOperand &LengthMO = MI.getOperand(4);
MachineOperand SrcBase = MachineOperand::CreateReg(0U, false);
uint64_t SrcDisp;
// Fold the displacement Disp if it is out of range.
auto foldDisplIfNeeded = [&](MachineOperand &Base, uint64_t &Disp) -> void {
if (!isUInt<12>(Disp)) {
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
unsigned Opcode = TII->getOpcodeForOffset(SystemZ::LA, Disp);
BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), TII->get(Opcode), Reg)
.add(Base).addImm(Disp).addReg(0);
Base = MachineOperand::CreateReg(Reg, false);
Disp = 0;
}
};
if (!IsMemset) {
SrcBase = earlyUseOperand(MI.getOperand(2));
SrcDisp = MI.getOperand(3).getImm();
} else {
SrcBase = DestBase;
SrcDisp = DestDisp++;
foldDisplIfNeeded(DestBase, DestDisp);
}
MachineOperand &LengthMO = MI.getOperand(IsMemset ? 2 : 4);
bool IsImmForm = LengthMO.isImm();
bool IsRegForm = !IsImmForm;
// Build and insert one Opcode of Length at InsPos in InsMBB, with special
// treatment for memset.
//
// For memset, the fill byte (operand 3 of MI) is first stored at SBase+SDisp,
// using MVI when the byte is an immediate and STC when it is in a register.
// That store accounts for one byte of Length, so the Opcode that follows only
// covers the remaining Length - 1 bytes and is omitted when Length is 1.
auto insertMemMemOp = [&](MachineBasicBlock *InsMBB,
                          MachineBasicBlock::iterator InsPos,
                          MachineOperand DBase, uint64_t DDisp,
                          MachineOperand SBase, uint64_t SDisp,
                          unsigned Length) -> void {
  assert(Length > 0 && Length <= 256 && "Building memory op with bad length.");
  if (IsMemset) {
    MachineOperand ByteMO = earlyUseOperand(MI.getOperand(3));
    if (ByteMO.isImm())
      BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::MVI))
        .add(SBase).addImm(SDisp).add(ByteMO);
    else
      BuildMI(*InsMBB, InsPos, DL, TII->get(SystemZ::STC))
        .add(ByteMO).add(SBase).addImm(SDisp).addReg(0);
    if (--Length == 0)
      return;
  }
  // Insert into InsMBB rather than the captured MBB: InsPos is a position
  // within InsMBB, so the block and the iterator must always agree.
  BuildMI(*InsMBB, InsPos, DL, TII->get(Opcode))
    .add(DBase).addImm(DDisp).addImm(Length)
    .add(SBase).addImm(SDisp)
    .setMemRefs(MI.memoperands());
};
bool NeedsLoop = false;
uint64_t ImmLength = 0;
Register LenMinus1Reg = SystemZ::NoRegister;
Register LenAdjReg = SystemZ::NoRegister;
if (IsImmForm) {
ImmLength = LengthMO.getImm();
ImmLength++; // Add back the '1' subtracted originally.
ImmLength += IsMemset ? 2 : 1; // Add back the subtracted adjustment.
if (ImmLength == 0) {
MI.eraseFromParent();
return MBB;
@ -7905,7 +7954,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
NeedsLoop = true;
} else {
NeedsLoop = true;
LenMinus1Reg = LengthMO.getReg();
LenAdjReg = LengthMO.getReg();
}
// When generating more than one CLC, all but the last will need to
@ -7923,17 +7972,17 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
ImmLength &= 255;
} else {
BuildMI(*MBB, MI, DL, TII->get(SystemZ::SRLG), StartCountReg)
.addReg(LenMinus1Reg)
.addReg(LenAdjReg)
.addReg(0)
.addImm(8);
}
bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
auto loadZeroAddress = [&]() -> MachineOperand {
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, DL, TII->get(SystemZ::LGHI), Reg).addImm(0);
return MachineOperand::CreateReg(Reg, false);
};
bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
if (DestBase.isReg() && DestBase.getReg() == SystemZ::NoRegister)
DestBase = loadZeroAddress();
if (SrcBase.isReg() && SrcBase.getReg() == SystemZ::NoRegister)
@ -7968,14 +8017,41 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
DoneMBB = SystemZ::emitBlockAfter(NextMBB);
// MBB:
// # Jump to AllDoneMBB if LenMinus1Reg is -1, or fall thru to StartMBB.
// # Jump to AllDoneMBB if LenAdjReg means 0, or fall thru to StartMBB.
BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
.addReg(LenMinus1Reg).addImm(-1);
.addReg(LenAdjReg).addImm(IsMemset ? -2 : -1);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
.addMBB(AllDoneMBB);
MBB->addSuccessor(AllDoneMBB);
MBB->addSuccessor(StartMBB);
if (!IsMemset)
MBB->addSuccessor(StartMBB);
else {
// MemsetOneCheckMBB:
// # Jump to MemsetOneMBB for a memset of length 1, or
// # fall thru to StartMBB.
MachineBasicBlock *MemsetOneCheckMBB = SystemZ::emitBlockAfter(MBB);
MachineBasicBlock *MemsetOneMBB = SystemZ::emitBlockAfter(&*MF.rbegin());
MBB->addSuccessor(MemsetOneCheckMBB);
MBB = MemsetOneCheckMBB;
BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
.addReg(LenAdjReg).addImm(-1);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
.addMBB(MemsetOneMBB);
MBB->addSuccessor(MemsetOneMBB, {10, 100});
MBB->addSuccessor(StartMBB, {90, 100});
// MemsetOneMBB:
// # Jump back to AllDoneMBB after a single MVI or STC.
MBB = MemsetOneMBB;
insertMemMemOp(MBB, MBB->end(),
MachineOperand::CreateReg(StartDestReg, false), DestDisp,
MachineOperand::CreateReg(StartSrcReg, false), SrcDisp,
1);
BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(AllDoneMBB);
MBB->addSuccessor(AllDoneMBB);
}
// StartMBB:
// # Jump to DoneMBB if %StartCountReg is zero, or fall through to LoopMBB.
@ -8032,10 +8108,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
if (Opcode == SystemZ::MVC)
BuildMI(MBB, DL, TII->get(SystemZ::PFD))
.addImm(SystemZ::PFD_WRITE)
.addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
BuildMI(MBB, DL, TII->get(Opcode))
.addReg(ThisDestReg).addImm(DestDisp).addImm(256)
.addReg(ThisSrcReg).addImm(SrcDisp);
.addReg(ThisDestReg).addImm(DestDisp - IsMemset + 768).addReg(0);
insertMemMemOp(MBB, MBB->end(),
MachineOperand::CreateReg(ThisDestReg, false), DestDisp,
MachineOperand::CreateReg(ThisSrcReg, false), SrcDisp, 256);
if (EndMBB) {
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
@ -8075,7 +8151,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
// # Make PHIs for RemDestReg/RemSrcReg as the loop may or may not run.
// # Use EXecute Relative Long for the remainder of the bytes. The target
// instruction of the EXRL will have a length field of 1 since 0 is an
// illegal value. The number of bytes processed becomes (%LenMinus1Reg &
// illegal value. The number of bytes processed becomes (%LenAdjReg &
// 0xff) + 1.
// # Fall through to AllDoneMBB.
Register RemSrcReg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
@ -8088,10 +8164,14 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
BuildMI(MBB, DL, TII->get(SystemZ::PHI), RemSrcReg)
.addReg(StartSrcReg).addMBB(StartMBB)
.addReg(NextSrcReg).addMBB(NextMBB);
if (IsMemset)
insertMemMemOp(MBB, MBB->end(),
MachineOperand::CreateReg(RemDestReg, false), DestDisp,
MachineOperand::CreateReg(RemSrcReg, false), SrcDisp, 1);
MachineInstrBuilder EXRL_MIB =
BuildMI(MBB, DL, TII->get(SystemZ::EXRL_Pseudo))
.addImm(Opcode)
.addReg(LenMinus1Reg)
.addReg(LenAdjReg)
.addReg(RemDestReg).addImm(DestDisp)
.addReg(RemSrcReg).addImm(SrcDisp);
MBB->addSuccessor(AllDoneMBB);
@ -8107,32 +8187,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
while (ImmLength > 0) {
uint64_t ThisLength = std::min(ImmLength, uint64_t(256));
// The previous iteration might have created out-of-range displacements.
// Apply them using LAY if so.
if (!isUInt<12>(DestDisp)) {
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
.add(DestBase)
.addImm(DestDisp)
.addReg(0);
DestBase = MachineOperand::CreateReg(Reg, false);
DestDisp = 0;
}
if (!isUInt<12>(SrcDisp)) {
Register Reg = MRI.createVirtualRegister(&SystemZ::ADDR64BitRegClass);
BuildMI(*MBB, MI, MI.getDebugLoc(), TII->get(SystemZ::LAY), Reg)
.add(SrcBase)
.addImm(SrcDisp)
.addReg(0);
SrcBase = MachineOperand::CreateReg(Reg, false);
SrcDisp = 0;
}
BuildMI(*MBB, MI, DL, TII->get(Opcode))
.add(DestBase)
.addImm(DestDisp)
.addImm(ThisLength)
.add(SrcBase)
.addImm(SrcDisp)
.setMemRefs(MI.memoperands());
// Apply them using LA/LAY if so.
foldDisplIfNeeded(DestBase, DestDisp);
foldDisplIfNeeded(SrcBase, SrcDisp);
insertMemMemOp(MBB, MI, DestBase, DestDisp, SrcBase, SrcDisp, ThisLength);
DestDisp += ThisLength;
SrcDisp += ThisLength;
ImmLength -= ThisLength;
@ -8630,6 +8688,11 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
case SystemZ::CLCImm:
case SystemZ::CLCReg:
return emitMemMemWrapper(MI, MBB, SystemZ::CLC);
case SystemZ::MemsetImmImm:
case SystemZ::MemsetImmReg:
case SystemZ::MemsetRegImm:
case SystemZ::MemsetRegReg:
return emitMemMemWrapper(MI, MBB, SystemZ::MVC, true/*IsMemset*/);
case SystemZ::CLSTLoop:
return emitStringWrapper(MI, MBB, SystemZ::CLST);
case SystemZ::MVSTLoop:

View File

@ -126,6 +126,9 @@ enum NodeType : unsigned {
// as for MVC.
CLC,
// Use MVC to set a block of memory after storing the first byte.
MEMSET_MVC,
// Use an MVST-based sequence to implement stpcpy().
STPCPY,
@ -709,7 +712,8 @@ private:
MachineBasicBlock *emitAtomicCmpSwapW(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *emitMemMemWrapper(MachineInstr &MI, MachineBasicBlock *BB,
unsigned Opcode) const;
unsigned Opcode,
bool IsMemset = false) const;
MachineBasicBlock *emitStringWrapper(MachineInstr &MI, MachineBasicBlock *BB,
unsigned Opcode) const;
MachineBasicBlock *emitTransactionBegin(MachineInstr &MI,

View File

@ -5256,6 +5256,16 @@ class RotateSelectAliasRIEf<RegisterOperand cls1, RegisterOperand cls2>
let Constraints = "$R1 = $R1src";
}
// Pseudo instruction selected for the z_memset_mvc node: fill memory at $dest
// with the byte $B. The lenop and byteop parameters choose the operand kinds
// used for the length and for the byte. $length carries the adjusted length
// produced by the DAG lowering, not the raw byte count.
class MemsetPseudo<DAGOperand lenop, DAGOperand byteop>
: Pseudo<(outs), (ins bdaddr12only:$dest, lenop:$length, byteop:$B),
[(z_memset_mvc bdaddr12only:$dest, lenop:$length, byteop:$B)]> {
// The expansion is declared as clobbering CC.
let Defs = [CC];
let mayLoad = 1;
let mayStore = 1;
// Expanded into real instructions by the target's custom inserter.
let usesCustomInserter = 1;
let hasNoSchedulingInfo = 1;
}
//===----------------------------------------------------------------------===//
// Multiclasses that emit both real and pseudo instructions
//===----------------------------------------------------------------------===//

View File

@ -510,6 +510,12 @@ let mayLoad = 1, mayStore = 1, Defs = [CC] in {
def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>;
}
// Memset[Length][Byte] pseudos.
def MemsetImmImm : MemsetPseudo<imm64, imm32zx8trunc>;
def MemsetImmReg : MemsetPseudo<imm64, GR32>;
def MemsetRegImm : MemsetPseudo<ADDR64, imm32zx8trunc>;
def MemsetRegReg : MemsetPseudo<ADDR64, GR32>;
// Move right.
let Predicates = [FeatureMiscellaneousExtensions3],
mayLoad = 1, mayStore = 1, Uses = [R0L] in

View File

@ -102,6 +102,10 @@ def SDT_ZMemMemLengthCC : SDTypeProfile<1, 3,
SDTCisPtrTy<1>,
SDTCisPtrTy<2>,
SDTCisVT<3, i64>]>;
def SDT_ZMemsetMVC : SDTypeProfile<0, 3,
[SDTCisPtrTy<0>,
SDTCisVT<1, i64>,
SDTCisVT<2, i32>]>;
def SDT_ZString : SDTypeProfile<1, 3,
[SDTCisPtrTy<0>,
SDTCisPtrTy<1>,
@ -413,6 +417,8 @@ def z_xc : SDNode<"SystemZISD::XC", SDT_ZMemMemLength,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_clc : SDNode<"SystemZISD::CLC", SDT_ZMemMemLengthCC,
[SDNPHasChain, SDNPMayLoad]>;
def z_memset_mvc : SDNode<"SystemZISD::MEMSET_MVC", SDT_ZMemsetMVC,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad]>;
def z_strcmp : SDNode<"SystemZISD::STRCMP", SDT_ZStringCC,
[SDNPHasChain, SDNPMayLoad]>;
def z_stpcpy : SDNode<"SystemZISD::STPCPY", SDT_ZString,

View File

@ -17,29 +17,44 @@ using namespace llvm;
#define DEBUG_TYPE "systemz-selectiondag-info"
static SDVTList getMemMemVTs(unsigned Op, SelectionDAG &DAG) {
return Op == SystemZISD::CLC ? DAG.getVTList(MVT::i32, MVT::Other)
: DAG.getVTList(MVT::Other);
// Return the amount subtracted from the byte count before it is attached to
// the mem-mem node Op: two for MEMSET_MVC, one for every other operation.
static unsigned getMemMemLenAdj(unsigned Op) {
  if (Op == SystemZISD::MEMSET_MVC)
    return 2;
  return 1;
}
// Emit a mem-mem operation after subtracting one from size, which will be
// added back during pseudo expansion. As the Reg case emitted here may be
// converted by DAGCombiner into having an Imm length, they are both emitted
// the same way.
// Create the DAG node for the mem-mem operation Op. A MEMSET_MVC node takes
// the operands (Chain, Dst, LenAdj, Byte) and does not use Src; every other
// operation takes (Chain, Dst, Src, LenAdj) and does not use Byte. CLC yields
// an i32 value in addition to the chain; all other operations yield only the
// chain.
static SDValue createMemMemNode(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
                                SDValue Chain, SDValue Dst, SDValue Src,
                                SDValue LenAdj, SDValue Byte) {
  const bool IsCompare = Op == SystemZISD::CLC;
  SDVTList ResultVTs = IsCompare ? DAG.getVTList(MVT::i32, MVT::Other)
                                 : DAG.getVTList(MVT::Other);

  SmallVector<SDValue, 6> Operands;
  Operands.push_back(Chain);
  Operands.push_back(Dst);
  if (Op == SystemZISD::MEMSET_MVC) {
    Operands.push_back(LenAdj);
    Operands.push_back(Byte);
  } else {
    Operands.push_back(Src);
    Operands.push_back(LenAdj);
  }
  return DAG.getNode(Op, DL, ResultVTs, Operands);
}
// Emit a mem-mem operation after subtracting one (or two for memset) from
// size, which will be added back during pseudo expansion. As the Reg case
// emitted here may be converted by DAGCombiner into having an Imm length,
// they are both emitted the same way.
static SDValue emitMemMemImm(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
SDValue Chain, SDValue Dst, SDValue Src,
uint64_t Size) {
return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src,
DAG.getConstant(Size - 1, DL, Src.getValueType()));
uint64_t Size, SDValue Byte = SDValue()) {
unsigned Adj = getMemMemLenAdj(Op);
assert(Size >= Adj && "Adjusted length overflow.");
SDValue LenAdj = DAG.getConstant(Size - Adj, DL, Dst.getValueType());
return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte);
}
static SDValue emitMemMemReg(SelectionDAG &DAG, const SDLoc &DL, unsigned Op,
SDValue Chain, SDValue Dst, SDValue Src,
SDValue Size) {
SDValue LenMinus1 = DAG.getNode(ISD::ADD, DL, MVT::i64,
DAG.getZExtOrTrunc(Size, DL, MVT::i64),
DAG.getConstant(-1, DL, MVT::i64));
return DAG.getNode(Op, DL, getMemMemVTs(Op, DAG), Chain, Dst, Src, LenMinus1);
SDValue Size, SDValue Byte = SDValue()) {
int64_t Adj = getMemMemLenAdj(Op);
SDValue LenAdj = DAG.getNode(ISD::ADD, DL, MVT::i64,
DAG.getZExtOrTrunc(Size, DL, MVT::i64),
DAG.getConstant(0 - Adj, DL, MVT::i64));
return createMemMemNode(DAG, DL, Op, Chain, Dst, Src, LenAdj, Byte);
}
SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
@ -127,13 +142,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
if (CByte && CByte->getZExtValue() == 0)
return emitMemMemImm(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Bytes);
// Copy the byte to the first location and then use MVC to copy
// it to the rest.
Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment);
SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
DAG.getConstant(1, DL, PtrVT));
return emitMemMemImm(DAG, DL, SystemZISD::MVC, Chain, DstPlus1, Dst,
Bytes - 1);
return emitMemMemImm(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(),
Bytes, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32));
}
// Variable length
@ -141,7 +151,8 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
// Handle the special case of a variable length memset of 0 with XC.
return emitMemMemReg(DAG, DL, SystemZISD::XC, Chain, Dst, Dst, Size);
return SDValue();
return emitMemMemReg(DAG, DL, SystemZISD::MEMSET_MVC, Chain, Dst, SDValue(),
Size, DAG.getAnyExtOrTrunc(Byte, DL, MVT::i32));
}
// Convert the current CC value into an integer that is 0 if CC == 0,

View File

@ -87,7 +87,8 @@ define void @f8(i8* %dest, i8 %val) {
define void @f9(i8* %dest, i8 %val) {
; CHECK-LABEL: f9:
; CHECK: stc %r3, 0(%r2)
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: stc %r3, 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 257, i1 false)
ret void
@ -97,7 +98,8 @@ define void @f9(i8* %dest, i8 %val) {
define void @f10(i8* %dest, i8 %val) {
; CHECK-LABEL: f10:
; CHECK: stc %r3, 0(%r2)
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: stc %r3, 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 257, i1 false)
ret void
@ -107,7 +109,8 @@ define void @f10(i8* %dest, i8 %val) {
define void @f11(i8* %dest, i8 %val) {
; CHECK-LABEL: f11:
; CHECK: stc %r3, 0(%r2)
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: stc %r3, 256(%r2)
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 %val, i32 258, i1 false)
@ -118,7 +121,8 @@ define void @f11(i8* %dest, i8 %val) {
define void @f12(i8* %dest, i8 %val) {
; CHECK-LABEL: f12:
; CHECK: stc %r3, 0(%r2)
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: stc %r3, 256(%r2)
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 258, i1 false)
@ -129,30 +133,88 @@ define void @f12(i8* %dest, i8 %val) {
define void @f13(i8* %dest, i8 %val) {
; CHECK-LABEL: f13:
; CHECK: stc %r3, 0(%r2)
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 257(256,%r2), 256(%r2)
; CHECK: mvc 513(256,%r2), 512(%r2)
; CHECK: mvc 769(256,%r2), 768(%r2)
; CHECK: mvc 1025(256,%r2), 1024(%r2)
; CHECK: mvc 1281(256,%r2), 1280(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: stc %r3, 256(%r2)
; CHECK: mvc 257(255,%r2), 256(%r2)
; CHECK: stc %r3, 512(%r2)
; CHECK: mvc 513(255,%r2), 512(%r2)
; CHECK: stc %r3, 768(%r2)
; CHECK: mvc 769(255,%r2), 768(%r2)
; CHECK: stc %r3, 1024(%r2)
; CHECK: mvc 1025(255,%r2), 1024(%r2)
; CHECK: stc %r3, 1280(%r2)
; CHECK: mvc 1281(255,%r2), 1280(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false)
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1536, i1 false)
ret void
}
; Test the next size up, which uses a loop. We leave the other corner
; cases to memcpy-01.ll.
; cases to memcpy-01.ll and memset-07.ll.
define void @f14(i8* %dest, i8 %val) {
; CHECK-LABEL: f14:
; CHECK: stc %r3, 0(%r2)
; CHECK: lghi [[COUNT:%r[0-5]]], 6
; CHECK: [[LABEL:\.L[^:]*]]:
; CHECK: pfd 2, 769(%r2)
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: pfd 2, 768(%r2)
; CHECK: stc %r3, 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: la %r2, 256(%r2)
; CHECK: brctg [[COUNT]], [[LABEL]]
; CHECK: mvc 1(1,%r2), 0(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false)
; CHECK: stc %r3, 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1537, i1 false)
ret void
}
; Test (no) folding of displacement: Begins with max(uint12) - 1.
define void @f15(i8* %dest, i8 %val) {
; CHECK-LABEL: f15:
; CHECK-NOT: la {{.*}}%r2
%addr = getelementptr i8, i8* %dest, i64 4094
call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false)
ret void
}
; Test folding of displacement: Begins with max(uint12).
define void @f16(i8* %dest, i8 %val) {
; CHECK-LABEL: f16:
; CHECK-DAG: lay %r1, 4096(%r2)
; CHECK-DAG: stc %r3, 4095(%r2)
%addr = getelementptr i8, i8* %dest, i64 4095
call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 256, i1 false)
ret void
}
; Test folding of only the destination displacement (with LAY): First two ops are in range.
define void @f17(i8* %dest, i8 %val) {
; CHECK-LABEL: f17:
; CHECK: stc %r3, 3583(%r2)
; CHECK-NEXT: mvc 3584(255,%r2), 3583(%r2)
; CHECK-NEXT: stc %r3, 3839(%r2)
; CHECK-NEXT: mvc 3840(255,%r2), 3839(%r2)
; CHECK-NEXT: lay %r1, 4096(%r2)
; CHECK-NEXT: stc %r3, 4095(%r2)
; CHECK-NEXT: mvc 0(1,%r1), 4095(%r2)
; CHECK-NEXT: br %r14
%addr = getelementptr i8, i8* %dest, i64 3583
call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false)
ret void
}
; Test folding of both displacements with LAY: First two ops are in range.
define void @f18(i8* %dest, i8 %val) {
; CHECK-LABEL: f18:
; CHECK: stc %r3, 3584(%r2)
; CHECK-NEXT: mvc 3585(255,%r2), 3584(%r2)
; CHECK-NEXT: stc %r3, 3840(%r2)
; CHECK-NEXT: mvc 3841(255,%r2), 3840(%r2)
; CHECK-NEXT: lay %r1, 4097(%r2)
; CHECK-NEXT: lay %r2, 4096(%r2)
; CHECK-NEXT: stc %r3, 0(%r2)
; CHECK-NEXT: mvc 0(1,%r1), 0(%r2)
; CHECK-NEXT: br %r14
%addr = getelementptr i8, i8* %dest, i64 3584
call void @llvm.memset.p0i8.i64(i8* %addr, i8 %val, i64 514, i1 false)
ret void
}

View File

@ -123,7 +123,8 @@ define void @f12(i8* %dest) {
define void @f13(i8* %dest) {
; CHECK-LABEL: f13:
; CHECK: mvi 0(%r2), 128
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: mvi 256(%r2), 128
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 257, i1 false)
ret void
@ -133,7 +134,8 @@ define void @f13(i8* %dest) {
define void @f14(i8* %dest) {
; CHECK-LABEL: f14:
; CHECK: mvi 0(%r2), 128
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: mvi 256(%r2), 128
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 257, i1 false)
ret void
@ -143,7 +145,8 @@ define void @f14(i8* %dest) {
define void @f15(i8* %dest) {
; CHECK-LABEL: f15:
; CHECK: mvi 0(%r2), 128
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: mvi 256(%r2), 128
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 128, i32 258, i1 false)
@ -154,7 +157,8 @@ define void @f15(i8* %dest) {
define void @f16(i8* %dest) {
; CHECK-LABEL: f16:
; CHECK: mvi 0(%r2), 128
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: mvi 256(%r2), 128
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 128, i64 258, i1 false)

View File

@ -359,7 +359,8 @@ define void @f36(i8* %dest) {
define void @f37(i8* %dest) {
; CHECK-LABEL: f37:
; CHECK: mvi 0(%r2), 255
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: mvi 256(%r2), 255
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 257, i1 false)
ret void
@ -369,7 +370,8 @@ define void @f37(i8* %dest) {
define void @f38(i8* %dest) {
; CHECK-LABEL: f38:
; CHECK: mvi 0(%r2), 255
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: mvi 256(%r2), 255
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 257, i1 false)
ret void
@ -379,7 +381,8 @@ define void @f38(i8* %dest) {
define void @f39(i8* %dest) {
; CHECK-LABEL: f39:
; CHECK: mvi 0(%r2), 255
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: mvi 256(%r2), 255
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 -1, i32 258, i1 false)
@ -390,7 +393,8 @@ define void @f39(i8* %dest) {
define void @f40(i8* %dest) {
; CHECK-LABEL: f40:
; CHECK: mvi 0(%r2), 255
; CHECK: mvc 1(256,%r2), 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: mvi 256(%r2), 255
; CHECK: mvc 257(1,%r2), 256(%r2)
; CHECK: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 -1, i64 258, i1 false)

View File

@ -0,0 +1,100 @@
; Test memset in cases where a loop is used.
;
; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
declare void @llvm.memset.p0i8.i32(i8 *nocapture, i8, i32, i1) nounwind
declare void @llvm.memset.p0i8.i64(i8 *nocapture, i8, i64, i1) nounwind
; Constant length: 6 iterations and 2 bytes remainder.
define void @f1(i8* %dest, i8 %val) {
; CHECK-LABEL: f1:
; CHECK: lghi [[COUNT:%r[0-5]]], 6
; CHECK: [[LABEL:\.L[^:]*]]:
; CHECK: pfd 2, 768(%r2)
; CHECK: stc %r3, 0(%r2)
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: la %r2, 256(%r2)
; CHECK: brctg [[COUNT]], [[LABEL]]
; CHECK: stc %r3, 0(%r2)
; CHECK-NEXT: mvc 1(1,%r2), 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 1538, i1 false)
ret void
}
; Constant length: 6 iterations and 255 bytes remainder.
define void @f2(i8* %dest) {
; CHECK-LABEL: f2:
; CHECK: lghi [[COUNT:%r[0-5]]], 6
; CHECK: [[LABEL:\.L[^:]*]]:
; CHECK: pfd 2, 768(%r2)
; CHECK: mvi 0(%r2), 1
; CHECK: mvc 1(255,%r2), 0(%r2)
; CHECK: la %r2, 256(%r2)
; CHECK: brctg [[COUNT]], [[LABEL]]
; CHECK: mvi 0(%r2), 1
; CHECK-NEXT: mvc 1(254,%r2), 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 1791, i1 false)
ret void
}
; Variable length, byte in register.
define void @f3(i8* %dest, i8 %val, i64 %Len) {
; CHECK-LABEL: f3:
; CHECK: # %bb.0:
; CHECK-NEXT: aghi %r4, -2
; CHECK-NEXT: cgibe %r4, -2, 0(%r14)
; CHECK-NEXT: .LBB2_1:
; CHECK-NEXT: cgije %r4, -1, .LBB2_5
; CHECK-NEXT:# %bb.2:
; CHECK-NEXT: srlg %r0, %r4, 8
; CHECK-NEXT: cgije %r0, 0, .LBB2_4
; CHECK-NEXT:.LBB2_3: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: pfd 2, 768(%r2)
; CHECK-NEXT: stc %r3, 0(%r2)
; CHECK-NEXT: mvc 1(255,%r2), 0(%r2)
; CHECK-NEXT: la %r2, 256(%r2)
; CHECK-NEXT: brctg %r0, .LBB2_3
; CHECK-NEXT:.LBB2_4:
; CHECK-NEXT: stc %r3, 0(%r2)
; CHECK-NEXT: exrl %r4, .Ltmp0
; CHECK-NEXT: br %r14
; CHECK-NEXT:.LBB2_5:
; CHECK-NEXT: stc %r3, 0(%r2)
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i64(i8* %dest, i8 %val, i64 %Len, i1 false)
ret void
}
; Variable length, immediate byte.
define void @f4(i8* %dest, i32 %Len) {
; CHECK-LABEL: f4:
; CHECK: # %bb.0:
; CHECK-NEXT: llgfr %r1, %r3
; CHECK-NEXT: aghi %r1, -2
; CHECK-NEXT: cgibe %r1, -2, 0(%r14)
; CHECK-NEXT:.LBB3_1:
; CHECK-NEXT: cgije %r1, -1, .LBB3_5
; CHECK-NEXT:# %bb.2:
; CHECK-NEXT: srlg %r0, %r1, 8
; CHECK-NEXT: cgije %r0, 0, .LBB3_4
; CHECK-NEXT:.LBB3_3: # =>This Inner Loop Header: Depth=1
; CHECK-NEXT: pfd 2, 768(%r2)
; CHECK-NEXT: mvi 0(%r2), 1
; CHECK-NEXT: mvc 1(255,%r2), 0(%r2)
; CHECK-NEXT: la %r2, 256(%r2)
; CHECK-NEXT: brctg %r0, .LBB3_3
; CHECK-NEXT:.LBB3_4:
; CHECK-NEXT: mvi 0(%r2), 1
; CHECK-NEXT: exrl %r1, .Ltmp0
; CHECK-NEXT: br %r14
; CHECK-NEXT:.LBB3_5:
; CHECK-NEXT: mvi 0(%r2), 1
; CHECK-NEXT: br %r14
call void @llvm.memset.p0i8.i32(i8* %dest, i8 1, i32 %Len, i1 false)
ret void
}
; CHECK: .Ltmp0:
; CHECK-NEXT: mvc 1(1,%r2), 0(%r2)

View File

@ -12,7 +12,7 @@ entry:
; CHECK: jg memset
define void @tail_memset(i8* nocapture %p, i8 %c, i32 %n) #0 {
entry:
tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 false)
tail call void @llvm.memset.p0i8.i32(i8* %p, i8 %c, i32 %n, i1 true)
ret void
}