forked from OSchip/llvm-project
[SystemZ] Extend memcmp support to all constant lengths
This uses the infrastructure added for memcpy and memmove in r189331. llvm-svn: 189458
This commit is contained in:
parent
fab9336413
commit
be133a8757
|
@ -1954,6 +1954,18 @@ static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
|
|||
return NewMBB;
|
||||
}
|
||||
|
||||
// Split MBB after MI and return the new block (the one that contains
|
||||
// instructions after MI).
// MI itself stays in MBB. The new block is created with emitBlockAfter(MBB),
// receives every instruction from the one following MI up to MBB->end(),
// and then takes over MBB's successors.
|
||||
static MachineBasicBlock *splitBlockAfter(MachineInstr *MI,
|
||||
MachineBasicBlock *MBB) {
|
||||
MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
|
||||
// Move the range (MI, MBB->end()) to the start of NewMBB; the range begins
// at the instruction after MI, so MI is not moved.
NewMBB->splice(NewMBB->begin(), MBB,
|
||||
llvm::next(MachineBasicBlock::iterator(MI)),
|
||||
MBB->end());
|
||||
// Hand MBB's successor edges to NewMBB.
// NOTE(review): per the helper's name, PHIs in those successors are also
// updated to refer to NewMBB -- confirm against the MachineBasicBlock docs.
NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
|
||||
return NewMBB;
|
||||
}
|
||||
|
||||
// Split MBB before MI and return the new block (the one that contains MI).
|
||||
static MachineBasicBlock *splitBlockBefore(MachineInstr *MI,
|
||||
MachineBasicBlock *MBB) {
|
||||
|
@ -2490,6 +2502,11 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
|
|||
uint64_t SrcDisp = MI->getOperand(3).getImm();
|
||||
uint64_t Length = MI->getOperand(4).getImm();
|
||||
|
||||
// When generating more than one CLC, all but the last will need to
|
||||
// branch to the end when a difference is found.
|
||||
MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
|
||||
splitBlockAfter(MI, MBB) : 0);
|
||||
|
||||
// Check for the loop form, in which operand 5 is the trip count.
|
||||
if (MI->getNumExplicitOperands() > 5) {
|
||||
bool HaveSingleBase = DestBase.isIdenticalTo(SrcBase);
|
||||
|
@ -2514,6 +2531,7 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
|
|||
MachineBasicBlock *StartMBB = MBB;
|
||||
MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
|
||||
MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
|
||||
MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB);
|
||||
|
||||
// StartMBB:
|
||||
// # fall through to LoopMBB
|
||||
|
@ -2521,13 +2539,44 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
|
|||
|
||||
// LoopMBB:
|
||||
// %ThisDestReg = phi [ %StartDestReg, StartMBB ],
|
||||
// [ %NextDestReg, LoopMBB ]
|
||||
// [ %NextDestReg, NextMBB ]
|
||||
// %ThisSrcReg = phi [ %StartSrcReg, StartMBB ],
|
||||
// [ %NextSrcReg, LoopMBB ]
|
||||
// [ %NextSrcReg, NextMBB ]
|
||||
// %ThisCountReg = phi [ %StartCountReg, StartMBB ],
|
||||
// [ %NextCountReg, LoopMBB ]
|
||||
// PFD 2, 768+DestDisp(%ThisDestReg)
|
||||
// [ %NextCountReg, NextMBB ]
|
||||
// ( PFD 2, 768+DestDisp(%ThisDestReg) )
|
||||
// Opcode DestDisp(256,%ThisDestReg), SrcDisp(%ThisSrcReg)
|
||||
// ( JLH EndMBB )
|
||||
//
|
||||
// The prefetch is used only for MVC. The JLH is used only for CLC.
|
||||
MBB = LoopMBB;
|
||||
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
|
||||
.addReg(StartDestReg).addMBB(StartMBB)
|
||||
.addReg(NextDestReg).addMBB(NextMBB);
|
||||
if (!HaveSingleBase)
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
|
||||
.addReg(StartSrcReg).addMBB(StartMBB)
|
||||
.addReg(NextSrcReg).addMBB(NextMBB);
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
|
||||
.addReg(StartCountReg).addMBB(StartMBB)
|
||||
.addReg(NextCountReg).addMBB(NextMBB);
|
||||
if (Opcode == SystemZ::MVC)
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::PFD))
|
||||
.addImm(SystemZ::PFD_WRITE)
|
||||
.addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
|
||||
BuildMI(MBB, DL, TII->get(Opcode))
|
||||
.addReg(ThisDestReg).addImm(DestDisp).addImm(256)
|
||||
.addReg(ThisSrcReg).addImm(SrcDisp);
|
||||
if (EndMBB) {
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
|
||||
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
|
||||
.addMBB(EndMBB);
|
||||
MBB->addSuccessor(EndMBB);
|
||||
MBB->addSuccessor(NextMBB);
|
||||
}
|
||||
|
||||
// NextMBB:
|
||||
// %NextDestReg = LA 256(%ThisDestReg)
|
||||
// %NextSrcReg = LA 256(%ThisSrcReg)
|
||||
// %NextCountReg = AGHI %ThisCountReg, -1
|
||||
|
@ -2536,24 +2585,8 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
|
|||
// # fall through to DoneMBB
|
||||
//
|
||||
// The AGHI, CGHI and JLH should be converted to BRCTG by later passes.
|
||||
MBB = LoopMBB;
|
||||
MBB = NextMBB;
|
||||
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisDestReg)
|
||||
.addReg(StartDestReg).addMBB(StartMBB)
|
||||
.addReg(NextDestReg).addMBB(LoopMBB);
|
||||
if (!HaveSingleBase)
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisSrcReg)
|
||||
.addReg(StartSrcReg).addMBB(StartMBB)
|
||||
.addReg(NextSrcReg).addMBB(LoopMBB);
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::PHI), ThisCountReg)
|
||||
.addReg(StartCountReg).addMBB(StartMBB)
|
||||
.addReg(NextCountReg).addMBB(LoopMBB);
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::PFD))
|
||||
.addImm(SystemZ::PFD_WRITE)
|
||||
.addReg(ThisDestReg).addImm(DestDisp + 768).addReg(0);
|
||||
BuildMI(MBB, DL, TII->get(Opcode))
|
||||
.addReg(ThisDestReg).addImm(DestDisp).addImm(256)
|
||||
.addReg(ThisSrcReg).addImm(SrcDisp);
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::LA), NextDestReg)
|
||||
.addReg(ThisDestReg).addImm(256).addReg(0);
|
||||
if (!HaveSingleBase)
|
||||
|
@ -2599,6 +2632,22 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
|
|||
DestDisp += ThisLength;
|
||||
SrcDisp += ThisLength;
|
||||
Length -= ThisLength;
|
||||
// If there's another CLC to go, branch to the end if a difference
|
||||
// was found.
|
||||
if (EndMBB && Length > 0) {
|
||||
MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB);
|
||||
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
|
||||
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
|
||||
.addMBB(EndMBB);
|
||||
MBB->addSuccessor(EndMBB);
|
||||
MBB->addSuccessor(NextMBB);
|
||||
MBB = NextMBB;
|
||||
}
|
||||
}
|
||||
if (EndMBB) {
|
||||
MBB->addSuccessor(EndMBB);
|
||||
MBB = EndMBB;
|
||||
MBB->addLiveIn(SystemZ::CC);
|
||||
}
|
||||
|
||||
MI->eraseFromParent();
|
||||
|
|
|
@ -141,6 +141,28 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
|
|||
return SDValue();
|
||||
}
|
||||
|
||||
// Use CLC to compare [Src1, Src1 + Size) with [Src2, Src2 + Size),
|
||||
// deciding whether to use a loop or straight-line code.
//
// Chain is the incoming chain operand; Src1 and Src2 are the two addresses
// and Size is the constant byte count. The returned node has result types
// (MVT::Other, MVT::Glue). For Size > 768 it is a SystemZISD::CLC_LOOP node
// carrying an extra Size / 256 operand; otherwise it is a SystemZISD::CLC
// node.
|
||||
static SDValue emitCLC(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
|
||||
SDValue Src1, SDValue Src2, uint64_t Size) {
|
||||
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
||||
// The constant operands below are created with the type of Src1.
EVT PtrVT = Src1.getValueType();
|
||||
// A two-CLC sequence is a clear win over a loop, not least because it
|
||||
// needs only one branch. A three-CLC sequence needs the same number
|
||||
// of branches as a loop (i.e. 2), but is shorter. That brings us to
|
||||
// lengths greater than 768 bytes. It seems relatively likely that
|
||||
// a difference will be found within the first 768 bytes, so we just
|
||||
// optimize for the smallest number of branch instructions, in order
|
||||
// to avoid polluting the prediction buffer too much. A loop only ever
|
||||
// needs 2 branches, whereas a straight-line sequence would need 3 or more.
|
||||
if (Size > 3 * 256)
|
||||
// Loop form: the last operand is Size / 256.
// NOTE(review): presumably this is the trip count of 256-byte iterations
// consumed by the CLC_LOOP expansion -- confirm against emitMemMemWrapper.
return DAG.getNode(SystemZISD::CLC_LOOP, DL, VTs, Chain, Src1, Src2,
|
||||
DAG.getConstant(Size, PtrVT),
|
||||
DAG.getConstant(Size / 256, PtrVT));
|
||||
// Straight-line form: only the total size is passed.
return DAG.getNode(SystemZISD::CLC, DL, VTs, Chain, Src1, Src2,
|
||||
DAG.getConstant(Size, PtrVT));
|
||||
}
|
||||
|
||||
// Convert the current CC value into an integer that is 0 if CC == 0,
|
||||
// less than zero if CC == 1 and greater than zero if CC >= 2.
|
||||
// The sequence starts with IPM, which puts CC into bits 29 and 28
|
||||
|
@ -159,17 +181,12 @@ EmitTargetCodeForMemcmp(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
|
|||
SDValue Src1, SDValue Src2, SDValue Size,
|
||||
MachinePointerInfo Op1PtrInfo,
|
||||
MachinePointerInfo Op2PtrInfo) const {
|
||||
EVT PtrVT = Src1.getValueType();
|
||||
if (ConstantSDNode *CSize = dyn_cast<ConstantSDNode>(Size)) {
|
||||
uint64_t Bytes = CSize->getZExtValue();
|
||||
if (Bytes >= 1 && Bytes <= 0x100) {
|
||||
// A single CLC.
|
||||
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
|
||||
Chain = DAG.getNode(SystemZISD::CLC, DL, VTs, Chain,
|
||||
Src1, Src2, Size, DAG.getConstant(0, PtrVT));
|
||||
SDValue Glue = Chain.getValue(1);
|
||||
return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
|
||||
}
|
||||
assert(Bytes > 0 && "Caller should have handled 0-size case");
|
||||
Chain = emitCLC(DAG, DL, Chain, Src1, Src2, Bytes);
|
||||
SDValue Glue = Chain.getValue(1);
|
||||
return std::make_pair(addIPMSequence(DL, Glue, DAG), Chain);
|
||||
}
|
||||
return std::make_pair(SDValue(), SDValue());
|
||||
}
|
||||
|
|
|
@ -123,11 +123,99 @@ exit:
|
|||
ret i32 %res
|
||||
}
|
||||
|
||||
; 257 bytes is too big for a single CLC. For now expect a call instead.
|
||||
; 257 bytes needs two CLCs.
|
||||
define i32 @f8(i8 *%src1, i8 *%src2) {
|
||||
; CHECK-LABEL: f8:
|
||||
; CHECK: brasl %r14, memcmp@PLT
|
||||
; CHECK: clc 0(256,%r2), 0(%r3)
|
||||
; CHECK: jlh [[LABEL:\..*]]
|
||||
; CHECK: clc 256(1,%r2), 256(%r3)
|
||||
; CHECK: [[LABEL]]:
|
||||
; CHECK: ipm [[REG:%r[0-5]]]
|
||||
; CHECK: br %r14
|
||||
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257)
|
||||
ret i32 %res
|
||||
}
|
||||
|
||||
; Test a comparison of 257 bytes in which the CC result can be used directly.
|
||||
define void @f9(i8 *%src1, i8 *%src2, i32 *%dest) {
|
||||
; CHECK-LABEL: f9:
|
||||
; CHECK: clc 0(256,%r2), 0(%r3)
|
||||
; CHECK: jlh [[LABEL:\..*]]
|
||||
; CHECK: clc 256(1,%r2), 256(%r3)
|
||||
; CHECK: [[LABEL]]:
|
||||
; CHECK-NEXT: jl .L
|
||||
; CHECK: br %r14
|
||||
entry:
|
||||
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 257)
|
||||
%cmp = icmp slt i32 %res, 0
|
||||
br i1 %cmp, label %exit, label %store
|
||||
|
||||
store:
|
||||
store i32 0, i32 *%dest
|
||||
br label %exit
|
||||
|
||||
exit:
|
||||
ret void
|
||||
}
|
||||
|
||||
; Test the largest size that can use two CLCs.
|
||||
define i32 @f10(i8 *%src1, i8 *%src2) {
|
||||
; CHECK-LABEL: f10:
|
||||
; CHECK: clc 0(256,%r2), 0(%r3)
|
||||
; CHECK: jlh [[LABEL:\..*]]
|
||||
; CHECK: clc 256(256,%r2), 256(%r3)
|
||||
; CHECK: [[LABEL]]:
|
||||
; CHECK: ipm [[REG:%r[0-5]]]
|
||||
; CHECK: br %r14
|
||||
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 512)
|
||||
ret i32 %res
|
||||
}
|
||||
|
||||
; Test the smallest size that needs 3 CLCs.
|
||||
define i32 @f11(i8 *%src1, i8 *%src2) {
|
||||
; CHECK-LABEL: f11:
|
||||
; CHECK: clc 0(256,%r2), 0(%r3)
|
||||
; CHECK: jlh [[LABEL:\..*]]
|
||||
; CHECK: clc 256(256,%r2), 256(%r3)
|
||||
; CHECK: jlh [[LABEL]]
|
||||
; CHECK: clc 512(1,%r2), 512(%r3)
|
||||
; CHECK: [[LABEL]]:
|
||||
; CHECK: ipm [[REG:%r[0-5]]]
|
||||
; CHECK: br %r14
|
||||
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 513)
|
||||
ret i32 %res
|
||||
}
|
||||
|
||||
; Test the largest size that can use 3 CLCs.
|
||||
define i32 @f12(i8 *%src1, i8 *%src2) {
|
||||
; CHECK-LABEL: f12:
|
||||
; CHECK: clc 0(256,%r2), 0(%r3)
|
||||
; CHECK: jlh [[LABEL:\..*]]
|
||||
; CHECK: clc 256(256,%r2), 256(%r3)
|
||||
; CHECK: jlh [[LABEL]]
|
||||
; CHECK: clc 512(256,%r2), 512(%r3)
|
||||
; CHECK: [[LABEL]]:
|
||||
; CHECK: ipm [[REG:%r[0-5]]]
|
||||
; CHECK: br %r14
|
||||
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 768)
|
||||
ret i32 %res
|
||||
}
|
||||
|
||||
; The next size up uses a loop instead. We leave the more complicated
|
||||
; loop tests to memcpy-01.ll, which shares the same form.
|
||||
define i32 @f13(i8 *%src1, i8 *%src2) {
|
||||
; CHECK-LABEL: f13:
|
||||
; CHECK: lghi [[COUNT:%r[0-5]]], 3
|
||||
; CHECK: [[LOOP:.L[^:]*]]:
|
||||
; CHECK: clc 0(256,%r2), 0(%r3)
|
||||
; CHECK: jlh [[LABEL:\..*]]
|
||||
; CHECK-DAG: la %r2, 256(%r2)
|
||||
; CHECK-DAG: la %r3, 256(%r3)
|
||||
; CHECK: brctg [[COUNT]], [[LOOP]]
|
||||
; CHECK: clc 0(1,%r2), 0(%r3)
|
||||
; CHECK: [[LABEL]]:
|
||||
; CHECK: ipm [[REG:%r[0-5]]]
|
||||
; CHECK: br %r14
|
||||
%res = call i32 @memcmp(i8 *%src1, i8 *%src2, i64 769)
|
||||
ret i32 %res
|
||||
}
|
||||
|
|
|
@ -125,10 +125,14 @@ exit:
|
|||
ret i64 %res
|
||||
}
|
||||
|
||||
; 257 bytes is too big for a single CLC. For now expect a call instead.
|
||||
; 257 bytes needs two CLCs.
|
||||
define i64 @f8(i8 *%src1, i8 *%src2) {
|
||||
; CHECK-LABEL: f8:
|
||||
; CHECK: brasl %r14, memcmp@PLT
|
||||
; CHECK: clc 0(256,%r2), 0(%r3)
|
||||
; CHECK: jlh [[LABEL:\..*]]
|
||||
; CHECK: clc 256(1,%r2), 256(%r3)
|
||||
; CHECK: [[LABEL]]:
|
||||
; CHECK: ipm [[REG:%r[0-5]]]
|
||||
; CHECK: br %r14
|
||||
%res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 257)
|
||||
ret i64 %res
|
||||
|
|
Loading…
Reference in New Issue