[RISCV] Support fast calling convention

LLVM may annotate a function with fastcc if it has only one caller, has no
callers outside the module, is not naked, and does not take variable
arguments.

Functions using fastcc may pass their arguments in caller-saved registers.

Differential Revision: https://reviews.llvm.org/D68559

llvm-svn: 374857
This commit is contained in:
Shiva Chen 2019-10-15 02:04:29 +00:00
parent 232fd99d9e
commit 078bec6c48
3 changed files with 223 additions and 2 deletions

View File

@ -1796,6 +1796,63 @@ static SDValue unpackF64OnRV32DSoftABI(SelectionDAG &DAG, SDValue Chain,
return DAG.getNode(RISCVISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
}
// FastCC has less than 1% performance improvement for some particular
// benchmark. But theoretically, it may has benenfit for some cases.
static bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State) {
if (LocVT == MVT::i32 || LocVT == MVT::i64) {
// X5 and X6 might be used for save-restore libcall.
static const MCPhysReg GPRList[] = {
RISCV::X10, RISCV::X11, RISCV::X12, RISCV::X13, RISCV::X14,
RISCV::X15, RISCV::X16, RISCV::X17, RISCV::X7, RISCV::X28,
RISCV::X29, RISCV::X30, RISCV::X31};
if (unsigned Reg = State.AllocateReg(GPRList)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
}
if (LocVT == MVT::f32) {
static const MCPhysReg FPR32List[] = {
RISCV::F10_F, RISCV::F11_F, RISCV::F12_F, RISCV::F13_F, RISCV::F14_F,
RISCV::F15_F, RISCV::F16_F, RISCV::F17_F, RISCV::F0_F, RISCV::F1_F,
RISCV::F2_F, RISCV::F3_F, RISCV::F4_F, RISCV::F5_F, RISCV::F6_F,
RISCV::F7_F, RISCV::F28_F, RISCV::F29_F, RISCV::F30_F, RISCV::F31_F};
if (unsigned Reg = State.AllocateReg(FPR32List)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
}
if (LocVT == MVT::f64) {
static const MCPhysReg FPR64List[] = {
RISCV::F10_D, RISCV::F11_D, RISCV::F12_D, RISCV::F13_D, RISCV::F14_D,
RISCV::F15_D, RISCV::F16_D, RISCV::F17_D, RISCV::F0_D, RISCV::F1_D,
RISCV::F2_D, RISCV::F3_D, RISCV::F4_D, RISCV::F5_D, RISCV::F6_D,
RISCV::F7_D, RISCV::F28_D, RISCV::F29_D, RISCV::F30_D, RISCV::F31_D};
if (unsigned Reg = State.AllocateReg(FPR64List)) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
}
if (LocVT == MVT::i32 || LocVT == MVT::f32) {
unsigned Offset4 = State.AllocateStack(4, 4);
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset4, LocVT, LocInfo));
return false;
}
if (LocVT == MVT::i64 || LocVT == MVT::f64) {
unsigned Offset5 = State.AllocateStack(8, 8);
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset5, LocVT, LocInfo));
return false;
}
return true; // CC didn't match.
}
// Transform physical registers into virtual registers.
SDValue RISCVTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
@ -1835,7 +1892,11 @@ SDValue RISCVTargetLowering::LowerFormalArguments(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false);
if (CallConv == CallingConv::Fast)
CCInfo.AnalyzeFormalArguments(Ins, CC_RISCV_FastCC);
else
analyzeInputArgs(MF, CCInfo, Ins, /*IsRet=*/false);
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@ -2035,7 +2096,11 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
// Analyze the operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState ArgCCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
if (CallConv == CallingConv::Fast)
ArgCCInfo.AnalyzeCallOperands(Outs, CC_RISCV_FastCC);
else
analyzeOutputArgs(MF, ArgCCInfo, Outs, /*IsRet=*/false, &CLI);
// Check if it's really possible to do a tail call.
if (IsTailCall)

View File

@ -0,0 +1,71 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+f,+d -verify-machineinstrs < %s \
; RUN: | FileCheck %s
; Callee side of the fastcc float test: returns element 0 of the <32 x float>
; argument. The expected body only moves fa0 into a0 (fmv.x.w) before
; returning, i.e. element 0 is expected to arrive in fa0.
define fastcc float @callee(<32 x float> %A) nounwind {
; CHECK-LABEL: callee:
; CHECK: # %bb.0:
; CHECK-NEXT: fmv.x.w a0, fa0
; CHECK-NEXT: ret
%B = extractelement <32 x float> %A, i32 0
ret float %B
}
; Caller side of the fastcc float test. The caller itself is not fastcc; the
; expected code loads all 32 elements from memory addressed by a0.
; With fastcc, the first 20 elements are passed in fa0-fa7 and ft0-ft11.
; The remaining 12 elements are loaded into fs0-fs11 and then stored to the
; outgoing stack area at 0(sp) through 44(sp) before the call.
define float @caller(<32 x float> %A) nounwind {
; CHECK-LABEL: caller:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -64
; CHECK-NEXT: sw ra, 60(sp)
; CHECK-NEXT: flw fa0, 0(a0)
; CHECK-NEXT: flw fa1, 4(a0)
; CHECK-NEXT: flw fa2, 8(a0)
; CHECK-NEXT: flw fa3, 12(a0)
; CHECK-NEXT: flw fa4, 16(a0)
; CHECK-NEXT: flw fa5, 20(a0)
; CHECK-NEXT: flw fa6, 24(a0)
; CHECK-NEXT: flw fa7, 28(a0)
; CHECK-NEXT: flw ft0, 32(a0)
; CHECK-NEXT: flw ft1, 36(a0)
; CHECK-NEXT: flw ft2, 40(a0)
; CHECK-NEXT: flw ft3, 44(a0)
; CHECK-NEXT: flw ft4, 48(a0)
; CHECK-NEXT: flw ft5, 52(a0)
; CHECK-NEXT: flw ft6, 56(a0)
; CHECK-NEXT: flw ft7, 60(a0)
; CHECK-NEXT: flw ft8, 64(a0)
; CHECK-NEXT: flw ft9, 68(a0)
; CHECK-NEXT: flw ft10, 72(a0)
; CHECK-NEXT: flw ft11, 76(a0)
; CHECK-NEXT: flw fs0, 80(a0)
; CHECK-NEXT: flw fs1, 84(a0)
; CHECK-NEXT: flw fs2, 88(a0)
; CHECK-NEXT: flw fs3, 92(a0)
; CHECK-NEXT: flw fs4, 96(a0)
; CHECK-NEXT: flw fs5, 100(a0)
; CHECK-NEXT: flw fs6, 104(a0)
; CHECK-NEXT: flw fs7, 108(a0)
; CHECK-NEXT: flw fs8, 112(a0)
; CHECK-NEXT: flw fs9, 116(a0)
; CHECK-NEXT: flw fs10, 120(a0)
; CHECK-NEXT: flw fs11, 124(a0)
; CHECK-NEXT: fsw fs11, 44(sp)
; CHECK-NEXT: fsw fs10, 40(sp)
; CHECK-NEXT: fsw fs9, 36(sp)
; CHECK-NEXT: fsw fs8, 32(sp)
; CHECK-NEXT: fsw fs7, 28(sp)
; CHECK-NEXT: fsw fs6, 24(sp)
; CHECK-NEXT: fsw fs5, 20(sp)
; CHECK-NEXT: fsw fs4, 16(sp)
; CHECK-NEXT: fsw fs3, 12(sp)
; CHECK-NEXT: fsw fs2, 8(sp)
; CHECK-NEXT: fsw fs1, 4(sp)
; CHECK-NEXT: fsw fs0, 0(sp)
; CHECK-NEXT: call callee
; CHECK-NEXT: lw ra, 60(sp)
; CHECK-NEXT: addi sp, sp, 64
; CHECK-NEXT: ret
%C = call fastcc float @callee(<32 x float> %A)
ret float %C
}

View File

@ -0,0 +1,85 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefix=RV32 %s
; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
; RUN: | FileCheck -check-prefix=RV64 %s
; Callee side of the fastcc integer test: returns element 0 of the <16 x i32>
; argument. The expected body is a bare ret on both riscv32 and riscv64, so
; no instruction is needed to place the result before returning.
define fastcc i32 @callee(<16 x i32> %A) nounwind {
; RV32-LABEL: callee:
; RV32: # %bb.0:
; RV32-NEXT: ret
;
; RV64-LABEL: callee:
; RV64: # %bb.0:
; RV64-NEXT: ret
%B = extractelement <16 x i32> %A, i32 0
ret i32 %B
}
; Caller side of the fastcc integer test. The caller itself is not fastcc; the
; expected code loads all 16 elements from memory addressed by a0.
; With fastcc, the first 13 elements are passed in a0-a7 and t2-t6.
; The remaining 3 elements are stored to the outgoing stack area starting at
; 0(sp), in 4-byte slots on riscv32 and 8-byte slots on riscv64.
; Element 0 is first loaded into t0 and only moved to a0 after the last load,
; because a0 is the base address of every load.
define i32 @caller(<16 x i32> %A) nounwind {
; RV32-LABEL: caller:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -32
; RV32-NEXT: sw ra, 28(sp)
; RV32-NEXT: sw s0, 24(sp)
; RV32-NEXT: lw t0, 0(a0)
; RV32-NEXT: lw a1, 4(a0)
; RV32-NEXT: lw a2, 8(a0)
; RV32-NEXT: lw a3, 12(a0)
; RV32-NEXT: lw a4, 16(a0)
; RV32-NEXT: lw a5, 20(a0)
; RV32-NEXT: lw a6, 24(a0)
; RV32-NEXT: lw a7, 28(a0)
; RV32-NEXT: lw t2, 32(a0)
; RV32-NEXT: lw t3, 36(a0)
; RV32-NEXT: lw t4, 40(a0)
; RV32-NEXT: lw t5, 44(a0)
; RV32-NEXT: lw t6, 48(a0)
; RV32-NEXT: lw t1, 52(a0)
; RV32-NEXT: lw s0, 56(a0)
; RV32-NEXT: lw a0, 60(a0)
; RV32-NEXT: sw a0, 8(sp)
; RV32-NEXT: sw s0, 4(sp)
; RV32-NEXT: sw t1, 0(sp)
; RV32-NEXT: mv a0, t0
; RV32-NEXT: call callee
; RV32-NEXT: lw s0, 24(sp)
; RV32-NEXT: lw ra, 28(sp)
; RV32-NEXT: addi sp, sp, 32
; RV32-NEXT: ret
;
; RV64-LABEL: caller:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -48
; RV64-NEXT: sd ra, 40(sp)
; RV64-NEXT: sd s0, 32(sp)
; RV64-NEXT: ld t0, 0(a0)
; RV64-NEXT: ld a1, 8(a0)
; RV64-NEXT: ld a2, 16(a0)
; RV64-NEXT: ld a3, 24(a0)
; RV64-NEXT: ld a4, 32(a0)
; RV64-NEXT: ld a5, 40(a0)
; RV64-NEXT: ld a6, 48(a0)
; RV64-NEXT: ld a7, 56(a0)
; RV64-NEXT: ld t2, 64(a0)
; RV64-NEXT: ld t3, 72(a0)
; RV64-NEXT: ld t4, 80(a0)
; RV64-NEXT: ld t5, 88(a0)
; RV64-NEXT: ld t6, 96(a0)
; RV64-NEXT: ld t1, 104(a0)
; RV64-NEXT: ld s0, 112(a0)
; RV64-NEXT: ld a0, 120(a0)
; RV64-NEXT: sd a0, 16(sp)
; RV64-NEXT: sd s0, 8(sp)
; RV64-NEXT: sd t1, 0(sp)
; RV64-NEXT: mv a0, t0
; RV64-NEXT: call callee
; RV64-NEXT: ld s0, 32(sp)
; RV64-NEXT: ld ra, 40(sp)
; RV64-NEXT: addi sp, sp, 48
; RV64-NEXT: ret
%C = call fastcc i32 @callee(<16 x i32> %A)
ret i32 %C
}