2010-10-05 14:04:14 +08:00
|
|
|
//===- X86InstrControl.td - Control Flow Instructions ------*- tablegen -*-===//
|
2011-01-26 10:03:37 +08:00
|
|
|
//
|
2010-10-05 14:04:14 +08:00
|
|
|
// The LLVM Compiler Infrastructure
|
|
|
|
//
|
|
|
|
// This file is distributed under the University of Illinois Open Source
|
|
|
|
// License. See LICENSE.TXT for details.
|
2011-01-26 10:03:37 +08:00
|
|
|
//
|
2010-10-05 14:04:14 +08:00
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
//
|
|
|
|
// This file describes the X86 jump, return, call, and related instructions.
|
|
|
|
//
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Control Flow Instructions.
|
|
|
|
//
|
|
|
|
|
|
|
|
// Return instructions.
|
|
|
|
// Flags shared by every return form below: each record is a terminator, a
// return and a barrier, has hasCtrlDep set, and uses the SpecialFP FP form.
let FPForm = SpecialFP, hasCtrlDep = 1,
    isBarrier = 1, isReturn = 1, isTerminator = 1 in {
  // `ret` forms (opcodes 0xC3 / 0xC2). RET and RETI are the only records in
  // this group that carry an X86retflag selection pattern.
  def RET : I<0xC3, RawFrm, (outs), (ins variable_ops), "ret",
              [(X86retflag 0)], IIC_RET>;
  def RETI : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
                  "ret\t$amt", [(X86retflag timm:$amt)], IIC_RET_IMM>;
  def RETIW : Ii16<0xC2, RawFrm, (outs), (ins i16imm:$amt, variable_ops),
                   "retw\t$amt", [], IIC_RET_IMM>,
              OpSize;

  // `lret` forms (opcodes 0xCB / 0xCA); none of them has a selection pattern.
  def LRETL : I<0xCB, RawFrm, (outs), (ins), "lretl", [], IIC_RET>;
  def LRETQ : RI<0xCB, RawFrm, (outs), (ins), "lretq", [], IIC_RET>;
  def LRETI : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
                   "lret\t$amt", [], IIC_RET>;
  def LRETIW : Ii16<0xCA, RawFrm, (outs), (ins i16imm:$amt),
                    "lretw\t$amt", [], IIC_RET>,
               OpSize;
}
|
|
|
|
|
|
|
|
// Unconditional branches.
|
|
|
|
// Direct unconditional jumps: all are branches, terminators and barriers.
let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
  // 32-bit displacement form; the only record here with a `br` pattern.
  def JMP_4 : Ii32PCRel<0xE9, RawFrm, (outs), (ins brtarget:$dst),
                        "jmp\t$dst",
                        [(br bb:$dst)],
                        IIC_JMP_REL>;

  // 8-bit displacement form; no selection pattern.
  def JMP_1 : Ii8PCRel<0xEB, RawFrm, (outs), (ins brtarget8:$dst),
                       "jmp\t$dst",
                       [],
                       IIC_JMP_REL>;

  // FIXME: Intel syntax for JMP64pcrel32 such that it is not ambiguous with
  // JMP_1.
  def JMP64pcrel32 : I<0xE9, RawFrm, (outs), (ins brtarget:$dst),
                       "jmpq\t$dst",
                       [],
                       IIC_JMP_REL>;
}
|
|
|
|
|
|
|
|
// Conditional Branches.
|
|
|
|
// ICBr - Defines the two encodings of one conditional branch.
//   opc1 - opcode of the `_1` record (Ii8PCRel, brtarget8 operand).
//   opc4 - opcode of the `_4` record (Ii32PCRel, brtarget operand, TB).
//   asm  - assembly string shared by both records.
//   Cond - condition fragment used in the X86brcond pattern of `_4`.
// Both records are branches and terminators and read EFLAGS.
let Uses = [EFLAGS], isTerminator = 1, isBranch = 1 in {
  multiclass ICBr<bits<8> opc1, bits<8> opc4, string asm, PatFrag Cond> {
    // Short form: no selection pattern.
    def _1 : Ii8PCRel<opc1, RawFrm, (outs), (ins brtarget8:$dst),
                      asm, [], IIC_Jcc>;
    // Long form: carries the X86brcond selection pattern.
    def _4 : Ii32PCRel<opc4, RawFrm, (outs), (ins brtarget:$dst),
                       asm, [(X86brcond bb:$dst, Cond, EFLAGS)], IIC_Jcc>,
             TB;
  }
}
|
|
|
|
|
|
|
|
// One ICBr instantiation per condition code. The first opcode runs
// 0x70..0x7F and the second 0x80..0x8F, in the same order.
defm JO  : ICBr<0x70, 0x80, "jo\t$dst", X86_COND_O>;
defm JNO : ICBr<0x71, 0x81, "jno\t$dst", X86_COND_NO>;
defm JB  : ICBr<0x72, 0x82, "jb\t$dst", X86_COND_B>;
defm JAE : ICBr<0x73, 0x83, "jae\t$dst", X86_COND_AE>;
defm JE  : ICBr<0x74, 0x84, "je\t$dst", X86_COND_E>;
defm JNE : ICBr<0x75, 0x85, "jne\t$dst", X86_COND_NE>;
defm JBE : ICBr<0x76, 0x86, "jbe\t$dst", X86_COND_BE>;
defm JA  : ICBr<0x77, 0x87, "ja\t$dst", X86_COND_A>;
defm JS  : ICBr<0x78, 0x88, "js\t$dst", X86_COND_S>;
defm JNS : ICBr<0x79, 0x89, "jns\t$dst", X86_COND_NS>;
defm JP  : ICBr<0x7A, 0x8A, "jp\t$dst", X86_COND_P>;
defm JNP : ICBr<0x7B, 0x8B, "jnp\t$dst", X86_COND_NP>;
defm JL  : ICBr<0x7C, 0x8C, "jl\t$dst", X86_COND_L>;
defm JGE : ICBr<0x7D, 0x8D, "jge\t$dst", X86_COND_GE>;
defm JLE : ICBr<0x7E, 0x8E, "jle\t$dst", X86_COND_LE>;
defm JG  : ICBr<0x7F, 0x8F, "jg\t$dst", X86_COND_G>;
|
|
|
|
|
|
|
|
// jcx/jecx/jrcx instructions.
|
|
|
|
// All four records share opcode 0xE3 and are asm-parser-only branch
// terminators; they differ in the counter register they read, the AdSize
// prefix and the mode predicate.
let isBranch = 1, isTerminator = 1, isAsmParserOnly = 1 in {
  // 32-bit mode versions for the asmparser: with the address size prefix the
  // instruction is jcxz, without it the instruction is jecxz.
  let Uses = [CX] in {
    def JCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
                        "jcxz\t$dst", [], IIC_JCXZ>,
               AdSize, Requires<[In32BitMode]>;
  }
  let Uses = [ECX] in {
    def JECXZ_32 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
                            "jecxz\t$dst", [], IIC_JCXZ>,
                   Requires<[In32BitMode]>;
  }

  // 64-bit mode versions for the asmparser: with the address size prefix the
  // instruction is jecxz, without it the instruction is jrcxz.
  let Uses = [ECX] in {
    def JECXZ_64 : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
                            "jecxz\t$dst", [], IIC_JCXZ>,
                   AdSize, Requires<[In64BitMode]>;
  }
  let Uses = [RCX] in {
    def JRCXZ : Ii8PCRel<0xE3, RawFrm, (outs), (ins brtarget8:$dst),
                         "jrcxz\t$dst", [], IIC_JCXZ>,
                Requires<[In64BitMode]>;
  }
}
|
|
|
|
|
|
|
|
// Indirect branches
|
|
|
|
// Indirect and far jumps: every record is an indirect branch, a terminator
// and a barrier.
let isIndirectBranch = 1, isBarrier = 1,
    isTerminator = 1, isBranch = 1 in {
  // Register / memory indirect jumps with `brind` patterns, 32-bit mode only.
  def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst),
                 "jmp{l}\t{*}$dst",
                 [(brind GR32:$dst)], IIC_JMP_REG>,
               Requires<[In32BitMode]>;
  def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst),
                 "jmp{l}\t{*}$dst",
                 [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>,
               Requires<[In32BitMode]>;

  // The same two forms for 64-bit mode.
  def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst),
                 "jmp{q}\t{*}$dst",
                 [(brind GR64:$dst)], IIC_JMP_REG>,
               Requires<[In64BitMode]>;
  def JMP64m : I<0xFF, MRM4m, (outs), (ins i64mem:$dst),
                 "jmp{q}\t{*}$dst",
                 [(brind (loadi64 addr:$dst))], IIC_JMP_MEM>,
               Requires<[In64BitMode]>;

  // ljmp with immediate offset and segment operands; no selection patterns.
  def FARJMP16i : Iseg16<0xEA, RawFrmImm16, (outs),
                         (ins i16imm:$off, i16imm:$seg),
                         "ljmp{w}\t{$seg, $off|$off, $seg}", [],
                         IIC_JMP_FAR_PTR>,
                  OpSize;
  def FARJMP32i : Iseg32<0xEA, RawFrmImm16, (outs),
                         (ins i32imm:$off, i16imm:$seg),
                         "ljmp{l}\t{$seg, $off|$off, $seg}", [],
                         IIC_JMP_FAR_PTR>;
  def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst),
                    "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>;

  // ljmp through an opaque memory operand; no selection patterns.
  def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst),
                    "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>,
                  OpSize;
  def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
                    "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>;
}
|
|
|
|
|
|
|
|
|
|
|
|
// Loop instructions
|
|
|
|
|
2012-02-02 07:20:51 +08:00
|
|
|
// All three use the Ii8PCRel format with a brtarget8 operand and have no
// selection pattern; each has its own itinerary class.
def LOOP : Ii8PCRel<0xE2, RawFrm, (outs), (ins brtarget8:$dst),
                    "loop\t$dst", [], IIC_LOOP>;
def LOOPE : Ii8PCRel<0xE1, RawFrm, (outs), (ins brtarget8:$dst),
                     "loope\t$dst", [], IIC_LOOPE>;
def LOOPNE : Ii8PCRel<0xE0, RawFrm, (outs), (ins brtarget8:$dst),
                      "loopne\t$dst", [], IIC_LOOPNE>;
|
2010-10-05 14:04:14 +08:00
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Call Instructions...
|
|
|
|
//
|
|
|
|
let isCall = 1 in
  // All calls clobber the non-callee saved registers. ESP is marked as
  // a use to prevent stack-pointer assignments that appear immediately
  // before calls from potentially appearing dead. Uses for argument
  // registers are added manually.
  //
  // No Defs list is given here: call instructions get a single register mask
  // operand with a bit vector of call-preserved registers instead of a list
  // of call-clobbered registers.
  let Uses = [ESP] in {
    // Direct and indirect calls, available in 32-bit mode only. The register
    // and memory forms carry X86call selection patterns.
    def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
                           (outs), (ins i32imm_pcrel:$dst, variable_ops),
                           "call{l}\t$dst", [], IIC_CALL_RI>,
                      Requires<[In32BitMode]>;
    def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst, variable_ops),
                    "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>,
                  Requires<[In32BitMode]>;
    def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst, variable_ops),
                    "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))],
                    IIC_CALL_MEM>,
                  Requires<[In32BitMode]>;

    // lcall with immediate offset and segment operands; no patterns.
    def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs),
                            (ins i16imm:$off, i16imm:$seg),
                            "lcall{w}\t{$seg, $off|$off, $seg}", [],
                            IIC_CALL_FAR_PTR>, OpSize;
    def FARCALL32i : Iseg32<0x9A, RawFrmImm16, (outs),
                            (ins i32imm:$off, i16imm:$seg),
                            "lcall{l}\t{$seg, $off|$off, $seg}", [],
                            IIC_CALL_FAR_PTR>;

    // lcall through an opaque memory operand; no patterns.
    def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaque32mem:$dst),
                       "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize;
    def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
                       "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>;

    // callw for 16 bit code for the assembler.
    // Uses the same itinerary as CALLpcrel32 so that this record does not
    // fall back to the format class's default itinerary.
    // NOTE(review): assumes Ii16PCRel accepts a trailing itinerary argument
    // like the other format classes used in this file -- confirm against the
    // format definitions.
    let isAsmParserOnly = 1 in
      def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
                         (outs), (ins i16imm_pcrel:$dst, variable_ops),
                         "callw\t$dst", [], IIC_CALL_RI>, OpSize;
  }
|
|
|
|
|
|
|
|
|
|
|
|
// Tail call stuff.
|
|
|
|
|
|
|
|
// 32-bit tail calls. Every record is codegen-only and is flagged as a call,
// a return, a terminator and a barrier; ESP is an implicit use.
let Uses = [ESP], isCodeGenOnly = 1,
    isBarrier = 1, isReturn = 1, isTerminator = 1, isCall = 1 in {
  // TCRETURN* pseudo instructions: destination, an i32 $offset operand and
  // variable_ops. The memory form is additionally marked mayLoad.
  def TCRETURNdi : PseudoI<(outs),
                           (ins i32imm_pcrel:$dst, i32imm:$offset,
                                variable_ops),
                           []>;
  def TCRETURNri : PseudoI<(outs),
                           (ins GR32_TC:$dst, i32imm:$offset, variable_ops),
                           []>;
  let mayLoad = 1 in {
    def TCRETURNmi : PseudoI<(outs),
                             (ins i32mem_TC:$dst, i32imm:$offset,
                                  variable_ops),
                             []>;
  }

  // FIXME: These should be pseudo instructions that are lowered when going
  // to mcinst.
  def TAILJMPd : Ii32PCRel<0xE9, RawFrm, (outs),
                           (ins i32imm_pcrel:$dst, variable_ops),
                           "jmp\t$dst # TAILCALL", [], IIC_JMP_REL>;
  // FIXME: Remove encoding when JIT is dead.
  def TAILJMPr : I<0xFF, MRM4r, (outs), (ins GR32_TC:$dst, variable_ops),
                   "", [], IIC_JMP_REG>;
  let mayLoad = 1 in {
    def TAILJMPm : I<0xFF, MRM4m, (outs),
                     (ins i32mem_TC:$dst, variable_ops),
                     "jmp{l}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
  }
}
|
|
|
|
|
|
|
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
// Call Instructions (64-bit)...
|
|
|
|
//
|
|
|
|
// All calls clobber the non-callee saved registers. RSP is marked as a use
// to prevent stack-pointer assignments that appear immediately before calls
// from potentially appearing dead. Uses for argument registers are added
// manually.
let Uses = [RSP], isCall = 1 in {
  // NOTE: this pattern doesn't match "X86call imm", because we do not know
  // that the offset between an arbitrary immediate and the call will fit in
  // the 32-bit pcrel field that we have.
  def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, (outs),
                                (ins i64i32imm_pcrel:$dst, variable_ops),
                                "call{q}\t$dst", [], IIC_CALL_RI>,
                      Requires<[In64BitMode, NotWin64]>;

  // Register and memory indirect calls; both carry X86call patterns.
  def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
                  "call{q}\t{*}$dst",
                  [(X86call GR64:$dst)], IIC_CALL_RI>,
                Requires<[In64BitMode, NotWin64]>;
  def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
                  "call{q}\t{*}$dst",
                  [(X86call (loadi64 addr:$dst))], IIC_CALL_MEM>,
                Requires<[In64BitMode, NotWin64]>;

  // lcall through an opaque80mem operand; no pattern and no mode predicate.
  def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst),
                     "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>;
}
|
|
|
|
|
2011-01-26 10:03:37 +08:00
|
|
|
// FIXME: We need to teach codegen about single list of call-clobbered
|
2010-10-05 14:04:14 +08:00
|
|
|
// registers.
|
|
|
|
// Codegen-only call records guarded by the IsWin64 predicate.
//
// All calls clobber the non-callee saved registers. RSP is marked as a use
// to prevent stack-pointer assignments that appear immediately before calls
// from potentially appearing dead. Uses for argument registers are added
// manually.
let Uses = [RSP], isCodeGenOnly = 1, isCall = 1 in {
  def WINCALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, (outs),
                                   (ins i64i32imm_pcrel:$dst, variable_ops),
                                   "call{q}\t$dst", [], IIC_CALL_RI>,
                         Requires<[IsWin64]>;

  // Register and memory indirect forms; both carry X86call patterns.
  def WINCALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst, variable_ops),
                     "call{q}\t{*}$dst",
                     [(X86call GR64:$dst)], IIC_CALL_RI>,
                   Requires<[IsWin64]>;
  def WINCALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst, variable_ops),
                     "call{q}\t{*}$dst",
                     [(X86call (loadi64 addr:$dst))], IIC_CALL_MEM>,
                   Requires<[IsWin64]>;
}
|
|
|
|
|
2011-03-24 15:07:00 +08:00
|
|
|
// Codegen-only call record for the stack-probe helper under IsWin64.
//   __chkstk(MSVC): clobber R10, R11 and EFLAGS.
//   ___chkstk(Mingw64): clobber R10, R11, RAX and EFLAGS, and update RSP.
// The Defs list below covers the registers named for both variants.
let isCall = 1, isCodeGenOnly = 1,
    Uses = [RSP],
    Defs = [RAX, R10, R11, RSP, EFLAGS] in {
  def W64ALLOCA : Ii32PCRel<0xE8, RawFrm, (outs),
                            (ins i64i32imm_pcrel:$dst, variable_ops),
                            "call{q}\t$dst", [], IIC_CALL_RI>,
                  Requires<[IsWin64]>;
}
|
2010-10-05 14:04:14 +08:00
|
|
|
|
|
|
|
// 64-bit tail calls. Every record is codegen-only and is flagged as a call,
// a return, a terminator and a barrier.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
    isCodeGenOnly = 1 in
  // RSP is an implicit use, and every record in this group sets
  // usesCustomInserter.
  let Uses = [RSP],
      usesCustomInserter = 1 in {
  // TCRETURN*64 pseudo instructions: destination, an i32 $offset operand and
  // variable_ops. The memory form is additionally marked mayLoad.
  def TCRETURNdi64 : PseudoI<(outs),
                      (ins i64i32imm_pcrel:$dst, i32imm:$offset, variable_ops),
                     []>;
  def TCRETURNri64 : PseudoI<(outs),
                      (ins ptr_rc_tailcall:$dst, i32imm:$offset, variable_ops),
                     []>;
  let mayLoad = 1 in
  def TCRETURNmi64 : PseudoI<(outs),
                       (ins i64mem_TC:$dst, i32imm:$offset, variable_ops),
                       []>;

  def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs),
                             (ins i64i32imm_pcrel:$dst, variable_ops),
                             "jmp\t$dst # TAILCALL", [], IIC_JMP_REL>;
  // Register form (MRM4r, register-class operand): use the register-jump
  // itinerary, matching TAILJMPr and JMP64r, rather than the memory one.
  def TAILJMPr64 : I<0xFF, MRM4r, (outs),
                     (ins ptr_rc_tailcall:$dst, variable_ops),
                     "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_REG>;

  let mayLoad = 1 in
  def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst, variable_ops),
                     "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>;
}
|