forked from OSchip/llvm-project
The current X86 NOP padding uses one long NOP followed by the remainder in
one-byte NOPs. If the processor actually executes those NOPs, as it sometimes does with aligned bundling, this can have a performance impact. In my micro-benchmarks, run on a single machine, a 15-byte NOP followed by twelve one-byte NOPs is about 20% slower than a 15-byte NOP followed by a single 12-byte NOP. This patch changes NOP emission to emit as many 15-byte NOPs (the maximum length) as possible, followed by at most one shorter NOP. llvm-svn: 176464
This commit is contained in:
parent
85707b28e8
commit
4c8979cd4d
|
@ -315,18 +315,18 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Write an optimal sequence for the first 15 bytes.
|
// 15 is the longest single nop instruction. Emit as many 15-byte nops as
|
||||||
const uint64_t OptimalCount = (Count < 16) ? Count : 15;
|
// needed, then emit a nop of the remaining length.
|
||||||
const uint64_t Prefixes = OptimalCount <= 10 ? 0 : OptimalCount - 10;
|
do {
|
||||||
for (uint64_t i = 0, e = Prefixes; i != e; i++)
|
const uint8_t ThisNopLength = (uint8_t) std::min(Count, (uint64_t) 15);
|
||||||
OW->Write8(0x66);
|
const uint8_t Prefixes = ThisNopLength <= 10 ? 0 : ThisNopLength - 10;
|
||||||
const uint64_t Rest = OptimalCount - Prefixes;
|
for (uint8_t i = 0; i < Prefixes; i++)
|
||||||
for (uint64_t i = 0, e = Rest; i != e; i++)
|
OW->Write8(0x66);
|
||||||
OW->Write8(Nops[Rest - 1][i]);
|
const uint8_t Rest = ThisNopLength - Prefixes;
|
||||||
|
for (uint8_t i = 0; i < Rest; i++)
|
||||||
// Finish with single byte nops.
|
OW->Write8(Nops[Rest - 1][i]);
|
||||||
for (uint64_t i = OptimalCount, e = Count; i != e; ++i)
|
Count -= ThisNopLength;
|
||||||
OW->Write8(0x90);
|
} while (Count != 0);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
# RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - \
|
||||||
|
# RUN: | llvm-objdump -disassemble -no-show-raw-insn - | FileCheck %s
|
||||||
|
|
||||||
|
# Test that long nops are generated for padding where possible.
|
||||||
|
|
||||||
|
.text
|
||||||
|
foo:
|
||||||
|
.bundle_align_mode 5
|
||||||
|
|
||||||
|
# This callq instruction is 5 bytes long
|
||||||
|
.bundle_lock align_to_end
|
||||||
|
callq bar
|
||||||
|
.bundle_unlock
|
||||||
|
# To align this group to a bundle end, we need a 15-byte NOP and a 12-byte NOP.
|
||||||
|
# CHECK: 0: nop
|
||||||
|
# CHECK-NEXT: f: nop
|
||||||
|
# CHECK-NEXT: 1b: callq
|
||||||
|
|
||||||
|
# This push instruction is 1 byte long
|
||||||
|
.bundle_lock align_to_end
|
||||||
|
push %rax
|
||||||
|
.bundle_unlock
|
||||||
|
# To align this group to a bundle end, we need two 15-byte NOPs and a 1-byte NOP.
|
||||||
|
# CHECK: 20: nop
|
||||||
|
# CHECK-NEXT: 2f: nop
|
||||||
|
# CHECK-NEXT: 3e: nop
|
||||||
|
# CHECK-NEXT: 3f: pushq
|
Loading…
Reference in New Issue